In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt 
  


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input/"))

import seaborn as sns
import re
import matplotlib.pyplot as plt
import missingno as ms

import seaborn as sns
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from wordcloud import WordCloud, STOPWORDS

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords')

from sklearn import metrics

from sklearn.metrics import accuracy_score, classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression



import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the two input CSV files into pandas DataFrames.
training_data = pd.read_csv('../input/twitter-hate-speech/train_E6oV3lV.csv') # tweets used for training and evaluation below
testing_data = pd.read_csv('../input/twitter-hate-speech/test_tweets_anuFYb8.csv') # tweets that are scored in the Output section

In [None]:
len(testing_data)

In [None]:
len(training_data)

# Data Processing

In [None]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set:
# process_message tests every token for membership, and a set makes that a hash lookup.
nltk.download('stopwords')
eng_stops = set(stopwords.words("english"))

In [None]:
from nltk.stem import WordNetLemmatizer
# WordNetLemmatizer reads the WordNet corpus the first time lemmatize() is called.
# Download it explicitly so a fresh environment does not fail with LookupError.
nltk.download('wordnet')
# Some NLTK releases also need the Open Multilingual WordNet data for WordNet lookups;
# the call is harmless where it is not required.
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

In [None]:
def process_message(review_text):
    """Normalise a raw tweet into a cleaned, space-separated token string.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the text, split on whitespace, drop English stop words
    (``eng_stops``), lemmatize the remaining tokens (``lemmatizer``) and join
    them back together with single spaces.

    Parameters
    ----------
    review_text : str
        Raw tweet text. Non-string values (``None``, or the float ``NaN`` that
        pandas uses for missing cells) are treated as an empty tweet instead
        of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned text; ``""`` when no token survives the filtering.
    """
    if not isinstance(review_text, str):
        return ""
    # Keep letters only: digits, punctuation, '@', '#' and emoji all become spaces.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    tokens = letters_only.lower().split()
    # Stop-word removal happens before lemmatization, as in the original pipeline.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in eng_stops]
    return " ".join(tokens)

In [None]:
# Store a cleaned copy of every training tweet next to the raw text.
training_data['clean_tweet'] = training_data['tweet'].apply(process_message)

# EDA

In [None]:
training_data.head()

In [None]:
training_data.shape

In [None]:
testing_data.shape

In [None]:
training_data.info()

In [None]:
training_data.label.value_counts()

Let's see the distribution of data

In [None]:
# Data balance
def createPieChartFor(t_df):
    """Draw a pie chart showing how often each distinct value occurs in ``t_df``.

    Wedges are labelled with the value they represent and annotated with their
    share as a percentage with two decimals. The figure is shown immediately;
    nothing is returned.
    """
    value_counts = t_df.value_counts()
    percentages = 100 * value_counts / len(t_df)

    figure, axis = plt.subplots()
    axis.pie(
        percentages,
        labels=value_counts.index.values,
        autopct='%1.2f%%',
        shadow=True,
        startangle=90,
    )
    # An equal aspect ratio keeps the pie circular rather than elliptical.
    axis.axis('equal')
    plt.show()

In [None]:
createPieChartFor(training_data.label)

We can see that only about 7% of the available data is classified as hate comments.
Since the data is imbalanced, we should explore data balancing techniques. First, let's continue with the current data.

In [None]:
# 'length' is the number of characters in the raw tweet (len of the string), not a word count.
training_data['length'] = training_data['tweet'].apply(len)
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
fig1 = sns.barplot(x='label', y='length', data=training_data)
plt.title('Average tweet length (characters) vs label')


In [None]:
from wordcloud import WordCloud,STOPWORDS
def createWrdCloudForSentiment(sentiment):
    """Show a word cloud built from the cleaned training tweets of one class.

    Parameters
    ----------
    sentiment : str
        ``'Hate'`` selects the rows with ``label == 1``; any other value
        selects the rows with ``label == 0``.

    Reads the notebook-level ``training_data`` frame (``label`` and
    ``clean_tweet`` columns). Displays the figure and returns nothing.
    """
    sentiment_num = 1 if sentiment == 'Hate' else 0
    class_tweets = training_data.loc[training_data.label == sentiment_num, 'clean_tweet']

    # Join all tweets of the class, then drop link fragments, mentions and retweet markers.
    tokens = " ".join(class_tweets).split()
    cleaned_words = " ".join(
        token for token in tokens
        if 'http' not in token
        and not token.startswith('@')
        and token != 'RT'
    )

    wrdcld = WordCloud(stopwords=STOPWORDS,
                       background_color='black',
                       width=1500,
                       height=1000).generate(cleaned_words)
    plt.figure(figsize=(10, 10))
    plt.imshow(wrdcld)
    plt.axis('off')
    # The original referenced plt.show without calling it, so the figure was never explicitly shown.
    plt.show()

In [None]:
createWrdCloudForSentiment('Hate')

We can see that the common words in hate comments are: trump, libtard, hate, white, black, racist

In [None]:
createWrdCloudForSentiment('Positive')

We can see that the common words in positive comments are: love, life, makepeople, today, happy

In [None]:
#from spellchecker import SpellChecker

#spell = SpellChecker()

## Make test-train split

In [None]:
from sklearn.model_selection import train_test_split
# Stratify on the label: the hate class is a small minority, so an unstratified
# split can leave the train and hold-out sets with different class proportions.
train_df, test_df = train_test_split(
    training_data,
    test_size=0.2,
    random_state=42,
    stratify=training_data['label'],
)

TF-IDF

In [None]:
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the hold-out split.
vectorizer = TfidfVectorizer()
train_tfidf_model = vectorizer.fit_transform(train_df.clean_tweet)
test_tfidf_model = vectorizer.transform(test_df.clean_tweet)

In [None]:
train_tfidf_model

In [None]:
# pd.DataFrame(<scipy sparse matrix>) does not expand the matrix into feature columns.
# from_spmatrix builds a sparse-backed frame with one column per TF-IDF feature
# (columns are the integer feature indices) without densifying the data.
train_tfidf = pd.DataFrame.sparse.from_spmatrix(train_tfidf_model)
train_tfidf.head()

## Model Building

In [None]:
# Candidate classifiers compared on the same TF-IDF features.
# The tree-based models are seeded so that repeated runs give the same scores.
cls = [LogisticRegression(),
       MultinomialNB(),
       DecisionTreeClassifier(random_state=42),
       RandomForestClassifier(n_estimators=200, random_state=42),
       KNeighborsClassifier(n_neighbors = 5)]

cls_name = []

In [None]:
test_tfidf_model

In [None]:
test_df.label.count()

In [None]:
lbl_actual = test_df.label
# Reset both result lists here so that re-running this cell does not append
# duplicate entries and break the bar chart that plots cls_name against accuracy.
cls_name = []
accuracy = []
for cl in cls:
    model = cl.fit(train_tfidf_model, train_df.label)
    lbl_pred = model.predict(test_tfidf_model)
    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
    # classification_report shows precision and recall exchanged.
    a = round(100 * accuracy_score(lbl_actual, lbl_pred), 2)
    name = cl.__class__.__name__
    accuracy.append(a)
    cls_name.append(name)
    print ("{}  Accuracy Score : {}%".format(name, a))
    print ( classification_report(lbl_actual, lbl_pred))

In [None]:
plt.bar(cls_name, accuracy)
plt.xticks(rotation=70)

In [None]:
# Predict accuracy
def getModelAccuracy_LogicalReg(model_name, sampled_train_df, eval_df=None):
    """Train logistic regression on TF-IDF features and report hold-out accuracy.

    Parameters
    ----------
    model_name : str
        Label used in the printed result line.
    sampled_train_df : pandas.DataFrame
        Training rows; the ``clean_tweet`` and ``label`` columns are read.
    eval_df : pandas.DataFrame, optional
        Rows to evaluate on (``clean_tweet`` and ``label`` columns). Defaults
        to the notebook-level ``test_df`` hold-out split.

    Returns
    -------
    float
        Accuracy in percent, rounded to two decimals.
    """
    if eval_df is None:
        # Fall back to the notebook's hold-out split, as the original did implicitly.
        eval_df = test_df

    # TF-IDF features: fit on the training rows only, then transform the evaluation rows.
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(sampled_train_df.clean_tweet)
    eval_features = vectorizer.transform(eval_df.clean_tweet)

    classifier = LogisticRegression().fit(train_features, sampled_train_df.label)
    predicted = classifier.predict(eval_features)

    # scikit-learn metrics take (y_true, y_pred).
    a = round(100 * accuracy_score(eval_df.label, predicted), 2)
    print ("{}  Accuracy Score : {}%".format(model_name,a))
    return float(a)

In [None]:
# Predict accuracy using rfc
def getModelAccuracy_RFC(model_name, sampled_train_df, eval_df=None, random_state=42):
    """Train a 200-tree random forest on TF-IDF features and report hold-out accuracy.

    Parameters
    ----------
    model_name : str
        Label used in the printed result line.
    sampled_train_df : pandas.DataFrame
        Training rows; the ``clean_tweet`` and ``label`` columns are read.
    eval_df : pandas.DataFrame, optional
        Rows to evaluate on (``clean_tweet`` and ``label`` columns). Defaults
        to the notebook-level ``test_df`` hold-out split.
    random_state : int, optional
        Seed passed to the forest so that repeated runs give the same score.

    Returns
    -------
    float
        Accuracy in percent, rounded to two decimals.
    """
    if eval_df is None:
        # Fall back to the notebook's hold-out split, as the original did implicitly.
        eval_df = test_df

    # TF-IDF features: fit on the training rows only, then transform the evaluation rows.
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(sampled_train_df.clean_tweet)
    eval_features = vectorizer.transform(eval_df.clean_tweet)

    forest = RandomForestClassifier(n_estimators=200, random_state=random_state)
    forest.fit(train_features, sampled_train_df.label)
    predicted = forest.predict(eval_features)

    # scikit-learn metrics take (y_true, y_pred).
    a = round(100 * accuracy_score(eval_df.label, predicted), 2)
    print ("{}  Accuracy Score : {}%".format(model_name,a))
    return float(a)

In [None]:
log_accuracy = []
rfc_accuracy = []

In [None]:
a = getModelAccuracy_LogicalReg("Train dataset", train_df)
#print(a)
log_accuracy.append(a)

In [None]:
log_accuracy

In [None]:
a = getModelAccuracy_RFC("Train dataset", train_df)
rfc_accuracy.append(a)

In [None]:
rfc_accuracy

# Data Imbalance Handling

### Check the data imbalance

In [None]:
createPieChartFor(train_df.label)

In [None]:
print(train_df.label.value_counts())

### Undersampling

In [None]:
#As this dataset is highly imbalance we have to balance this by under sampling
count_hate = train_df[train_df['label'] == 1]['clean_tweet'].count()
df_non_hate_speech = train_df[train_df['label'] == 0]
df_hate_speech = train_df[train_df['label'] == 1]
df_hate_speech_undersample = df_non_hate_speech.sample(count_hate, replace=True)
train_df_undersampled = pd.concat([df_hate_speech, df_hate_speech_undersample], axis=0)

print('Random under-sampling:')
print(train_df_undersampled['label'].value_counts())

In [None]:
a = getModelAccuracy_LogicalReg("Under Sampling", train_df_undersampled)
log_accuracy.append(a)

### Oversampling

In [None]:
#As this dataset is highly imbalance we have to balance this by over sampling
# Random over-sampling: keep every non-hate tweet and draw hate tweets with
# replacement until both classes have the same number of rows.
count_non_hate = train_df[train_df['label'] == 0]['clean_tweet'].count()
df_hate_speech = train_df[train_df['label'] == 1]
df_non_hate_speech = train_df[train_df['label'] == 0]
# Seed the draw so the over-sampled set (and the scores derived from it) is reproducible.
df_hate_speech_oversample = df_hate_speech.sample(count_non_hate, replace=True, random_state=42)
train_df_oversampled = pd.concat([df_non_hate_speech, df_hate_speech_oversample], axis=0)

print('Random over-sampling:')
print(train_df_oversampled['label'].value_counts())

In [None]:
a = getModelAccuracy_LogicalReg("Over Sampling", train_df_oversampled)
log_accuracy.append(a)

In [None]:
a= getModelAccuracy_RFC("Under Sampling", train_df_undersampled)
rfc_accuracy.append(a)

In [None]:
a = getModelAccuracy_RFC("Over Sampling", train_df_oversampled)
rfc_accuracy.append(a)

In [None]:
rfc_accuracy

### Model selection

In [None]:

# Grouped bar chart: one group per training-set variant, one bar per model.
X = ['Train Dataset','Under Sampled','Over Sampled']
X_axis = np.arange(len(X))

bar_width = 0.4
half_width = bar_width / 2  # shifts the two bars of a group to either side of the tick

plt.bar(X_axis - half_width, log_accuracy, bar_width, label = 'Log')
plt.bar(X_axis + half_width, rfc_accuracy, bar_width, label = 'RFC')

plt.xticks(X_axis, X)
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.title("Accuracy vs model and sampling")
plt.legend()
plt.show()

**We can see that over-sampling and under-sampling have not improved accuracy much, so we can go with the original (unsampled) training data.**



# Output

In [None]:
training_data

In [None]:
train_df

In [None]:
testing_data

In [None]:
# Clean the test tweets with the same function that was applied to the training tweets.
testing_data['clean_tweet'] = testing_data['tweet'].apply(process_message)

In [None]:
# bag of words model
# TF-IDF features: refit the vectorizer on train_df, then transform the test tweets.
vectorizer = TfidfVectorizer()
sampled_train_tfidf_model = vectorizer.fit_transform(train_df.clean_tweet)
sampled_test_tfidf_model = vectorizer.transform(testing_data.clean_tweet)


# Fit logistic regression on train_df and predict a label for every row of testing_data.
# NOTE(review): despite the "sampled_" prefix, these variables are built from train_df,
# not from one of the resampled frames.
# NOTE(review): train_df is only the 80% split of training_data; consider refitting on
# all of training_data before scoring the test tweets - confirm intent.
sample_model = LogisticRegression().fit(sampled_train_tfidf_model,train_df.label)
lg_lbl_pred = sample_model.predict(sampled_test_tfidf_model)

In [None]:
lg_lbl_pred_df = pd.DataFrame({'id': testing_data.id,
                            'tweet' : testing_data.tweet,
                            'label' : lg_lbl_pred})
lg_lbl_pred_df.head()

In [None]:
lg_lbl_pred_df.label.value_counts()

In [None]:
lg_lbl_pred_df.to_csv('hate_speech_output.csv', index=False)

# Conclusion

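Logistic regression gives good accuracy on the current data set. Note that only about 7% of the tweets are labelled as hate comments, so accuracy alone can overstate performance; the per-class precision, recall and F1 in the classification report are a better guide.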
