## Twitter Hate Speech Combat Project

#### Import Library

In [1]:
import pandas as pd
import warnings

import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
nltk.download('punkt')
nltk.download('brown')
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from nltk.tokenize import TweetTokenizer 

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
#import different metrics to evaluate the classifiers
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn import metrics

#matplotlib imports are used to plot confusion matrices for the classifiers
import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt 

warnings.filterwarnings('ignore')
%autosave 120
pd.set_option('display.max_colwidth', 900)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\win10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Autosaving every 120 seconds


#### 	Load the tweets file using read_csv function from Pandas package. 

In [3]:
tweets = pd.read_csv('C:\\rohit\\simple\\artificial engineer\\nlp\\assessment\\TwitterHate.csv')
#tweets = pd.read_csv('TwitterHate.csv')
tweets.head(10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦
9,10,0,@user @user welcome here ! i'm it's so #gr8 !


#### Tweets Cleanup

In [4]:
HANDLE = '@\w+'
LINK = 'https?://t\.co/\w+'
SPECIAL_CHARS = '&lt;|&lt;|&amp;|#;|±!!!'
def clean(text):
    text = re.sub(HANDLE, ' ', text)
    text = re.sub(LINK, ' ', text)
    text = re.sub(SPECIAL_CHARS, ' ', text)
    text = text.lower()
    return text

tweets['tweet_clean'] = tweets.tweet.apply(clean)
tweets.head(10)

Unnamed: 0,id,label,tweet,tweet_clean
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,#model i love u take with u all the time in urð ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦,camping tomorrow dannyâ¦
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦
9,10,0,@user @user welcome here ! i'm it's so #gr8 !,welcome here ! i'm it's so #gr8 !


#### Tweets Tokenize

In [5]:
# Create a reference variable for Class TweetTokenizer 
tk = TweetTokenizer() 
  
# Create a string input 

def tweettokenise(text):
      text = tk.tokenize(text) 
      return text

tweets['tweet_token'] = tweets.tweet_clean.apply(tweettokenise)
tweets.head(10)
  


Unnamed: 0,id,label,tweet,tweet_clean,tweet_token
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,"[when, a, father, is, dysfunctional, and, is, so, selfish, he, drags, his, kids, into, his, dysfunction, ., #run]"
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,"[thanks, for, #lyft, credit, i, can't, use, cause, they, don't, offer, wheelchair, vans, in, pdx, ., #disapointed, #getthanked]"
2,3,0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]"
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,#model i love u take with u all the time in urð ððððð¦ð¦ð¦,"[#model, i, love, u, take, with, u, all, the, time, in, urð, ð, , , , ð, , , , ð, , , , ð, , , ð, , , ¦, ð, , , ¦, ð, , , ¦]"
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation,"[factsguide, :, society, now, #motivation]"
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo,"[[, 2/2, ], huge, fan, fare, and, big, talking, before, they, leave, ., chaos, and, pay, disputes, when, they, get, there, ., #allshowandnogo]"
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦,camping tomorrow dannyâ¦,"[camping, tomorrow, dannyâ, , ¦]"
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl,"[the, next, school, year, is, the, year, for, exams, ., ð, , , ¯, can't, think, about, that, ð, , , ­, #school, #exams, #hate, #imagine, #actorslife, #revolutionschool, #girl]"
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦,"[we, won, !, !, !, love, the, land, !, !, !, #allin, #cavs, #champions, #cleveland, #clevelandcavaliers, â, , ¦]"
9,10,0,@user @user welcome here ! i'm it's so #gr8 !,welcome here ! i'm it's so #gr8 !,"[welcome, here, !, i'm, it's, so, #gr8, !]"


#### Remove Stopwords

In [6]:
stop_words = set(stopwords.words("english")) 
tweets['stop']=tweets['tweet_token'].apply(lambda x: [item for item in x if item not in stop_words])
tweets.head(10)


Unnamed: 0,id,label,tweet,tweet_clean,tweet_token,stop
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,"[when, a, father, is, dysfunctional, and, is, so, selfish, he, drags, his, kids, into, his, dysfunction, ., #run]","[father, dysfunctional, selfish, drags, kids, dysfunction, ., #run]"
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,"[thanks, for, #lyft, credit, i, can't, use, cause, they, don't, offer, wheelchair, vans, in, pdx, ., #disapointed, #getthanked]","[thanks, #lyft, credit, can't, use, cause, offer, wheelchair, vans, pdx, ., #disapointed, #getthanked]"
2,3,0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]"
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,#model i love u take with u all the time in urð ððððð¦ð¦ð¦,"[#model, i, love, u, take, with, u, all, the, time, in, urð, ð, , , , ð, , , , ð, , , , ð, , , ð, , , ¦, ð, , , ¦, ð, , , ¦]","[#model, love, u, take, u, time, urð, ð, , , , ð, , , , ð, , , , ð, , , ð, , , ¦, ð, , , ¦, ð, , , ¦]"
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation,"[factsguide, :, society, now, #motivation]","[factsguide, :, society, #motivation]"
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo,"[[, 2/2, ], huge, fan, fare, and, big, talking, before, they, leave, ., chaos, and, pay, disputes, when, they, get, there, ., #allshowandnogo]","[[, 2/2, ], huge, fan, fare, big, talking, leave, ., chaos, pay, disputes, get, ., #allshowandnogo]"
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦,camping tomorrow dannyâ¦,"[camping, tomorrow, dannyâ, , ¦]","[camping, tomorrow, dannyâ, , ¦]"
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl,"[the, next, school, year, is, the, year, for, exams, ., ð, , , ¯, can't, think, about, that, ð, , , ­, #school, #exams, #hate, #imagine, #actorslife, #revolutionschool, #girl]","[next, school, year, year, exams, ., ð, , , ¯, can't, think, ð, , , ­, #school, #exams, #hate, #imagine, #actorslife, #revolutionschool, #girl]"
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦,"[we, won, !, !, !, love, the, land, !, !, !, #allin, #cavs, #champions, #cleveland, #clevelandcavaliers, â, , ¦]","[!, !, !, love, land, !, !, !, #allin, #cavs, #champions, #cleveland, #clevelandcavaliers, â, , ¦]"
9,10,0,@user @user welcome here ! i'm it's so #gr8 !,welcome here ! i'm it's so #gr8 !,"[welcome, here, !, i'm, it's, so, #gr8, !]","[welcome, !, i'm, #gr8, !]"


#### Get all the tokenized terms into one large list.

In [7]:
corpus = [ ]
def tweettokenise(text):
      text = ' '.join(text)
      corpus.append(text)
      return text

tweets['tweet_final'] = tweets.stop.apply(tweettokenise)
tweets.head(10)
  
C = Counter(corpus).most_common(10)
#len(corpus)
#print(corpus)
print(C)


[('#model love u take u time urð ð \x9f \x98 \x99 ð \x9f \x98 \x8e ð \x9f \x91 \x84 ð \x9f \x91 ð \x9f \x92 ¦ ð \x9f \x92 ¦ ð \x9f \x92 ¦', 325), ('finally found way delete old tweets ! might find useful well : #deletetweets', 83), ('aww yeah good bing bong bing bong', 75), ('might libtard ... #libtard #sjw #liberal #politics', 72), ("i'm #grateful - #affirmations", 57), ('#lighttherapy help #depression ? #altwaystoheal #healthy #happy ! !', 31), ('ð \x9f \x94 \x9d #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme #followâ \x80 ¦', 31), ('lover stop angry visit us .. > > > #lover #friend #astrologer #love', 26), ('best #essentialoils #anxiety ! ! #healthy #peace ! ! #altwaystoheal ! !', 26), ('#sikh #temple vandalised #calgary , #wso condemns act', 26)]


#### Print 10 most common terms

In [8]:
C = pd.DataFrame(C,columns=['words', 'count'])

C.head(10)

Unnamed: 0,words,count
0,#model love u take u time urð ð    ð    ð    ð   ð   ¦ ð   ¦ ð   ¦,325
1,finally found way delete old tweets ! might find useful well : #deletetweets,83
2,aww yeah good bing bong bing bong,75
3,might libtard ... #libtard #sjw #liberal #politics,72
4,i'm #grateful - #affirmations,57
5,#lighttherapy help #depression ? #altwaystoheal #healthy #happy ! !,31
6,ð    #love #instagood #photooftheday top.tags #tbt #cute #me #beautiful #followme #followâ  ¦,31
7,lover stop angry visit us .. > > > #lover #friend #astrologer #love,26
8,best #essentialoils #anxiety ! ! #healthy #peace ! ! #altwaystoheal ! !,26
9,"#sikh #temple vandalised #calgary , #wso condemns act",26


#### Assign X and y.

In [9]:
X = tweets.tweet_final #drop(['label','id'], axis = 1)
X.head()

0                                  father dysfunctional selfish drags kids dysfunction . #run
1    thanks #lyft credit can't use cause offer wheelchair vans pdx . #disapointed #getthanked
2                                                                              bihday majesty
3         #model love u take u time urð ð    ð    ð    ð   ð   ¦ ð   ¦ ð   ¦
4                                                            factsguide : society #motivation
Name: tweet_final, dtype: object

In [10]:
Y=tweets.label
Y.head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: label, dtype: int64

#### Train Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.1)

In [12]:
X_train.shape

(28765,)

In [13]:
X_test.shape

(3197,)

In [14]:
y_train.shape

(28765,)

In [15]:
y_test.shape

(3197,)

#### Use TF-IDF values for the terms as a feature to get into a vector space model

In [16]:
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_model = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)
print(tfidf_model.todense())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
tfidf_df = pd.DataFrame(tfidf_model.todense())

tfidf_df.columns = sorted(vectorizer.vocabulary_)

tfidf_df.head(10)

Unnamed: 0,00,000,01,039,05,06,08,10,100,1000,...,½ï,½ð,¾â,¾ï,¾ð,à¹,ä¹,ï¼,ï½,ó¾
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
tfidf_df.shape

(28765, 5000)

#### Model building: Ordinary Logistic Regression

In [19]:
logreg = LogisticRegression()
#logreg.fit(tfidf_train,y_train)
logreg.fit(tfidf_model,y_train)
predicted_labels = logreg.predict(tfidf_test)

#### Model evaluation: Accuracy, recall, and f_1 score.

In [20]:
print("Accuracy: ", accuracy_score(y_test, predicted_labels))

Accuracy:  0.9471379418204566


In [21]:
from sklearn.metrics import recall_score
recall = recall_score(y_test, predicted_labels, average='binary')
print('Recall: %.3f' % recall)

Recall: 0.354


In [22]:
from sklearn.metrics import f1_score
score = f1_score(y_test, predicted_labels, average='binary')
print('F1 score: %.3f' % score)

F1 score: 0.504


#### Adjusting for Class Imbalances

In [23]:
logreg = LogisticRegression(class_weight="balanced")
logreg.fit(tfidf_model, y_train)
predicted_labels = logreg.predict(tfidf_test)

In [24]:
print("Accuracy: ", accuracy_score(y_test, predicted_labels))

Accuracy:  0.9264935877385049


In [25]:
recall = recall_score(y_test, predicted_labels, average='binary')
print('Recall: %.3f' % recall)

Recall: 0.819


In [26]:
score = f1_score(y_test, predicted_labels, average='binary')
print('F1 score: %.3f' % score)

F1 score: 0.629


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
import numpy as np
classifier_lr = LogisticRegression(class_weight="balanced")
param_grid = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
    
grid_search = GridSearchCV(classifier_lr, param_grid = param_grid,cv = 4) #, n_jobs = -1, verbose = 1) 
#, scoring = "recall" ),cv = StratifiedKFold(4)
grid_search.fit(tfidf_model, y_train)
grid_search.best_estimator_
y_test_pred = grid_search.best_estimator_.predict(tfidf_test)
print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           0       0.98      0.95      0.97      2954
           1       0.56      0.74      0.64       243

    accuracy                           0.94      3197
   macro avg       0.77      0.84      0.80      3197
weighted avg       0.95      0.94      0.94      3197

