# Analyzing Hate Speech on Twitter

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')
from sklearn import linear_model, datasets, model_selection 
from sklearn.cross_validation import cross_val_score
import re

In [64]:
#Read CSV
text = pd.read_csv('twitter.csv', encoding = "ISO-8859-1")
#Default UTF-8 encoding was returning an error, ISO is fine to use because dataset is in English
text.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,_created_at,orig__golden,orig__last_judgment_at,orig__trusted_judgments,orig__unit_id,orig__unit_state,_updated_at,orig_does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech_gold,does_this_tweet_contain_hate_speech_gold_reason,does_this_tweet_contain_hate_speechconfidence,tweet_id,tweet_text
0,853718217,True,golden,86,,The tweet uses offensive language but not hate...,0.6013,,True,,0.0,615561535.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,1666196000.0,Warning: penny boards will make you a faggot
1,853718218,True,golden,92,,The tweet contains hate speech,0.7227,,True,,0.0,615561723.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,429512100.0,Fuck dykes
2,853718219,True,golden,86,,The tweet contains hate speech,0.5229,,True,,0.0,615562039.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,395623800.0,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3,853718220,True,golden,98,,The tweet contains hate speech,0.5184,,True,,0.0,615562068.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,497514700.0,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill..."
4,853718221,True,golden,88,,The tweet uses offensive language but not hate...,0.5185,,True,,0.0,615562488.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,588923600.0,@Zhugstubble You heard me bitch but any way I'...


In [65]:
#Selecting Relevant Columns
columns_of_interest = ['does_this_tweet_contain_hate_speech','does_this_tweet_contain_hate_speech:confidence', 'tweet_text']
text = text[columns_of_interest]
text.head()

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text
0,The tweet uses offensive language but not hate...,0.6013,Warning: penny boards will make you a faggot
1,The tweet contains hate speech,0.7227,Fuck dykes
2,The tweet contains hate speech,0.5229,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3,The tweet contains hate speech,0.5184,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill..."
4,The tweet uses offensive language but not hate...,0.5185,@Zhugstubble You heard me bitch but any way I'...


# Using Sklearn TFIDVectorizer to process text

In [66]:
#Cleaning
def clean(row):
    cleaned = ' '.join(re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([x][0-9]+)|([0-9]+)'," ",row).split())
    return cleaned
text['tweet_text'] = text['tweet_text'].apply(clean)

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [68]:
#Initialize Vectorizer
vect = TfidfVectorizer()
#vect = CountVectorizer() (0.84785005512679157 Logreg score vs 0.85005512679162076)
vect = TfidfVectorizer(ngram_range=(1, 2)) 
#intuition being that bi-gram can distinguish hate speech from offensive language, however there is tradeoff of adding more noise in hopes that it will help signal
#If using bigram change min df to 2, have at least twice
#Can tune stop-words, ngrams, max_df, min_df

In [69]:
regex_match = '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([x][0-9]+)|([0-9]+)'
#gets rid of Twitter handles, punctuation, urls, 'x89s etc., and all numbers

In [70]:
x = str(list(text["tweet_text"]))

In [71]:
#corpus of all words in training set
corpus = (' '.join(re.sub(regex_match," ",x).split()))

In [72]:
#Filter out stopwords
from nltk.corpus import stopwords
filtered_words = [word for word in corpus.split() if word not in stopwords.words('english')]

In [73]:
print(len(corpus))
print(len(filtered_words))

1022551
124865


In [74]:
filtered_words[:5]



In [75]:
#Fit Vectorizer
# vect.fit(corpus.split())
vect.fit(filtered_words)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [76]:
#Looking at feature names
vect.get_feature_names()[:20]

['aa',
 'aaa',
 'aaaw',
 'aah',
 'aahah',
 'aaliyah',
 'aap',
 'aapadoptsrapists',
 'aapl',
 'aaron',
 'aarp',
 'aay',
 'aays',
 'ab',
 'aba',
 'abandoned',
 'abandoning',
 'abaytownies',
 'abbott',
 'abby']

In [77]:
#encode
text['does_this_tweet_contain_hate_speech'] = text.does_this_tweet_contain_hate_speech.map({'The tweet uses offensive language but not hate speech':0, 'The tweet is not offensive':0, 'The tweet contains hate speech':1})

In [78]:
text.head()

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text
0,0,0.6013,Warning penny boards will make you a faggot
1,1,0.7227,Fuck dykes
2,1,0.5229,chulo at least i dont look like jefree starr f...
3,1,0.5184,Is a fag jackie jealous Neeeee
4,0,0.5185,You heard me bitch but any way I m back th tex...


In [79]:
# define X and y
X = text.tweet_text
y = text.does_this_tweet_contain_hate_speech
print(X.shape)
print(y.shape)

(14509,)
(14509,)


In [80]:
# split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10881,)
(3628,)
(10881,)
(3628,)


In [81]:
# learn training data vocabulary, then used it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [82]:
# examine the document-term matrix
X_train_dtm

<10881x82540 sparse matrix of type '<class 'numpy.float64'>'
	with 257698 stored elements in Compressed Sparse Row format>

In [83]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<3628x82540 sparse matrix of type '<class 'numpy.float64'>'
	with 65176 stored elements in Compressed Sparse Row format>

# Using Logistic Regression

In [84]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=10)

In [85]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

CPU times: user 185 ms, sys: 9.95 ms, total: 195 ms
Wall time: 202 ms


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [86]:
y_train.shape

(10881,)

In [87]:
X_train_dtm.shape

(10881, 82540)

In [88]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [89]:
# calculate accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.83517089305402425

In [90]:
#Find best parameters
from sklearn.grid_search import GridSearchCV
C_test = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
reg_test = ['l1', 'l2']
param_grid = dict(C=C_test, penalty= reg_test)
grid =GridSearchCV(logreg, param_grid, cv =10, scoring = 'accuracy')
grid.fit(X_train_dtm, y_train)
grid.grid_scores_

[mean: 0.83669, std: 0.00032, params: {'C': 1e-05, 'penalty': 'l1'},
 mean: 0.83669, std: 0.00032, params: {'C': 1e-05, 'penalty': 'l2'},
 mean: 0.83669, std: 0.00032, params: {'C': 0.0001, 'penalty': 'l1'},
 mean: 0.83669, std: 0.00032, params: {'C': 0.0001, 'penalty': 'l2'},
 mean: 0.83669, std: 0.00032, params: {'C': 0.001, 'penalty': 'l1'},
 mean: 0.83669, std: 0.00032, params: {'C': 0.001, 'penalty': 'l2'},
 mean: 0.83669, std: 0.00032, params: {'C': 0.01, 'penalty': 'l1'},
 mean: 0.83669, std: 0.00032, params: {'C': 0.01, 'penalty': 'l2'},
 mean: 0.84055, std: 0.00485, params: {'C': 0.1, 'penalty': 'l1'},
 mean: 0.83687, std: 0.00055, params: {'C': 0.1, 'penalty': 'l2'},
 mean: 0.84919, std: 0.00381, params: {'C': 1, 'penalty': 'l1'},
 mean: 0.84689, std: 0.00318, params: {'C': 1, 'penalty': 'l2'},
 mean: 0.84542, std: 0.00834, params: {'C': 10, 'penalty': 'l1'},
 mean: 0.85020, std: 0.00812, params: {'C': 10, 'penalty': 'l2'},
 mean: 0.83917, std: 0.00905, params: {'C': 100, 'pe

# Using KNN

In [91]:
from sklearn import neighbors

In [92]:
knn = neighbors.KNeighborsClassifier()

In [93]:
%time knn.fit(X_train_dtm, y_train)

CPU times: user 2.85 ms, sys: 1.45 ms, total: 4.3 ms
Wall time: 3.38 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [94]:
y_pred_class = knn.predict(X_test_dtm)

In [95]:
metrics.accuracy_score(y_test, y_pred_class)

0.81835722160970237

In [96]:
n_neighbors_test = [1, 5, 10, 15, 50, 60, 70, 80 ,90]
param_grid = dict(n_neighbors=n_neighbors_test)
grid =GridSearchCV(knn, param_grid, cv =10, scoring = 'accuracy')
grid.fit(X_train_dtm, y_train)
grid.grid_scores_

[mean: 0.78761, std: 0.01273, params: {'n_neighbors': 1},
 mean: 0.82667, std: 0.00761, params: {'n_neighbors': 5},
 mean: 0.83779, std: 0.00781, params: {'n_neighbors': 10},
 mean: 0.84055, std: 0.00431, params: {'n_neighbors': 15},
 mean: 0.84321, std: 0.00542, params: {'n_neighbors': 50},
 mean: 0.84459, std: 0.00474, params: {'n_neighbors': 60},
 mean: 0.84404, std: 0.00518, params: {'n_neighbors': 70},
 mean: 0.84312, std: 0.00502, params: {'n_neighbors': 80},
 mean: 0.84220, std: 0.00604, params: {'n_neighbors': 90}]

# Using Keras

In [108]:
from keras.models import Sequential
twitter_model = Sequential()

In [109]:
from keras.layers import Dense
twitter_model.add(Dense(16, input_shape=(X_train_dtm.shape[1],)))

In [110]:
from keras.layers import Activation
twitter_model.add(Activation('sigmoid'))
twitter_model.add(Dense(14))

In [111]:
twitter_model.add(Dense(1))

In [112]:
twitter_model.add(Activation('sigmoid'))

In [113]:
twitter_model.compile(optimizer='Adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [114]:
twitter_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 16)                1320656   
_________________________________________________________________
activation_18 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 14)                238       
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 15        
_________________________________________________________________
activation_19 (Activation)   (None, 1)                 0         
Total params: 1,320,909
Trainable params: 1,320,909
Non-trainable params: 0
_________________________________________________________________


In [115]:
# Changing from sparse matrix to dense
A= X_train_dtm
A = A.todense()

In [116]:
twitter_model.fit(A, y_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x10ded8320>

In [117]:
# Changing from sparse matrix to dense
B=X_test_dtm
B = B.todense()

In [118]:
#Evaluating
twitter_model.evaluate(B, y_test)



[0.35900310161621068, 0.84343991186284961]