# Analyzing Hate Speech on Twitter

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import style
style.use('fivethirtyeight')
from sklearn import linear_model, datasets, model_selection 
from sklearn.cross_validation import cross_val_score
import re



In [2]:
#Read CSV
text = pd.read_csv('twitter.csv', encoding = "ISO-8859-1")
#Default UTF-8 encoding was returning an error, ISO is fine to use because dataset is in English
text.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,_created_at,orig__golden,orig__last_judgment_at,orig__trusted_judgments,orig__unit_id,orig__unit_state,_updated_at,orig_does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech_gold,does_this_tweet_contain_hate_speech_gold_reason,does_this_tweet_contain_hate_speechconfidence,tweet_id,tweet_text
0,853718217,True,golden,86,,The tweet uses offensive language but not hate...,0.6013,,True,,0.0,615561535.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,1666196000.0,Warning: penny boards will make you a faggot
1,853718218,True,golden,92,,The tweet contains hate speech,0.7227,,True,,0.0,615561723.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,429512100.0,Fuck dykes
2,853718219,True,golden,86,,The tweet contains hate speech,0.5229,,True,,0.0,615562039.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,395623800.0,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3,853718220,True,golden,98,,The tweet contains hate speech,0.5184,,True,,0.0,615562068.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,497514700.0,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill..."
4,853718221,True,golden,88,,The tweet uses offensive language but not hate...,0.5185,,True,,0.0,615562488.0,golden,,The tweet contains hate speech,The tweet contains hate speech\nThe tweet uses...,,1.0,588923600.0,@Zhugstubble You heard me bitch but any way I'...


In [3]:
#Selecting Relevant Columns
columns_of_interest = ['does_this_tweet_contain_hate_speech','does_this_tweet_contain_hate_speech:confidence', 'tweet_text']
text = text[columns_of_interest]
text

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text
0,The tweet uses offensive language but not hate...,0.6013,Warning: penny boards will make you a faggot
1,The tweet contains hate speech,0.7227,Fuck dykes
2,The tweet contains hate speech,0.5229,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3,The tweet contains hate speech,0.5184,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill..."
4,The tweet uses offensive language but not hate...,0.5185,@Zhugstubble You heard me bitch but any way I'...
5,The tweet contains hate speech,0.8816,@elaynay your a dirty terrorist and your relig...
6,The tweet contains hate speech,0.5207,RT @ivanrabago_: @_WhitePonyJr_ looking like f...
7,The tweet contains hate speech,0.5619,Well I thought you knew actually RT @KingHorse...
8,The tweet uses offensive language but not hate...,0.6419,"@Stonisnipezz I know. It was a joke, faggot."
9,The tweet uses offensive language but not hate...,0.6407,I'm tired of people saying I look like my brot...


# Using Sklearn TFIDVectorizer to process text

In [4]:
#Cleaning
def clean(row):
    cleaned = ' '.join(re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([x][0-9]+)|([0-9]+)'," ",row).split())
    return cleaned
text['tweet_text'] = text['tweet_text'].apply(clean)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
#Initialize Vectorizer
vect = TfidfVectorizer()
#vect = CountVectorizer() (0.84785005512679157 Logreg score vs 0.85005512679162076)
vect = TfidfVectorizer(ngram_range=(1, 2)) 
#intuition being that bi-gram can distinguish hate speech from offensive language, however there is tradeoff of adding more noise in hopes that it will help signal
#If using bigram change min df to 2, have at least twice
#Can tune stop-words, ngrams, max_df, min_df

In [7]:
regex_match = '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|([x][0-9]+)|([0-9]+)'
#gets rid of Twitter handles, punctuation, urls, 'x89s etc., and all numbers

In [8]:
x = str(list(text["tweet_text"]))

In [9]:
corpus = (' '.join(re.sub(regex_match," ",x).split()))

In [10]:
#Fit Vectorizer
vect.fit(corpus.split())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
#Looking at feature names
vect.get_feature_names()

['aa',
 'aaa',
 'aaaw',
 'aah',
 'aahah',
 'aaliyah',
 'aap',
 'aapadoptsrapists',
 'aapl',
 'aaron',
 'aarp',
 'aay',
 'aays',
 'ab',
 'aba',
 'abandoned',
 'abandoning',
 'abaytownies',
 'abbott',
 'abby',
 'abc',
 'abdomens',
 'abdullah',
 'abeg',
 'abervet',
 'abhorrent',
 'abiding',
 'abilities',
 'ability',
 'abilityfto',
 'abimbola',
 'abject',
 'ablandoo',
 'ablaze',
 'able',
 'ableg',
 'ablht',
 'abm',
 'abndp',
 'abnormal',
 'abolished',
 'abomination',
 'aboot',
 'aborting',
 'abortion',
 'about',
 'above',
 'abpoli',
 'abracadabra',
 'abrahamic',
 'abroad',
 'abs',
 'absent',
 'absolute',
 'absolutely',
 'absolutionist',
 'absorbed',
 'absoultely',
 'abstinent',
 'abstract',
 'absurd',
 'abt',
 'abuela',
 'abuelas',
 'abuse',
 'abused',
 'abuser',
 'abusing',
 'abusive',
 'aby',
 'abyad',
 'ac',
 'acab',
 'academic',
 'academics',
 'acc',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'acceptance',
 'acceptancebfor',
 'accepted',
 'accepts',
 'accessibility',
 'accessibl

In [12]:
#encode
text['does_this_tweet_contain_hate_speech'] = text.does_this_tweet_contain_hate_speech.map({'The tweet uses offensive language but not hate speech':0, 'The tweet is not offensive':0, 'The tweet contains hate speech':1})

In [13]:
text

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text
0,0,0.6013,Warning penny boards will make you a faggot
1,1,0.7227,Fuck dykes
2,1,0.5229,chulo at least i dont look like jefree starr f...
3,1,0.5184,Is a fag jackie jealous Neeeee
4,0,0.5185,You heard me bitch but any way I m back th tex...
5,1,0.8816,your a dirty terrorist and your religion is a ...
6,1,0.5207,RT WhitePonyJr looking like faggots
7,1,0.5619,Well I thought you knew actually RT Man why y ...
8,0,0.6419,I know It was a joke faggot
9,0,0.6407,I m tired of people saying I look like my brot...


In [14]:
# define X and y
X = text.tweet_text
y = text.does_this_tweet_contain_hate_speech
print(X.shape)
print(y.shape)

(14509,)
(14509,)


In [15]:
# split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10881,)
(3628,)
(10881,)
(3628,)


In [16]:
# learn training data vocabulary, then used it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [17]:
# examine the document-term matrix
X_train_dtm

<10881x82394 sparse matrix of type '<class 'numpy.float64'>'
	with 257532 stored elements in Compressed Sparse Row format>

In [18]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<3628x82394 sparse matrix of type '<class 'numpy.float64'>'
	with 65214 stored elements in Compressed Sparse Row format>

# Using Logistic Regression

In [19]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=10)

In [20]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

CPU times: user 166 ms, sys: 7.43 ms, total: 173 ms
Wall time: 192 ms


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
y_train.shape

(10881,)

In [22]:
X_train_dtm.shape

(10881, 82394)

In [23]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [24]:
# calculate accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.85033076074972436

In [25]:
#Find best parameters
from sklearn.grid_search import GridSearchCV
C_test = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = dict(C=C_test)
grid =GridSearchCV(logreg, param_grid, cv =10, scoring = 'accuracy')
grid.fit(X_train_dtm, y_train)
grid.grid_scores_



[mean: 0.83715, std: 0.00030, params: {'C': 1e-05},
 mean: 0.83715, std: 0.00030, params: {'C': 0.0001},
 mean: 0.83715, std: 0.00030, params: {'C': 0.001},
 mean: 0.83715, std: 0.00030, params: {'C': 0.01},
 mean: 0.83733, std: 0.00079, params: {'C': 0.1},
 mean: 0.84533, std: 0.00520, params: {'C': 1},
 mean: 0.84459, std: 0.00700, params: {'C': 10},
 mean: 0.84156, std: 0.00634, params: {'C': 100},
 mean: 0.84055, std: 0.00659, params: {'C': 1000}]

In [26]:
null_score = 12110/14509
null_score

0.8346543524708802

# Using KNN

In [27]:
from sklearn import neighbors

In [28]:
knn = neighbors.KNeighborsClassifier()

In [29]:
%time knn.fit(X_train_dtm, y_train)

CPU times: user 2.31 ms, sys: 440 µs, total: 2.75 ms
Wall time: 2.56 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [30]:
y_pred_class = knn.predict(X_test_dtm)

In [31]:
metrics.accuracy_score(y_test, y_pred_class)

0.827728776185226

In [32]:
n_neighbors_test = [1, 5, 10, 15, 50, 60, 70, 80 ,90]
param_grid = dict(n_neighbors=n_neighbors_test)
grid =GridSearchCV(knn, param_grid, cv =10, scoring = 'accuracy')
grid.fit(X_train_dtm, y_train)
grid.grid_scores_

[mean: 0.78421, std: 0.01553, params: {'n_neighbors': 1},
 mean: 0.82198, std: 0.01114, params: {'n_neighbors': 5},
 mean: 0.83623, std: 0.00651, params: {'n_neighbors': 10},
 mean: 0.83485, std: 0.00607, params: {'n_neighbors': 15},
 mean: 0.84321, std: 0.00429, params: {'n_neighbors': 50},
 mean: 0.84183, std: 0.00471, params: {'n_neighbors': 60},
 mean: 0.84092, std: 0.00517, params: {'n_neighbors': 70},
 mean: 0.84064, std: 0.00410, params: {'n_neighbors': 80},
 mean: 0.83908, std: 0.00465, params: {'n_neighbors': 90}]

# Using Keras

In [41]:
from keras.models import Sequential
twitter_model = Sequential()

In [42]:
from keras.layers import Dense
twitter_model.add(Dense(32, input_shape=(82394,)))

In [43]:
from keras.layers import Activation
twitter_model.add(Activation('sigmoid'))

In [44]:
twitter_model.add(Dense(1))

In [45]:
twitter_model.compile(optimizer='SGD', loss='binary_crossentropy', metrics = ['accuracy'])

In [46]:
twitter_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 32)                2636640   
_________________________________________________________________
activation_2 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 2,636,673
Trainable params: 2,636,673
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Changing from sparse matrix to dense
A= X_train_dtm
A = A.todense()

In [None]:
twitter_model.fit(A, y_train, epochs=1)

Epoch 1/1

In [None]:
# Changing from sparse matrix to dense
B=X_test_dtm
B = B.todense()

In [None]:
#Evaluating
twitter_model.evaluate(B, y_test)

