In [1]:
import pandas as pd
dataset = pd.read_csv('hate_speech.csv')
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [2]:
dataset.shape

(5242, 3)

In [3]:
dataset.label.value_counts()

label
0    3000
1    2242
Name: count, dtype: int64

In [4]:
for index,tweet in enumerate(dataset["tweet"][10:15]):
  print(index+1,"-",tweet)

1 -  â #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #gold #forex
2 - we are so selfish. #orlando #standwithorlando #pulseshooting #orlandoshooting #biggerproblems #selfish #heabreaking   #values #love #
3 - i get to see my daddy today!!   #80days #gettingfed
4 - ouch...junior is angryð#got7 #junior #yugyoem   #omg 
5 - i am thankful for having a paner. #thankful #positive     


In [5]:
import re
def clean_text(text):

  text = re.sub('[^a-zA-Z]',' ',text)
  text = re.sub('[^\x00-\x7F]+', ' ',text)
  text = text.lower()
  return text

In [6]:
dataset['clean_text'] = dataset.tweet.apply(lambda x:clean_text(x))

In [7]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
len(stopwords.words('english'))

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


179

In [8]:
def gen_freq(text):
  word_list = []
  for tw_words in text.split():
    word_list.extend(tw_words)
  word_freq = pd.Series(word_list).value_counts()
  word_freq = word_freq.drop(stopwords.words('english'),errors='ignore')

  return word_freq

In [9]:
def any_neg(words):
  for word in words:
    if word in ['n','no','not','non'] or re.search(r"\wn't",word):
      return 1
  else:
    return 0

In [10]:
def any_rare(words,rare_100):
  for word in words:
    if word in rare_100:
      return 1
  else:
    return 0

In [11]:
def is_question(words):
  for word in words:
    if word in ['when','what','how','why','who']:
      return 1
  else:
    return 0

In [12]:
word_freq = gen_freq(dataset.clean_text.str)
rare_100 = word_freq[-100:]

dataset['word_count'] = dataset.clean_text.str.split().apply(lambda x:len(x))
dataset['any_neg'] = dataset.clean_text.str.split().apply(lambda x:any_neg(x))
dataset['any_rare'] = dataset.clean_text.str.split().apply(lambda x:any_rare(x,rare_100))
dataset['is_question'] = dataset.clean_text.str.split().apply(lambda x:is_question(x))
dataset['char_count'] = dataset.clean_text.apply(lambda x:len(x))

In [13]:
dataset.head(10)

Unnamed: 0,id,label,tweet,clean_text,word_count,any_neg,any_rare,is_question,char_count
0,1,0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is s...,18,0,0,1,102
1,2,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i can t us...,21,0,0,0,122
2,3,0,bihday your majesty,bihday your majesty,3,0,0,0,21
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ...,12,0,0,0,86
4,5,0,factsguide: society now #motivation,factsguide society now motivation,4,0,0,0,39
5,6,0,[2/2] huge fan fare and big talking before the...,huge fan fare and big talking before the...,18,0,0,1,116
6,7,0,@user camping tomorrow @user @user @user @use...,user camping tomorrow user user user use...,11,0,0,0,74
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams ...,21,0,0,0,143
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land allin cavs champ...,10,0,0,0,87
9,10,0,@user @user welcome here ! i'm it's so #gr...,user user welcome here i m it s so gr...,10,0,0,0,50


In [14]:
from sklearn.model_selection import train_test_split
X = dataset[['word_count','any_neg','any_rare','is_question','char_count']]
y = dataset.label
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [15]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model = model.fit(X_train,y_train)
pred = model.predict(X_test)

In [16]:
model.predict(X_test[5:10])

array([1, 0, 1, 1, 0], dtype=int64)

In [17]:
from sklearn.metrics import accuracy_score
print("Accuracy:",accuracy_score(y_test,pred)*100,"%")

Accuracy: 44.804575786463296 %


In [18]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train,y_train)
rf_pred = clf_rf.predict(X_test).astype(int)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
print(confusion_matrix(y_test,rf_pred))
print(classification_report(y_test,rf_pred))
print("Accuracy:",accuracy_score(y_test, rf_pred))

[[401 198]
 [223 227]]
              precision    recall  f1-score   support

           0       0.64      0.67      0.66       599
           1       0.53      0.50      0.52       450

    accuracy                           0.60      1049
   macro avg       0.59      0.59      0.59      1049
weighted avg       0.60      0.60      0.60      1049

Accuracy: 0.5986653956148713


In [21]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced')
logreg.fit(X_train, y_train)

In [23]:
y_pred = logreg.predict(X_test)

In [24]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.62      0.62       599
           1       0.49      0.49      0.49       450

    accuracy                           0.56      1049
   macro avg       0.55      0.55      0.55      1049
weighted avg       0.56      0.56      0.56      1049

