In [120]:
# Importing libraries
# Importing numpy to perform operations on arrays
import numpy as np 
# Importing pandas to read csv files and perform data processing
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# For data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# NLP tools
# Importing re to perform regex operations
import re
import nltk
# Importing stopwords, i.e. words that hold no value in a corpus and are to be removed
from nltk.corpus import stopwords
# Importing PorterStemmer to stem given words from the database to their original source words
from nltk.stem.porter import PorterStemmer
# Importing CountVectorizer to build a vocabulary (if none is given) and keep the features ordered by term frequency
from sklearn.feature_extraction.text import CountVectorizer

# Train split and fit models
# Importing train_test_split to split the dataset into small sets as training and testing data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Model selection
# Importing confusion_matrix to provide a compared summary of ground-truth values and the observations predicted by the given classifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [121]:
# Load the raw dataset from CSV and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists. Consider a configurable data directory.
DATA_CSV = '/Users/poornankpurohit/Desktop/Data.csv'
dataset = pd.read_csv(DATA_CSV)
dataset.head()

Unnamed: 0.1,Unnamed: 0,id,text,label,type,model_wrong,db.model_preds,status,round,split,annotator
0,1,10000,It should be legal for a husband to rape his wife,hate,notgiven,True,0.97457,dynabench entry,1,train,gnzuctwed3
1,2,10001,dalits are lowlife scum,hate,notgiven,False,0.08233,dynabench entry,1,test,cagnluiznm
2,3,10002,dalits are lowlives,hate,notgiven,True,0.92319,dynabench entry,1,train,cagnluiznm
3,4,10003,It was a better world when women dare not ques...,hate,notgiven,True,0.99006,dynabench entry,1,test,gnzuctwed3
4,5,10004,I don't work this hard so that those immigrant...,hate,notgiven,True,0.98836,dynabench entry,1,train,cagnluiznm


In [122]:
# Show each column's dtype and non-null count to spot missing values
# (e.g. model_wrong and db.model_preds are only partially populated).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40623 entries, 0 to 40622
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      40623 non-null  int64  
 1   id              40623 non-null  object 
 2   text            40623 non-null  object 
 3   label           40623 non-null  object 
 4   type            40623 non-null  object 
 5   model_wrong     26097 non-null  object 
 6   db.model_preds  26097 non-null  float64
 7   status          40623 non-null  object 
 8   round           40623 non-null  object 
 9   split           40623 non-null  object 
 10  annotator       40623 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.4+ MB


In [123]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
dataset.describe()

Unnamed: 0.1,Unnamed: 0,db.model_preds
count,40623.0,26097.0
mean,20371.983236,0.585455
std,11743.590183,0.448801
min,1.0,1e-05
25%,10206.5,0.03656
50%,20374.0,0.91029
75%,30534.5,0.9955
max,40740.0,0.99992


In [124]:
# data preprocessing
# encoding the dependent variable
# Keep only the two columns used downstream: the class label and the raw text.
dt_trasformed = dataset[['label', 'text']]
# Select the label column by name (double brackets keep it 2-D, as the
# transformer expects) instead of relying on the column order.
y = dt_trasformed[['label']].values
# onehotencoder to encode a label that may take two or more distinct values
# sparse_threshold=0 forces a dense result: with the default threshold (0.3)
# a label with four or more classes would come back as a sparse matrix, which
# np.array() cannot turn into a regular 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# Result: one indicator column per distinct label value.
y = np.array(ct.fit_transform(y))

In [125]:
# Inspect the one-hot encoded label matrix (one indicator column per label value).
print(y)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [126]:
# Wrap the one-hot matrix in a DataFrame and pull each indicator column out
# as its own 1-D array.
# NOTE(review): column 0 is 1.0 for the rows labelled 'hate' in the preview;
# confirm which label value column 1 ("y_offensive") actually represents.
y_df = pd.DataFrame(y)
y_hate, y_offensive = (np.array(y_df[column]) for column in (0, 1))

In [127]:
# Inspect the two indicator vectors extracted from the one-hot label matrix.
print(y_hate)
print(y_offensive)

[1. 1. 1. ... 1. 1. 1.]
[0. 0. 0. ... 0. 0. 0.]


In [128]:
# text cleaning and transforming in vector form
# Build the stemmer and the stop-word collection once, outside the loop:
# they do not change between rows, and rebuilding them (and the set) for every
# row/word was pure overhead.
# porter stemmer to stem document for word processing
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep 'not' in the text: it is removed from the stop-word list so that it
# survives the filtering below.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)  # set gives fast membership tests

corpus = []
# Iterate over the text column itself rather than a hard-coded row count, so
# the cell keeps working if the dataset size changes.
for text in dt_trasformed['text']:
  # Replace every non-letter character with a space, lowercase, and tokenise
  # on whitespace.
  review = re.sub('[^a-zA-Z]', ' ', text)
  review = review.lower()
  review = review.split()
  # Drop stop words and stem the remaining tokens.
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

In [129]:
# Turn each cleaned document into a vector of word counts, keeping only the
# 2000 most frequent terms across the corpus.
cv = CountVectorizer(max_features=2000)
term_counts = cv.fit_transform(corpus)
# Convert to a dense array for the downstream classifier.
X = term_counts.toarray()

In [130]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_hate,
    test_size=0.30,
    random_state=0,
)

In [131]:
# implementing Naive Bayes model of machine learning to predict hate speech
# using GaussianNB which can handle continuous data
# NOTE(review): the features here are word counts produced by CountVectorizer;
# confirm that the Gaussian variant is the intended choice for this input.
classifier_np = GaussianNB()
# Fit on the training split; the fitted estimator is the cell's displayed output.
classifier_np.fit(X_train, y_train)

GaussianNB()

In [132]:
# Confusion matrix for the Naive Bayes model: predict on the held-out split
# and cross-tabulate the true labels against the predicted ones.
y_pred_np = classifier_np.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_np)
print(cm)

[[3048 2618]
 [2043 4478]]


In [133]:
# Fraction of test rows classified correctly, reported as a percentage.
np_score = accuracy_score(y_true=y_test, y_pred=y_pred_np)
print('Naive Bayes Accuracy:', np_score * 100)

Naive Bayes Accuracy: 61.75432838270288
