In [82]:
# Importing libraries
# Importing numpy to perform operations on arrays
import numpy as np 
# Importing pandas to read CSV files and perform data processing
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# For data preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# NLP tools
# Importing re to perform regex operations
import re
import nltk
# Importing stopwords: common words that carry little meaning in a corpus and are removed
from nltk.corpus import stopwords
# Importing PorterStemmer to stem given words from the database to their original source words
from nltk.stem.porter import PorterStemmer
# Importing CountVectorizer to build a bag-of-words vocabulary; when max_features is set, only the top terms ordered by term frequency are kept
from sklearn.feature_extraction.text import CountVectorizer

# Train split and fit models
# Importing train_test_split to split the dataset into small sets as training and testing data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Model selection
# Importing confusion_matrix to summarise ground-truth values against the observations predicted by the given classifier, and accuracy_score to report the fraction predicted correctly
from sklearn.metrics import confusion_matrix, accuracy_score

In [83]:
# importing the dataset
# The CSV location can be overridden with the LABELED_DATA_CSV environment
# variable so the notebook can run on machines other than the author's; when
# the variable is unset, the original path is used unchanged.
import os
DATA_PATH = os.environ.get('LABELED_DATA_CSV', '/Users/poornankpurohit/Desktop/labeled_data.csv')
dataset = pd.read_csv(DATA_PATH)
# Preview the first rows to confirm the file loaded as expected
dataset.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [84]:
# Summarise the frame: index range, column names, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
dataset.describe()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,6372.5,3.0,0.0,2.0,0.0,1.0
50%,12703.0,3.0,0.0,3.0,0.0,1.0
75%,18995.5,3.0,0.0,3.0,0.0,1.0
max,25296.0,9.0,7.0,9.0,9.0,2.0


In [96]:
# data preprocessing
# encoding the dependent variable
# Keep only the label and the raw text; later cells read the tweets from this frame.
dt_trasformed = dataset[['class', 'tweet']]
# Select the label as a 2-D column (n_samples, 1) so it can be addressed as
# column 0 by the ColumnTransformer below.
y = dt_trasformed[['class']].values
print(y)
# One-hot encode the class label: one indicator column per distinct class value.
# sparse_threshold=0 makes ColumnTransformer always return a dense array. With
# the default threshold (0.3) the result is only dense because three classes
# give a density of 1/3; a fourth class would yield a sparse matrix, which
# np.array would wrap as a 0-d object array instead of converting.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
y = np.array(ct.fit_transform(y))

[[2]
 [1]
 [1]
 ...
 [1]
 [1]
 [2]]


In [87]:
# Show the one-hot encoded labels (one indicator column per class value)
print(y)

[[0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [88]:
# Pull two binary targets out of the one-hot label matrix:
# column 0 is used as the hate-speech target, column 1 as the offensive-language target
y_df = pd.DataFrame(y)
y_hate, y_offensive = (np.array(y_df[label]) for label in (0, 1))

In [89]:
# Display both binary target vectors, one per line
print(y_hate, y_offensive, sep='\n')

[0. 0. 0. ... 0. 0. 0.]
[0. 1. 1. ... 1. 1. 0.]


In [90]:
# text stemming and transforming in vector form
# The stemmer and the stop-word list do not change between tweets, so they are
# built once up front instead of once per tweet.
# porter stemmer to stem document for word processing
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep 'not' in the text: dropping it would remove negation from the tweet
all_stopwords.remove('not')
# Set for constant-time membership tests (the list is kept for any later use)
stopword_set = set(all_stopwords)

corpus = []
# Iterate over the tweets themselves rather than a hard-coded row count, so the
# cell keeps working if the dataset size or index changes.
for tweet in dt_trasformed['tweet']:
  # Replace everything that is not an ASCII letter with a space
  review = re.sub('[^a-zA-Z]', ' ', tweet)
  review = review.lower()
  review = review.split()
  # Drop stop words, then reduce each remaining word to its stem
  review = [ps.stem(word) for word in review if word not in stopword_set]
  review = ' '.join(review)
  corpus.append(review)

In [91]:
# Turn each cleaned tweet into a vector of word counts, with the vocabulary
# capped at 2000 features
cv = CountVectorizer(max_features=2000)
term_counts = cv.fit_transform(corpus)
# Convert the fitted count matrix to a regular array for the model
X = term_counts.toarray()

In [92]:
# Hold out 30% of the samples for testing and train on the remaining 70%;
# the fixed random_state keeps the split reproducible. The target here is the
# hate-speech indicator.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_hate,
    test_size=0.30,
    random_state=0,
)

In [93]:
# Fit a Naive Bayes classifier on the training split to predict hate speech.
# GaussianNB is the variant used here; it is designed for continuous features.
classifier_np = GaussianNB()
classifier_np.fit(X=X_train, y=y_train)

GaussianNB()

In [94]:
# Predict on the held-out split and tabulate the Naive Bayes results in a
# confusion matrix (rows: true labels, columns: predicted labels)
y_pred_np = classifier_np.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_np)
print(cm)

[[3289 3719]
 [ 168  259]]


In [95]:
# Fraction of test samples classified correctly, reported as a percentage
np_score = accuracy_score(y_true=y_test, y_pred=y_pred_np)
print('Naive Bayes Accuracy:', np_score*100)

Naive Bayes Accuracy: 47.720242098184265
