# Hate Speech Detection

In [6]:
!pip install nltk

Collecting nltk


[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp310-cp310-win_amd64.whl.metadata (41 kB)
     -------------------------------------- 42.0/42.0 kB 225.1 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 57.6/57.6 kB 1.0 MB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 1.5/1.5 MB 3.1 MB/s eta 0:00:00
Downloading regex-2023.12.25-cp310-cp310-win_amd64.whl (269 kB)
   ---------------------------------------- 269.5/269.5 kB 2.4 MB/s eta 0:00:00
Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
   ---------------------------------------- 78.3/78.3 kB 872.5 kB/s eta 0:00:00
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25 tqdm-4.66.2


## Importing essential libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Natural Language toolkits

In [15]:
import re
import nltk
nltk.download('stopwords')
from nltk.util import pr
stemmer = nltk.SnowballStemmer('english')
from nltk.corpus import stopwords
import string
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [104]:
df = pd.read_csv('twitter_data.csv')
df = df.drop('Unnamed: 0', axis = 1)

In [105]:
df.head(10)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


In [106]:
df['class'].unique()

array([2, 1, 0], dtype=int64)

In [107]:
df['tweet']

0        !!! RT @mayasolovely: As a woman you shouldn't...
1        !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2        !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3        !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4        !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
                               ...                        
24778    you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779    you've gone and broke the wrong heart baby, an...
24780    young buck wanna eat!!.. dat nigguh like I ain...
24781                youu got wild bitches tellin you lies
24782    ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...
Name: tweet, Length: 24783, dtype: object

In [108]:
#Assigning labels to each of the class
df['labels'] = df['class'].map({0:'Hate Speech Detected', 1:'Offensive language detected', 2:'No hate and offensive speech'})

In [109]:
#Extracting the table
df = df[['tweet','labels']]

In [110]:
df

Unnamed: 0,tweet,labels
0,!!! RT @mayasolovely: As a woman you shouldn't...,No hate and offensive speech
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive language detected
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive language detected
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive language detected
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive language detected
...,...,...
24778,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,Offensive language detected
24779,"you've gone and broke the wrong heart baby, an...",No hate and offensive speech
24780,young buck wanna eat!!.. dat nigguh like I ain...,Offensive language detected
24781,youu got wild bitches tellin you lies,Offensive language detected


## Cleaning the Data

In [111]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation),'', text)
    text = re.sub('\n', '',text)
    text = re.sub('\w*\d\w*', '',text)
    text = [word for word in text.split(' ') if word not in stopword]
    text = ' '.join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = ' '.join(text)
    return text

df['tweet'] = df['tweet'].apply(clean)

In [112]:
df

Unnamed: 0,tweet,labels
0,rt mayasolov woman shouldnt complain clean ho...,No hate and offensive speech
1,rt boy dat coldtyga dwn bad cuffin dat hoe ...,Offensive language detected
2,rt urkindofbrand dawg rt ever fuck bitch sta...,Offensive language detected
3,rt cganderson vivabas look like tranni,Offensive language detected
4,rt shenikarobert shit hear might true might f...,Offensive language detected
...,...,...
24778,yous muthafin lie coreyemanuel right tl tras...,Offensive language detected
24779,youv gone broke wrong heart babi drove redneck...,No hate and offensive speech
24780,young buck wanna eat dat nigguh like aint fuck...,Offensive language detected
24781,youu got wild bitch tellin lie,Offensive language detected


## Training the model

In [113]:
x = np.array(df['tweet'])
y = np.array(df['labels'])

cv = CountVectorizer()
x = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.33, random_state = 42)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [114]:
pred = clf.predict(X_test)

In [115]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,pred)

In [116]:
accuracy

0.8738232057708767

## Making Predictions

In [119]:
test_data = "you are bad i don't like you"
df = cv.transform([test_data]).toarray()
print(clf.predict(df))

['Offensive language detected']
