### Mayank Singh
### s2002mayank@gmail.com
### github link: https://github.com/s2002mayank/hate-Speech-detection

### Hate speech detection is a critical task in natural language processing (NLP) aimed at identifying and mitigating harmful content online.
### This project involves preprocessing text data, feature extraction, training machine learning models, and evaluating their performance in detecting hate speech.

In [1]:
# !pip install nltk==3.8.1
# nltk.download('punkt')
# nltk.download('stopwords')


In [2]:
import numpy as np
import pandas as pd

In [3]:
dataset=pd.read_csv("twitter.csv")
dataset

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
24778,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,6,0,6,0,1,youu got wild bitches tellin you lies


In [4]:
dataset.isnull().sum()

count                       0
hate_speech_count           0
offensive_language_count    0
neither_count               0
class                       0
tweet                       0
dtype: int64

In [5]:
dataset['labels']= dataset['class'].map({0: "hate speech or offensive language",
                                         1: "hate speech or offensive language",
                                         2: "no hate or offensive language"})

In [6]:
data= dataset[["tweet", "labels"]]

In [7]:
import re 
import nltk
import string

In [8]:
stopwords = set(nltk.corpus.stopwords.words("english"))


In [9]:
stopwords.add("rt")
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [10]:
stemmer= nltk.SnowballStemmer("english")

In [11]:
def data_clean(text):
    text=str(text).lower()
    # text=re.sub("https?://S+www\.S+", "", text)
    # text=re.sub("\[.*?\]","", text)
    # text=re.sub("<.*?>+","", text)
    # text=re.sub("[%s]"% re.escape(string.punctuation),"", text)
    text=re.sub("\n","", text)
    # text=re.sub("\w*\d\w*","", text)
    words= [stemmer.stem(word) for word in text.split(' ') if word and word not in stopwords]
    text=" ".join(words)
    return text
    

In [12]:
data["tweet"]=data["tweet"].apply(data_clean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet"]=data["tweet"].apply(data_clean)


In [13]:
X=np.array(data["tweet"])
y=np.array(data["labels"])

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [15]:
cv=CountVectorizer()
X=cv.fit_transform(X)

In [16]:
X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 222772 stored elements and shape (24783, 34382)>

In [17]:
x_train, x_test, y_train, y_test=train_test_split(X,y, test_size=0.3)

In [18]:
from sklearn.tree import DecisionTreeClassifier

In [19]:
dt=DecisionTreeClassifier()
dt.fit(x_train, y_train)

In [20]:
y_pred=dt.predict(x_test)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix

In [22]:
cm=confusion_matrix(y_test,  y_pred)
cm

array([[5994,  200],
       [ 225, 1016]])

In [23]:
print(classification_report(y_test, y_pred))

                                   precision    recall  f1-score   support

hate speech or offensive language       0.96      0.97      0.97      6194
    no hate or offensive language       0.84      0.82      0.83      1241

                         accuracy                           0.94      7435
                        macro avg       0.90      0.89      0.90      7435
                     weighted avg       0.94      0.94      0.94      7435

