#Hate Speech Detection


In [10]:
import pandas as pd
import re
from sklearn.utils import resample

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split

## Dataset

In [18]:
train = pd.read_csv('/content/train.csv') # Training Dataset
train

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."



 **Label = 1 means tweet is categorised as a Hate speech**

**Label = 1 means tweet is not categorised as a Hate speech** 

In [19]:
test = pd.read_csv('/content/test.csv')   #Test Dataset
test

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
...,...,...
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."


In [5]:
#function to return a clean text
def  cleaning_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df


In [20]:
#Cleaning
test_clean  = cleaning_text(test, "tweet")
train_clean = cleaning_text(train,"tweet")

train_clean

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i cant use cause they...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation
...,...,...,...
31957,31958,0,ate isz that youuu
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,sikh temple vandalised in in calgary wso cond...


In [27]:
train_clean["tweet"][0]

'  when a father is dysfunctional and is so selfish he drags his kids into his dysfunction   run'

## Resampling

In [21]:

train_major = train_clean[train_clean.label==0]
train_minor = train_clean[train_clean.label==1]

#upsampling the Data

train_minor_upsampled = resample(train_minor,replace=True,n_samples=len(train_major),random_state=123)
train_upsampled = pd.concat([train_minor_upsampled, train_major])

#The upsampled data

train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

## Building the pipeline for SGDClassifier

In [11]:
pipeline_SGD = Pipeline(
    [('vect', CountVectorizer()),   # CountVectorizer
     ('tfidf',  TfidfTransformer()),    #tfidf
     ('nb', SGDClassifier()),])          #SGDClassifier

## Train Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    train_upsampled['tweet'],
    train_upsampled['label'],
    random_state = 0)

## Training model

In [14]:
model = pipeline_SGD.fit(X_train, y_train)

In [22]:
y_predict = model.predict(X_test)
y_predict[0:10]             #Predicting 10 tweets

array([0, 0, 0, 0, 1, 1, 0, 0, 1, 0])

In [17]:
#Calculating f1_score using the sklearn library
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.9694020398640091