# **Detection of Hatred Speech**

---





## **Loading** **Data**






In [None]:
import pandas as pd
train = pd.read_csv('train.csv')
print("Training Set:"% train.columns, train.shape, len(train))
test = pd.read_csv('test.csv')
print("Test Set:" % test.columns, test.shape, len(test))
# dataset.head()
train

Training Set: (31962, 3) 31962
Test Set: (17197, 2) 17197


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


# **Data** **Cleaning**

In [None]:
import re
def  clean_text(df, text):
    df[text] = df[text].str.lower()
    df[text] = df[text].apply(lambda el: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", el))  
    return df
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")
train_clean

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i cant use cause they...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation
...,...,...,...
31957,31958,0,ate isz that youuu
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,sikh temple vandalised in in calgary wso cond...


# **Handling Imbalanced Data**


In [None]:
from sklearn.utils import resample
maj=train_clean[train_clean['label']==0]
min=train_clean[train_clean.label==1]
min_train=resample(min,replace=True,n_samples=len(maj),random_state=0)
train=pd.concat([min_train,maj])
train.label.value_counts()
train

Unnamed: 0,id,label,tweet
23718,23719,1,you might be a libtard if libtard sjw libera...
11832,11833,1,black judge quietly removed from philandocasti...
10849,10850,1,feminismiscancer feminismisterrorism feminism...
24771,24772,1,japan abe govt oppress freedomofspeech than ha...
14835,14836,1,when they call us homophobic misogynist th...
...,...,...,...
31956,31957,0,off fishing tomorrow carnt wait first time in...
31957,31958,0,ate isz that youuu
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...


# **Creating Pipelines**



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.linear_model import SGDClassifier
pipeline_sgd = Pipeline([
    ('vect', TfidfVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('n', SGDClassifier())])


from sklearn.linear_model import LogisticRegression
pipeline_lr = Pipeline([
    ('vect', TfidfVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('n', LogisticRegression())])

from sklearn.ensemble import RandomForestClassifier
pipeline_rfc = Pipeline([
    ('vect', TfidfVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('n', RandomForestClassifier())])

from sklearn.neighbors import KNeighborsClassifier
pipeline_knn = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('n', KNeighborsClassifier(n_neighbors=5))])

from sklearn.svm import SVC
pipeline_svc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('n', SVC(C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True, probability=False,tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, random_state=None))])

# **Training Model**

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.tweet,train.label,random_state = 0)

## ***Using SGDClassifier***

In [None]:
%%time
model=pipeline_sgd.fit(X_train,y_train)
y_predict = model.predict(X_test)

print(f1_score(y_test, y_predict))

0.9781168440072274
CPU times: user 1.03 s, sys: 94 ms, total: 1.12 s
Wall time: 1.01 s


## ***Using LogisticRegression***

In [None]:
%%time
model=pipeline_lr.fit(X_train,y_train)
y_predict = model.predict(X_test)

print(f1_score(y_test, y_predict))

0.9808321645628799
CPU times: user 1.6 s, sys: 834 ms, total: 2.44 s
Wall time: 1.66 s


## ***Using RandomForestClassifier***

In [None]:
%%time
model=pipeline_rfc.fit(X_train,y_train)
y_predict = model.predict(X_test)

print(f1_score(y_test, y_predict))

0.9985773321590679
CPU times: user 39.7 s, sys: 30 ms, total: 39.7 s
Wall time: 39.8 s


## ***Using KNeighborsClassifier***

In [None]:
%%time
model=pipeline_knn.fit(X_train,y_train)
y_predict = model.predict(X_test)

print(f1_score(y_test, y_predict))

0.9785510326050867
CPU times: user 15.7 s, sys: 103 ms, total: 15.8 s
Wall time: 15.8 s


## ***Using SVC***

In [None]:
%%time
model=pipeline_svc.fit(X_train,y_train)
y_predict = model.predict(X_test)

print(f1_score(y_test, y_predict))

0.9977661950856292
CPU times: user 8min 59s, sys: 164 ms, total: 8min 59s
Wall time: 9min
