# Import Data 

In [31]:
import pandas as pd 

In [32]:
train= pd.read_csv('train.csv')
print("training set :"% train.columns, train.shape , len(train))

training set : (31962, 3) 31962


In [33]:
test=pd.read_csv('test.csv')
print("test set:"% test.columns , test.shape, len(test))

test set: (17197, 2) 17197


# Data Cleaning

In [35]:
import re 
def clean_text(df, text_field):
    df[text_field]=df[text_field].str.lower()
    df[text_field]=df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem) )
    return df
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

In [37]:
print(train_clean)

          id  label                                              tweet
0          1      0    when a father is dysfunctional and is so sel...
1          2      0    thanks for lyft credit i cant use cause they...
2          3      0                                bihday your majesty
3          4      0  model   i love u take with u all the time in u...
4          5      0               factsguide society now    motivation
...      ...    ...                                                ...
31957  31958      0                               ate  isz that youuu 
31958  31959      0    to see nina turner on the airwaves trying to...
31959  31960      0  listening to sad songs on a monday morning otw...
31960  31961      1   sikh temple vandalised in in calgary wso cond...
31961  31962      0                        thank you  for you follow  

[31962 rows x 3 columns]


# Handling imbalanced data for hate speech detection model

In [38]:
from sklearn.utils import resample
train_majority=train_clean[train_clean.label==0]
train_minority=train_clean[train_clean.label==1]
train_minority_unsampled= resample(train_minority,
                                   replace= True,
                                   n_samples=len(train_majority),
                                   random_state=123)
train_unsampled= pd.concat([train_minority_unsampled,train_majority])
train_unsampled['label'].value_counts()



1    29720
0    29720
Name: label, dtype: int64

# Creating a pipeline

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
pipline_sgd= Pipeline([ ('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('nb', SGDClassifier())])

# Training the Hate speech Detection

In [41]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test= train_test_split(train_unsampled['tweet'],
                                                  train_unsampled['label'],random_state=0)

#  Predict the result 

In [44]:
model = pipline_sgd.fit(X_train, y_train)
y_predict= model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_predict)


0.9694061187762448