## Project Overview

Social media use is growing every day, and the volume of hate speech is growing along with the number of users. It is a tough challenge for companies to monitor every tweet manually, so we are developing a machine learning model that identifies hate-speech tweets automatically, which saves companies a lot of resources.

<i>Please upvote  and share if this helps you!! Also, feel free to fork this kernel to play around with the code and test it for yourself.</i>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
import string
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Path of the training CSV; it is read into the working DataFrame `df`.
train_csv_path = "../input/twitter-sentiment-analysis-hatred-speech/train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the frame.
df.info()

**Observations:**
* There are `0` null values in the data

In [None]:
# Remove the "id" column from the frame in place.
df.drop(columns=["id"], inplace=True)

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Class distribution: number of tweets per label and each label's share (%).
df_Stat = df.groupby('label')['tweet'].count().reset_index(name='count')
df_Stat['percentage'] = df_Stat['count'].div(df_Stat['count'].sum()).mul(100)
df_Stat

----

In [None]:
# Store the character count of every raw tweet in a new "length" column.
df['length'] = df['tweet'].map(len)
df.head(10)

In [None]:
# Exploratory Data Analysis

# Mean tweet length per label ("length" holds len(tweet), i.e. characters).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
sns.barplot(x='label', y='length', data=df, palette='PRGn')
plt.title('Average Tweet Length (characters) vs Label')
plt.show()

In [None]:
# Bar chart of how many tweets carry each label.
ax = sns.countplot(data=df, x='label', palette="PRGn")
ax.set_title('Label Counts')
plt.show()

**Character count**

In [None]:
# Side-by-side histograms of tweet length (characters), one per class.
# NOTE(review): label 1 is taken to be the hate-speech class (it is the
# positive class scored by recall/precision further down) and label 0 the
# normal class - confirm against the dataset description. The original titles
# were attached to the opposite subsets.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(df[df["label"] == 1]["length"], bins=30, ax=ax1, kde=True).set(title="Hate tweets")
sns.histplot(df[df["label"] == 0]["length"], bins=30, ax=ax2, kde=True).set(title="Normal tweets")
plt.show()

**The two distributions look almost the same: tweets of 90 to 120 characters are the most common in both classes.**

In [None]:
# Fetch every NLTK resource the preprocessing step relies on: the stop-word
# list and the Punkt models behind nltk.word_tokenize. Newer NLTK releases
# look the tokenizer up as "punkt_tab", older ones as "punkt"; nltk.download
# returns False (without raising) for a resource it cannot find, so asking
# for both names is safe on either version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

### Preprocessing the tweet column

In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Built once and reused for every tweet instead of per call / per token.
_STEMMER = PorterStemmer()
_STOP_WORDS = frozenset(stopwords.words('english'))
# Matches @mentions and any character that is not alphanumeric, space or tab.
_NOISE_PATTERN = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])")


def process_tweet(tweet):
    """Return a normalised version of *tweet* for vectorisation.

    The text is lower-cased, @mentions and non-alphanumeric characters are
    replaced by spaces, the result is tokenised, English stop words are
    dropped and the remaining tokens are Porter-stemmed.

    Args:
        tweet: Raw tweet text (a ``str``).

    Returns:
        The surviving stems joined by single spaces (possibly an empty string).
    """
    cleaned = " ".join(_NOISE_PATTERN.sub(" ", tweet.lower()).split())

    stems = []
    for token in nltk.word_tokenize(cleaned):
        # Filter before stemming: the stemmer mangles many stop words
        # ("this" -> "thi", "was" -> "wa"), which would otherwise no longer
        # match the stop-word list and leak into the output.
        if token in _STOP_WORDS:
            continue
        stem = _STEMMER.stem(token)
        # Keep the original post-stemming filter too, so anything that was
        # removed before is still removed.
        if stem in _STOP_WORDS:
            continue
        stems.append(stem)

    return " ".join(stems)

# Run the preprocessing on every tweet and keep the result in a new column.
df["clean_tweet"] = df["tweet"].map(process_tweet)
df.head()

**Most frequent words in the tweets**

In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Draw a word cloud (at most 100 words) built from all cleaned tweets.
text = " ".join(df["clean_tweet"])
cloud_builder = WordCloud(max_font_size=50, max_words=100, background_color="black")
wordcloud = cloud_builder.generate(text)
fig = plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("To Create Cloud of words for all words")
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Learn the TF-IDF vocabulary and weights from the cleaned tweets, then
# encode the same tweets as the feature matrix X.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so IDF statistics of the held-out rows leak into training.
vectorizer = TfidfVectorizer(use_idf=True)
vectorizer.fit(df["clean_tweet"])
X = vectorizer.transform(df["clean_tweet"])

In [None]:
# df1 = pd.DataFrame(X)
# df1.columns = vectorizer.get_feature_names()
# df1.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# List the distinct values present in the target column.
df["label"].unique()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, df["label"], **split_options)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier,BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score

# One default-configured instance of every candidate classifier, plus a list
# of all of them for the comparison loop.
clf_A, clf_B, clf_C, clf_D, clf_E, clf_F = (
    LogisticRegression(),
    AdaBoostClassifier(),
    DecisionTreeClassifier(),
    SVC(),
    RandomForestClassifier(),
    MultinomialNB(),
)
clfs = [clf_A, clf_B, clf_C, clf_D, clf_E, clf_F]

In [None]:
# Fit every candidate on the training split and tabulate its test metrics.
# The rows are gathered in a list and converted to a frame once:
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame
# on every call.
score_columns = ['model', 'recall_score', 'precision_score', 'f1_score', 'accuracy-score']
score_rows = []
for clf in clfs:
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score_rows.append({
        "model": clf.__class__.__name__,
        "recall_score": recall_score(y_test, pred),
        "precision_score": precision_score(y_test, pred),
        "f1_score": f1_score(y_test, pred),
        "accuracy-score": accuracy_score(y_test, pred),
    })

df_score = pd.DataFrame(score_rows, columns=score_columns)

df_score

**Here, the most accurate model is DecisionTreeClassifier.**

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid explored for the decision tree.
parameters = {
    'max_depth': [1,2,5,10,15,50],
    # 'auto' is omitted: for DecisionTreeClassifier it was an alias of 'sqrt'
    # and has since been removed from scikit-learn, so it only duplicated
    # 'sqrt' or made those fits fail.
    'max_features': ['sqrt','log2'],
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    # max_leaf_nodes must be None or an integer greater than 1; a value of 1
    # is rejected by the estimator and every fit using it fails.
    'max_leaf_nodes': [2,5,10],
}

from sklearn.metrics import make_scorer
# Candidates are ranked by recall.
scorer = make_scorer(recall_score)

grid_obj = GridSearchCV(clf_C, parameters, scoring=scorer, cv=5,verbose = 1,n_jobs = -1)
# Run the 5-fold cross-validated search on the training split and keep the
# best estimator found.
grid_fit = grid_obj.fit(X_train, y_train)
best_clf = grid_fit.best_estimator_

In [None]:
# Show the hyper-parameters of the estimator selected by the grid search.
best_clf.get_params()