In [1]:
# Data Link-- https://www.kaggle.com/arkhoshghalb/twitter-sentiment-analysis-hatred-speech

## IMPORT LIBRARIES...

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import re
import nltk
from tqdm import tqdm
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
#loading the data
df=pd.read_csv('../input/twitter-sentiment-analysis-hatred-speech/train.csv')

In [None]:
print(df.shape)   #check the shape of the data

In [None]:
print(df.head())

In [None]:
print(df.info())

In [None]:
df['label'].value_counts()

In [None]:
plt.style.use('ggplot')

In [None]:
# sn.set(font_scale=1.5)
sn.set_style('whitegrid');
ax=sn.barplot(x=['Pos','Neg'], y=df.label.value_counts());
ax.set_title('DISTRIBUTION IN DATASET ACCORDING TO THE SENTIMENT',loc='center', pad=20, fontdict={'fontsize': 15,
        'fontweight': 'bold',
        'color': 'black',
        'verticalalignment': 'baseline',
        });
ax.set(xlabel='Tweets Sentiments', ylabel='No. of Tweets');
plt.show()
ax.legend()
ax.figure.savefig('Total-positive-negative-counts.png',pad_inches=5)

We can observe that there are more reviews with 0 label i.e. tweet is not racist/sexist.<br>
So our dataset is imbalanced

In [None]:
y_value_counts=df['label'].value_counts()
print("Negative tweets  = ",y_value_counts[1], "with percentage ", (y_value_counts[1]*100)/(y_value_counts[0]+y_value_counts[1]),'%')
print("Positive tweets  = ",y_value_counts[0], "with percentage ", (y_value_counts[0]*100)/(y_value_counts[0]+y_value_counts[1]),'%')

In [None]:
# sn.set(font_scale=1.5)
sn.set_style('whitegrid');
ax=sn.barplot(x=['Pos','Neg'], y=df.label.value_counts()*100/df.label.value_counts().sum(), palette='Greys_d');
ax.set_title('% DISTRIBUTION IN DATASET ACCORDING TO THE SENTIMENT',loc='center', pad=20, fontdict={'fontsize': 15,
        'fontweight': 'bold',
        'color': 'black',
        'verticalalignment': 'baseline',
        });
ax.set(xlabel='Tweets Sentiments', ylabel='No. of Tweets');
plt.show()
ax.legend()
ax.figure.savefig('Total-positive-negative-counts.png',pad_inches=5)

In [None]:
#lets see the classes through bar graph
data=dict(negative=y_value_counts[1],positive=y_value_counts[0])
cls=data.keys()
value=data.values()

plt.bar(cls,value,color='maroon',width=0.2)

***From the bar graph we can clearly see that there are more not racist tweets than the racist tweets.***

## Data Preprocessing

Since the data is in text format, we have to preprocess the data and clean the data to vectorize the data.

First we will replace the all blank spaces, - with underscore and convert all the letters to lower case.

In [None]:
df['tweet']=df['tweet'].str.replace(' ','_')
df['tweet']=df['tweet'].str.replace('-','_')
df['tweet']=df['tweet'].str.lower()

In [None]:
df.tweet[:10]

In [None]:
def expand(sent):
    "This function will replace english short notations with full form"
    
    sent=re.sub(r"can't", "can not",sent)
    sent=re.sub(r"won't", "will not",sent)
    
    sent=re.sub(r"n\'t", " not",sent)
    sent=re.sub(r"\'re", " are",sent)
    sent=re.sub(r"\'m"," am",sent)
    sent=re.sub(r"\'s"," is",sent)
    sent=re.sub(r"\'ll"," will",sent)
    sent=re.sub(r"\'ve"," have",sent)
    sent=re.sub(r"\'d"," would",sent)
    sent=re.sub(r"\'t", " not",sent)
    
    return sent
    

In [None]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [None]:
def preprocess_tweet(text):
    "function for preprocess the text data"
    
    preprocessed_tweet=[]
    
    for sentence in tqdm(text):
        sent=expand(sentence)
        sent=sent.replace("\\r"," ")
        sent=sent.replace("\\n"," ")
        sent=sent.replace('\\"'," ")
        sent=re.sub("[^A-Za-z0-9]+"," ",sent)
        
        # https://gist.github.com/sebleier/554280
        sent=" ".join(i for i in sent.split() if i.lower() not in stopwords)
        preprocessed_tweet.append(sent.lower().strip())
        
    return preprocessed_tweet
        

In [None]:
preprocessed_tweets=preprocess_tweet(df['tweet'].values)

In [None]:
df['tweet']=preprocessed_tweets

In [None]:
df["tweet"][10]

In [None]:
word_list=[]
for i in df["tweet"]:
    for j in i.split(' '):
        word_list.append(j)
    

In [None]:
counter = Counter(word_list)
top10 = counter.most_common(11)

In [None]:
top10

In [None]:
xaxes = [i[1] for i in top10]
yaxes = [i[0] for i in top10]

In [None]:
# sn.set(font_scale=1.5)
sn.set_style('whitegrid');
sn.set(rc={'figure.figsize':(11.7,8.27)})
ax=sn.barplot(x=xaxes, y=yaxes);
ax.set_title('TOP WORDS IN BUILT WORDLIST',loc='center', pad=20, fontdict={'fontsize': 15,
        'fontweight': 'bold',
        'color': 'black',
        'verticalalignment': 'baseline',
        });
# ax.set(xlabel='Tweets Sentiments', ylabel='No. of Tweets');
plt.show()
ax.legend()
ax.figure.savefig('top10wordscount.png',pad_inches=5)

In [None]:
df[df['label']==0].tweet

In [None]:
positive_word=[]
negative_word=[]
for i in df[df['label']==0].tweet:
    for j in i.split(' '):
        positive_word.append(j)
for i in df[df['label']==1].tweet:
    for j in i.split(' '):
        negative_word.append(j)

In [None]:
positive_counter = Counter(positive_word)
negative_counter = Counter(negative_word)

In [None]:
positive_counter.most_common(10),negative_counter.most_common(10)

In [None]:
negative_counter.get('friday')

In [None]:
w = ['good','bad','information']
y1 = []
y2=[]
for i in w:
    y1.append(positive_counter.get(i))
    y2.append(negative_counter.get(i))

In [None]:
y2

In [None]:

X_axis = np.arange(len(w))
  
plt.bar(X_axis-0.2 , y1, 0.4, label = 'Pos')
plt.bar(X_axis+0.2 , y2, 0.4, label = 'Neg')
  
plt.xticks(X_axis, w)
plt.xlabel("Words", fontdict={'family': 'serif',
        'color':  'darkred',
        'weight': 'normal',
        'size': 16,
        })
plt.ylabel("Number of Words", fontdict={'family': 'serif',
        'color':  'darkred',
        'weight': 'normal',
        'size': 16,
        })
plt.title("Most Common Words Across Sentiments", fontdict={'family': 'serif',
        'color':  'darkblue',
        'weight': 'bold',
        'size': 18,
        })
plt.legend()
plt.show()

In [None]:
positive_counter.most_common(10), negative_counter.most_common(10)

##### Now the text data is cleaned

### Splitting data into train and test

In [None]:
y=df['label']
x=df.drop(['label'],axis=1)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=40)


In [None]:
x_test

### Vectorization...

#### TFIDF for text data

In [None]:
vect=TfidfVectorizer(min_df=10)

vect.fit(x_train['tweet'].values)

train_tweet=vect.transform(x_train['tweet'].values)
test_tweet=vect.transform(x_test['tweet'].values)

print(train_tweet.shape,y_train.shape)
print(test_tweet.shape,y_test.shape)

In [None]:
#calculating sentiment scores for train data
x_train_sent=np.ndarray.tolist(x_train["tweet"].values)

sia=SentimentIntensityAnalyzer()
ps=[]
for i in range(len(x_train_sent)):
    ps.append((sia.polarity_scores((x_train_sent[i]))))
    
x_train_polarity=np.array(ps)
x_train_polarity=x_train_polarity.reshape(-1,1)
x_train_polarity.shape


In [None]:
#storing only scores of sentiment
x_t=[]
for i in range(len(x_train)):
    for j in x_train_polarity[0][0]:
        x_t.append(x_train_polarity[i][0][j])
x_t=np.array(x_t)
x_t=x_t.reshape(-1,4)
x_t.shape

In [None]:
#calculating sentiment scores for test data
x_test_sent=np.ndarray.tolist(x_test["tweet"].values)

sia=SentimentIntensityAnalyzer()
ps=[]
for i in range(len(x_test_sent)):
    ps.append((sia.polarity_scores((x_test_sent[i]))))
    
x_test_polarity=np.array(ps)
x_test_polarity=x_test_polarity.reshape(-1,1)
x_test_polarity.shape


In [None]:
#storing only scores of sentiment
x_tests=[]
for i in range(len(x_test)):
    for j in x_test_polarity[0][0]:
        x_tests.append(x_test_polarity[i][0][j])
x_tests=np.array(x_tests)
x_tests=x_tests.reshape(-1,4)
x_tests.shape

##### Convert the vectors into scipy.sparse matrix

In [None]:
from scipy.sparse import hstack

In [None]:
x_tr=hstack((train_tweet,x_t))
x_te=hstack((test_tweet,x_tests))

print(x_tr.shape)
print(x_te.shape)

In [None]:
print(test_tweet)

***Now we are ready with the data.***

### DecisionTreeClassifier()

In [None]:
wt={0:1,1:5}            #since the data is imbalanced , we assign some more weight to class 1

clf=DecisionTreeClassifier(class_weight=wt)

parameters=dict(max_depth=[1,5,10,50],min_samples_split=[5,10,100,500])

search=RandomizedSearchCV(clf,parameters,random_state=10)
result=search.fit(x_tr,y_train)
result.cv_results_

In [None]:
search.best_params_

In [None]:
cls = DecisionTreeClassifier(max_depth=50,min_samples_split=5,random_state=10,class_weight=wt)
cls.fit(x_tr,y_train)

In [None]:
y_pred_train=cls.predict(x_tr)
y_pred_test=cls.predict(x_te)

In [None]:
train_fpr,train_tpr,tr_treshold=roc_curve(y_train,y_pred_train)
test_fpr,test_tpr,te_treshold=roc_curve(y_test,y_pred_test)

train_auc=auc(train_fpr,train_tpr)
test_auc=auc(test_fpr,test_tpr)

plt.plot(train_fpr,train_tpr,label='Train AUC = '+str(train_auc))
plt.plot(test_fpr,test_tpr,label='Test AUC = '+str(test_auc))
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("AUC_Curve")
plt.grid()
plt.show()

***We got auc score= 0.7625***

In [None]:
def find_best_threshold(threshold, fpr, tpr):
    """it will give best threshold value that will give the least fpr"""
    t = threshold[np.argmax(tpr*(1-fpr))]
    
    # (tpr*(1-fpr)) will be maximum if your fpr is very low and tpr is very high
    print("the maximum value of tpr*(1-fpr)", max(tpr*(1-fpr)), "for threshold", np.round(t,3))
    
    return t

def predict_with_best_t(proba, threshold):
    """this will give predictions based on best threshold value"""
    predictions = []
    for i in proba:
        if i>=threshold:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions

In [None]:
#computing confusion matrix for set_1

from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_treshold, train_fpr, train_tpr)
print("Train confusion matrix")
m_tr=(confusion_matrix(y_train, predict_with_best_t(y_pred_train, best_t)))
print(m_tr)
print("Test confusion matrix")
m_te=(confusion_matrix(y_test, predict_with_best_t(y_pred_test, best_t)))
print(m_te)

In [None]:
print(classification_report(y_test, y_pred_test))

In [None]:
df3 = pd.DataFrame(classification_report(y_pred_test, 
                                        y_test, digits=2,
                                        output_dict=True)).T

df3['support'] = df1.support.apply(int)

df3.style.background_gradient(cmap='viridis',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])


In [None]:
dt_df = pd.DataFrame(columns=['tweet', 'sentiment-predicted', 'label'])

In [None]:
def color_negative_red(value):
    """
    Colors elements in a dateframe
    green if positive and red if
    negative. Does not color NaN
    values.
    """

    if value == 'Pos':
        color = 'green'
    else:
        color='red'

    return 'color: %s' % color

In [None]:
dt_df = pd.DataFrame(columns=['tweet', 'sentiment-predicted', 'label'])
dt_df['tweet'] = x_test['tweet']
dt_df['sentiment-predicted'] = y_pred_test
dt_df['label'] =y_test
dt_df.replace(to_replace=[0,1],value=['Pos','Neg'], inplace=True)
(dt_df.sample(10)[['tweet','sentiment-predicted','label']].style
    .applymap(color_negative_red, subset=['sentiment-predicted','label']))

## NAIVE BAYES

### CountVectorizer()

In [None]:
vec=CountVectorizer(min_df=10)
vec.fit(x_train['tweet'].values)

x_tr_count=vec.transform(x_train['tweet'].values)
x_te_count=vec.transform(x_test['tweet'].values)
x_tr_count.shape

In [None]:
x_tr_data=hstack((x_tr_count,x_t))
x_te_data=hstack((x_te_count,x_tests))

x_trn=scipy.sparse.csr_matrix(x_tr_count)
x_tst=scipy.sparse.csr_matrix(x_te_count)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
mod = MultinomialNB()
mod.fit(x_trn,y_train)

In [None]:
train_pred=mod.predict(x_trn)
test_pred=mod.predict(x_tst)

In [None]:
train_fpr,train_tpr,tr_treshold=roc_curve(y_train,train_pred)
test_fpr,test_tpr,te_treshold=roc_curve(y_test,test_pred)

train_auc=auc(train_fpr,train_tpr)
test_auc=auc(test_fpr,test_tpr)

plt.plot(train_fpr,train_tpr,label='Train AUC = '+str(train_auc))
plt.plot(test_fpr,test_tpr,label='Test AUC = '+str(test_auc))
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("AUC_Curve")
plt.grid()
plt.show()

***TEST AUC = 0.8157***

In [None]:
#get the summary of this model

print(classification_report(test_pred, y_test))

In [None]:
df1 = pd.DataFrame(classification_report(mod.predict(x_tst), 
                                        y_test, digits=2,
                                        output_dict=True)).T

df1['support'] = df1.support.apply(int)

df1.style.background_gradient(cmap='viridis',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])


In [None]:
dt_df = pd.DataFrame(columns=['tweet', 'sentiment-predicted', 'label'])
dt_df['tweet'] = x_test['tweet']
dt_df['sentiment-predicted'] = test_pred
dt_df['label'] =y_test
dt_df.replace(to_replace=[0,1],value=['Pos','Neg'], inplace=True)
(dt_df.sample(10)[['tweet','sentiment-predicted','label']].style
    .applymap(color_negative_red, subset=['sentiment-predicted','label']))

## XGBOOST

In [None]:
from xgboost import XGBClassifier

In [None]:
y_train.value_counts()

#### Hyperparameter Tuning

In [None]:
xg=XGBClassifier(use_label_encoder=False)
param=dict(max_depth=[4,6,8,10],n_estimators=[100,500,1000,1500])
search=RandomizedSearchCV(xg,param,random_state=10,)
srch=search.fit(x_tr,y_train,)
srch.cv_results_

In [None]:
srch.best_estimator_

In [None]:
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None,).fit(x_tr, y_train)

prediction = xgb.predict(x_te) 

f1_score(y_test, prediction)

In [None]:
train_prediction=xgb.predict(x_tr)

In [None]:
train_fpr,train_tpr,tr_treshold=roc_curve(y_train,train_prediction)
test_fpr,test_tpr,te_treshold=roc_curve(y_test,prediction)

train_auc=auc(train_fpr,train_tpr)
test_auc=auc(test_fpr,test_tpr)

plt.plot(train_fpr,train_tpr,label='Train AUC = '+str(train_auc))
plt.plot(test_fpr,test_tpr,label='Test AUC = '+str(test_auc))
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title("AUC_Curve")
plt.grid()
plt.show()

***TEST AUC = 0.7280***

In [None]:
print(classification_report(y_test, prediction))

In [None]:
df2 = pd.DataFrame(classification_report(y_test, prediction, digits=2,
                                        output_dict=True)).T

df2['support'] = df2.support.apply(int)

df2.style.background_gradient(cmap='viridis',
                             subset=pd.IndexSlice['0':'1', :'f1-score'])


In [None]:
dt_df = pd.DataFrame(columns=['tweet', 'sentiment-predicted', 'label'])
dt_df['tweet'] = x_test['tweet']
dt_df['sentiment-predicted'] = prediction
dt_df['label'] =y_test
dt_df.replace(to_replace=[0,1],value=['Pos','Neg'], inplace=True)
(dt_df.sample(10)[['tweet','sentiment-predicted','label']].style
    .applymap(color_negative_red, subset=['sentiment-predicted','label']))

## SUMMARY

### DECISION TREE

                   precision    recall  f1-score   support

              0       0.97      0.95      0.96      5945
              1       0.48      0.57      0.52       448

    accuracy                               0.93      6393
    macro avg          0.72      0.76      0.74      6393
    weighted avg       0.93      0.93      0.93      6393


### NAIVE BAYES

                   precision    recall  f1-score   support

              0       0.97      0.97      0.97      5945
              1       0.63      0.66      0.64       448

    accuracy                               0.95      6393
    macro avg          0.80      0.82      0.81      6393
    weighted avg       0.95      0.95      0.95      6393


### XGBOOST

                     precision    recall  f1-score   support

               0       0.96      0.99      0.98      5945
               1       0.77      0.47      0.58       448

    accuracy                               0.95      6393
    macro avg          0.87      0.73      0.78      6393
    weighted avg       0.95      0.95      0.95      6393

​

|MODEL|TEST AUC|
|----|----|
|DECISION TREE|0.7625|
|NAIVE BAYES|0.8157|
|XGBOOST|0.7280|

In [None]:
training_size = 25569
XGB  = ['XGBOOST',25569,'NA',0.7280, 0.95]
nb  = ['NAIVE BAYES',25569,'NA',0.8157, 0.95]
XGB  = ['DECISION TREE',25569,'NA',0.7625, 0.93]

In [None]:
pd.DataFrame