In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, fname) for fname in files):
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the two competition CSVs from the Kaggle input mount.
# train.csv is consumed below through its 'tweet' (text) and 'label' (0/1) columns.
train_orig=pd.read_csv("/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv")
# test.csv is only read through its 'tweet' column and is never scored in this notebook
# (presumably it ships without labels, hence the name -- confirm against the dataset).
test_nolabel=pd.read_csv("/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv")

**Let us do some pre-processing. Without preprocessing, the results are as follows. (Feel free to skip these metrics for now; they are explained at the end of the notebook.)**
<pre>
               precision    recall  f1-score   support
 
            0       0.95      1.00      0.97     14880
            1       0.85      0.35      0.49      1101
 
     accuracy                           0.95     15981
    macro avg       0.90      0.67      0.73     15981
 weighted avg       0.95      0.95      0.94     15981
 
 [[14815    65]
 [  718   383]]
</pre>

**New metrics (after preprocessing):**
<pre>
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14848
           1       0.88      0.40      0.55      1133

    accuracy                           0.95     15981
   macro avg       0.92      0.70      0.76     15981
weighted avg       0.95      0.95      0.95     15981

[[14786    62]
 [  683   450]]
</pre>

**New report with stratification enabled. Shows further improvement in results.**
<pre>
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14860
           1       0.89      0.42      0.57      1121

    accuracy                           0.96     15981
   macro avg       0.92      0.71      0.77     15981
weighted avg       0.95      0.96      0.95     15981

[[14800    60]
 [  650   471]]
</pre>

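**Classification report after upsampling the minority class. Look at the updated values for label 1.**

*Caveat: this report was measured on a test split drawn from the already-upsampled data (note the equal support of 14860 per class), so copies of the same minority tweets can sit in both the training and the test half. The label-1 scores are therefore optimistic.*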
<pre>
              precision    recall  f1-score   support

           0       0.98      0.91      0.94     14860
           1       0.92      0.98      0.95     14860

    accuracy                           0.94     29720
   macro avg       0.95      0.94      0.94     29720
weighted avg       0.95      0.94      0.94     29720

[[13542  1318]
 [  345 14515]]
</pre>

In [None]:
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import re
# English stop words, held in a set for fast membership tests in remove_stopwords().
stop_words = set(stopwords.words('english'))

# Work on an independent copy: a plain `train = train_orig` only aliases the same
# DataFrame, so cleaning `train` would also overwrite the raw text in `train_orig`
# and make this cell non-idempotent on re-run.
train = train_orig.copy()

def remove_stopwords(line):
    """Return ``line`` with English stop words removed.

    The text is split with NLTK's ``word_tokenize``; every token that is a
    member of the module-level ``stop_words`` set is discarded and the
    remaining tokens are re-joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(line):
        if token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

def preprocess(line):
    """Normalise one tweet for bag-of-words encoding.

    Steps, in order: lowercase the text, delete every run of digits, delete
    all characters listed in ``string.punctuation``, then drop English stop
    words via ``remove_stopwords``. Returns the cleaned string.
    """
    lowered = line.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    # Translation table that maps every punctuation character to "deleted".
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)
    return remove_stopwords(without_punctuation)
# Clean every tweet with a single column assignment.
# Writing element by element through `train.tweet[i] = ...` is chained assignment:
# it raises SettingWithCopyWarning on older pandas, does not write back to the frame
# at all under Copy-on-Write, and also assumes the index is exactly 0..n-1.
train["tweet"] = train["tweet"].map(preprocess)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 50/50 hold-out split of the cleaned tweets. The fixed seed makes the
# split -- and therefore every metric computed from it -- reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train['tweet'], train['label'],
    test_size=0.5, stratify=train['label'], random_state=123)

# Per-class views of the cleaned data (label 1 / label 0), used for inspection below.
trainp=train[train.label==1]
trainn=train[train.label==0]
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
trainp.info()
trainn.info()

In [None]:
# Let us balance the dataset
train_imbalanced = train
from sklearn.utils import resample

# Hold out the evaluation half BEFORE any resampling. Upsampling the whole frame
# first and splitting afterwards places copies of the same minority tweets on both
# sides of the split, which leaks training rows into the test set and inflates the
# label-1 scores. The fixed seed keeps the split reproducible.
train_part, test_part = train_test_split(train_imbalanced,
                                         test_size=0.5,
                                         stratify=train_imbalanced['label'],
                                         random_state=123)

df_majority = train_part[train_part.label==0]
df_minority = train_part[train_part.label==1]

# Upsample minority class (training half only)
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display class counts of the training half before and after upsampling
print("Before")
print(train_part.label.value_counts())
print("After")
print(df_upsampled.label.value_counts())

# Train on the balanced training half; evaluate on the untouched hold-out half,
# which keeps the original class balance.
X_train, y_train = df_upsampled['tweet'], df_upsampled['label']
X_test, y_test = test_part['tweet'], test_part['label']

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default hyper-parameters.
# It is fitted further down on the token-count matrix built from X_train.
model = MultinomialNB()

**Convert text data to numerical data**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words encoder. The vocabulary is learned from the training tweets only;
# both splits are then encoded against that same vocabulary.
vect = CountVectorizer()
vect.fit(X_train)
tf_train = vect.transform(X_train)
tf_test = vect.transform(X_test)

In [None]:
# Clean the unlabeled tweets with the same preprocess() that was applied to the
# training text before encoding them. Without it, tokens are formed differently
# (e.g. words with digits or inner punctuation) and miss the learned vocabulary.
# The raw text in test_nolabel is left unchanged for display.
tf_test_nolabel=vect.transform(test_nolabel.tweet.map(preprocess))

In [None]:
# print(tf_train)
# vect.get_feature_names()[:10] #print few features only to avoid slowing down the notebook

In [None]:
# Train the classifier on the encoded training tweets and their labels.
model.fit(tf_train, y_train)

In [None]:
# Predict the held-out tweets and keep their true labels alongside for scoring.
predicted = model.predict(tf_test)
expected = y_test

In [None]:
from sklearn import metrics

# Per-class precision / recall / F1, followed by the raw confusion matrix.
report = metrics.classification_report(expected, predicted)
conf_matrix = metrics.confusion_matrix(expected, predicted)
print(report)
print(conf_matrix)

In [None]:
from mlxtend.plotting import plot_confusion_matrix

# Draw the confusion matrix of the held-out predictions as a heat map.
cm = metrics.confusion_matrix(expected, predicted)
plot_confusion_matrix(cm)

In [None]:
# First ten rows of each class: label 1 printed as text, label 0 as the cell's output.
print(trainp.head(10))
trainn.head(10)

In [None]:
# Show the first seven held-out tweets next to their predicted labels
# (kept short to avoid a lot of printing and slowing down the notebook).
gg = X_test.reset_index(drop=True)
for tweet, label in zip(gg.iloc[:7], predicted):
    print(tweet + " - " + str(label))

In [None]:
predicted_nolabel=model.predict(tf_test_nolabel)
# Pair each tweet with its prediction directly. Enumerating the sparse feature
# matrix only to obtain a counter materialises one row object per step that is
# never used, and `tweet[i]` is a label lookup that depends on a 0..n-1 index.
# Only the first seven rows are shown, to avoid a lot of printing.
for tweet, label in zip(test_nolabel.tweet.iloc[:7], predicted_nolabel):
    print (tweet + " - " + str(label))

In [None]:
test_custom=pd.DataFrame(["racist", "white judge trial", "it is a horrible incident", "@user #white #supremacists want everyone to see the new â  #birdsâ #movie â and hereâs why", " @user #white #supremacists want everyone to see the new â  #birdsâ #movie â and hereâs why", "@user  at work: attorneys for white officer who shot #philandocastile remove black judge from presiding over trial. htâ¦"])
# Run the hand-written examples through the same preprocess() used on the training
# text, so they are tokenised the way the vectorizer's vocabulary was built.
tf_custom = vect.transform(test_custom[0].map(preprocess))
model.predict(tf_custom)