### Importing the required libraries ###

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import zipfile


### UNZIP files ###

In [None]:
# Extract the zipped training CSV into the current working directory so it
# is available as a plain file.
archive_path = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path=".")


In [None]:
import os

# Print the full path of every file found under the working directory.
for root, _subdirs, names in os.walk('/kaggle/working/'):
    for name in names:
        print(os.path.join(root, name))

### Reading the Train File ###

In [None]:
# Load the extracted training CSV (comment text plus its label columns).
print('Loading in comments...')

data = pd.read_csv("/kaggle/working/train.csv")
# Leave the preview as the cell's last expression so the notebook renders it
# as a formatted table instead of the truncated plain text print() produces.
data.head()

In [None]:
# Feature information: the column names of the training frame.
data.columns

In [None]:
# Data dimensions as (number of rows, number of columns).

data.shape 

In [None]:
# Names of the six label columns; used below for the per-label totals, the
# correlation heatmap and the combined 'block' flag.
cols_target = ['obscene','insult','toxic','severe_toxic','identity_hate','threat']

In [None]:
# Count the missing values in the comment text column.

print(data["comment_text"].isna().sum())

# TODO: drop (dropna) or fill those rows if the count above is non-zero;
# nothing is removed by this cell.

In [None]:
# Summary statistics for the numeric columns. The `count` row shows the number
# of non-missing values per column, so a count below the row total would
# indicate missing values.
data.describe()

In [None]:
# Comments for which none of the six label columns equals 1.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
no_label_mask = (data[label_columns] != 1).all(axis=1)
unlabelled_in_all = data[no_label_mask]
unlabelled_share = len(unlabelled_in_all)/len(data)*100
print('Percentage of unlabelled comments or good comments is ', unlabelled_share)

In [None]:
# Comments for which every one of the six label columns equals 1.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
all_labels_mask = (data[label_columns] == 1).all(axis=1)
labelled_in_all = data[all_labels_mask]
fully_labelled_share = len(labelled_in_all)/len(data)*100
print('Percentage of comments which is present in all categories is ', fully_labelled_share)

In [None]:
# Size of the training set, followed by the number of comments carrying each label.
n_rows = len(data)
print('Total rows in train is {}'.format(n_rows))
per_label_totals = data[cols_target].sum()
print(per_label_totals)

Next, let's examine the correlations among the target variables.

In [None]:
# Pairwise correlation between the label columns, drawn as an annotated heatmap.
target_data = data[cols_target]
colormap = plt.cm.plasma
label_corr = target_data.astype(float).corr()

plt.figure(figsize=(7, 7))
plt.title('Correlation of features & targets', y=1.05, size=14)
sns.heatmap(
    label_corr,
    annot=True,
    square=True,
    cmap=colormap,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

Indeed, some of the labels are highly correlated: insult-obscene has the highest correlation at 0.74, followed by toxic-obscene and toxic-insult.

### Now this kind of problem is ###

1) It is a multi-label (multi-class) problem, not a binary one

2) Also all classes are not independent but rather dependent or correlated 

3) A comment can belong to multiple classes at the same time for e.g. comment can be toxic and insulting at the same time

Let us simplify the problem by first classifying the comments as "block" vs "allow" 

In [None]:
# Number of labels set on each comment; named 'block' so the printed counts
# carry the same header as the column they become.
labels_per_comment = data[cols_target].sum(axis=1).rename('block')
print(labels_per_comment.value_counts())

# Collapse to a binary flag: 1 when at least one label is set, otherwise 0.
data['block'] = (labels_per_comment > 0).astype(int)
print(data['block'].value_counts())


In [None]:
# Count plot of the 'block' flag: number of comments in each class.
sns.set()
sns.countplot(x="block" , data = data )
plt.show()

In [None]:
# Event rate: percentage of comments whose 'block' flag is 1.

blocked_total = data['block'].sum()
event_rate_pct = round(100*blocked_total/data.shape[0], 2)
print("Percentage Event Rate : " , event_rate_pct , "%")

### Let us focus on comments  ###

In [None]:
# Store the character length of each comment; every value is converted with
# str() first, so non-string entries are measured by their string form.
data['char_length'] = data['comment_text'].map(str).map(len)

In [None]:
# Histogram of the comment character lengths stored in 'char_length'.
sns.set()
data['char_length'].hist()
plt.show()

Most comments are shorter than 500 characters, with some up to 5,000 characters long.



### Clean the Comments Text ###

In [None]:
def clean_text(text):
    """Normalise a raw comment into lowercase, single-space-separated words.

    Steps: lowercase, expand a handful of English contractions, replace every
    non-word character with a space, collapse whitespace runs and trim.

    Args:
        text: The raw comment. ``None`` and NaN (a missing comment) yield an
            empty string; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned comment as a ``str``.
    """
    # Missing values would otherwise crash on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()

    text = re.sub(r"what's", "what is ", text)
    # Must run before the generic "'s" rule below, which would otherwise eat
    # the leading "'s" and leave the non-word "cuse".
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    # "can't" is handled before the generic "n't" rule so it becomes "cannot"
    # rather than "ca not".
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Raw strings: "\W" and "\s" are invalid escapes in a normal string
    # literal and trigger a warning on recent Python versions.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(' ')

In [None]:
%%time 
# Overwrite each comment in `data` with its cleaned form. [Thanks to Pulkit Jha for the useful pointer.]
data['comment_text'] = data['comment_text'].apply(clean_text)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the comments for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['comment_text'],
    data['block'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Sanity check: sizes of the train/test inputs, then of the matching targets.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


In [None]:
# TF-IDF vectorizer restricted to the 10,000 most frequent terms, with
# English stop words removed.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vect = TfidfVectorizer(stop_words='english', max_features=10000)
print(vect)

In [None]:
%%time 
# Learn the vocabulary from the training comments only, then encode them as a
# document-term matrix (one row per comment, one column per vocabulary term).
X_train_dtm = vect.fit_transform(X_train)
# Display the matrix summary as the cell output.
X_train_dtm

In [None]:
# (number of training comments, vocabulary size)
X_train_dtm.shape

In [None]:
# Density of the training document-term matrix: the percentage of cells that
# hold a stored (non-zero) value. Computed from the matrix itself rather than
# from numbers copied out of an earlier output, so it stays correct when the
# data, the split or the vectorizer settings change.
100 * X_train_dtm.nnz / (X_train_dtm.shape[0] * X_train_dtm.shape[1])

In [None]:
%%time
# Encode the test comments with the vocabulary fitted on the training data
# (transform only -- the vectorizer is not refitted on test data).
X_test_dtm = vect.transform(X_test)
# Display the matrix summary as the cell output.
X_test_dtm

## Let us build a binary classifier using Logistic Regression ##

In [None]:
# Logistic-regression baseline on the TF-IDF features: fit, then report
# train/test accuracy and the test confusion matrix.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

logreg = LogisticRegression(C=1, max_iter=2000)
logreg.fit(X_train_dtm, y_train)

# Hard class predictions on the data the model was fitted on.
y_pred_train = logreg.predict(X_train_dtm)
train_accuracy = accuracy_score(y_train, y_pred_train)
print('Training accuracy is {}'.format(train_accuracy))

# Hard class predictions (labels, not probabilities) on the held-out split.
y_pred_test = logreg.predict(X_test_dtm)
test_accuracy = accuracy_score(y_test, y_pred_test)
print('Test accuracy is {}'.format(test_accuracy))
print(confusion_matrix(y_test, y_pred_test))


In [None]:
#28507 -> comments are good and predicted as good
#2014 -> comments are block and predicted as block
#164 -> comments are good but predicted as block
#1230 -> comments are block but predicted as good


In [None]:
from sklearn.metrics import confusion_matrix

# Accuracy derived from the confusion matrix: correct predictions (the
# diagonal) divided by all predictions. The matrix is recomputed from the
# logistic-regression model instead of pasting cell counts in by hand, so the
# value cannot drift from the model's actual output.
logreg_cm = confusion_matrix(y_test, logreg.predict(X_test_dtm))
np.trace(logreg_cm) / logreg_cm.sum()


In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# Score each test comment with the predicted probability of class 1, then
# sweep the decision threshold to get the ROC curve and its area.
probs = logreg.predict_proba(X_test_dtm)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# Model curve with the AUC in the legend, plus the chance diagonal.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')

# Both axes are rates, so clamp them to [0, 1].
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()



# Welcome to the curse of Accuracy, F1(help) to the rescue #

In [None]:
from sklearn.metrics import f1_score

# F1 score of the predictions currently stored in `y_pred_test`.
print("F1 score on Test data : " ,f1_score(y_test,y_pred_test))
    


### In case of Class Imbalance - we use F1 score as a general measure for the model performance ###

Depending on the Business case - we need to fine tune the model 

There is a Precision vs Recall Trade off 

If you want to capture all toxic tweets, then some of the good tweets will be misclassified as bad tweets.

In [None]:
# Re-label the test set with a decision threshold of 0.2 on the predicted
# probability of class 1 (predict() uses 0.5), then re-score.
block_proba = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_test = (block_proba >= 0.2).astype(int)

test_accuracy = accuracy_score(y_test, y_pred_test)
print('Test accuracy is {}'.format(test_accuracy))
print(confusion_matrix(y_test, y_pred_test))
test_f1 = f1_score(y_test, y_pred_test)
print("F1 score on Test data : " , test_f1)

# Let us use a tree base model #

In [None]:
%%time 

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

# Fixed seed (same value as the train/test split): the tree builder permutes
# features at each split, so without it the scores change from run to run.
dt_clf = DecisionTreeClassifier(random_state=42)
# Fit on the training document-term matrix.
dt_clf.fit(X_train_dtm, y_train)
# Training accuracy from hard class predictions.
y_pred_train = dt_clf.predict(X_train_dtm)
print('Training accuracy is {}'.format(accuracy_score(y_train, y_pred_train)))
# Hard class predictions (labels, not probabilities) on the held-out split.
y_pred_test = dt_clf.predict(X_test_dtm)
print('Test accuracy is {}'.format(accuracy_score(y_test,y_pred_test)))
print(confusion_matrix(y_test,y_pred_test))
print("F1 score on Test data : " ,f1_score(y_test,y_pred_test))

### Let us try an Ensemble of Trees ###

In [None]:
%%time 
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Fixed seed (same value as the train/test split): bootstrapping and feature
# sampling are random, so without it the scores change from run to run.
rf_clf = RandomForestClassifier(random_state=42)

# Fit on the training document-term matrix.
rf_clf.fit(X_train_dtm, y_train)
# Training accuracy from hard class predictions.
y_pred_train = rf_clf.predict(X_train_dtm)
print('Training accuracy is {}'.format(accuracy_score(y_train, y_pred_train)))
# Hard class predictions (labels, not probabilities) on the held-out split.
y_pred_test = rf_clf.predict(X_test_dtm)
print('Test accuracy is {}'.format(accuracy_score(y_test,y_pred_test)))
print(confusion_matrix(y_test,y_pred_test))
print("F1 score on Test data : " ,f1_score(y_test,y_pred_test))

In [None]:
# Fine-tune the random forest: label a comment as class 1 when its predicted
# probability reaches 0.05 (predict() uses 0.5), then re-score.

block_proba = rf_clf.predict_proba(X_test_dtm)[:, 1]
y_pred_test = (block_proba >= 0.05).astype(int)

test_accuracy = accuracy_score(y_test, y_pred_test)
print('Test accuracy is {}'.format(test_accuracy))
print(confusion_matrix(y_test, y_pred_test))
test_f1 = f1_score(y_test, y_pred_test)
print("F1 score on Test data : " , test_f1)

In [None]:
%%time
from sklearn.metrics import f1_score
from sklearn.linear_model  import PassiveAggressiveClassifier

# Fixed seed (same value as the train/test split): the training data is
# shuffled between epochs, so without it the scores change from run to run.
pa_clf = PassiveAggressiveClassifier(random_state=42)

# Fit on the training document-term matrix.
pa_clf.fit(X_train_dtm, y_train)
# Training accuracy from hard class predictions.
y_pred_train = pa_clf.predict(X_train_dtm)
print('Training accuracy is {}'.format(accuracy_score(y_train, y_pred_train)))
# Hard class predictions (labels, not probabilities) on the held-out split.
y_pred_test = pa_clf.predict(X_test_dtm)
print('Test accuracy is {}'.format(accuracy_score(y_test,y_pred_test)))
print(confusion_matrix(y_test,y_pred_test))
print("F1 score on Test data : " ,f1_score(y_test,y_pred_test))

### Passive Aggressive Classifier does not provide predicted probabilities, so its decision threshold can't be fine-tuned the same way ###

In [None]:
%%time 
from sklearn.metrics import f1_score
import xgboost

# Gradient-boosted trees with library defaults, fitted on the TF-IDF features.
xgb = xgboost.XGBClassifier()
xgb.fit(X_train_dtm, y_train)

# Hard class predictions on the data the model was fitted on.
y_pred_train = xgb.predict(X_train_dtm)
train_accuracy = accuracy_score(y_train, y_pred_train)
print('Training accuracy is {}'.format(train_accuracy))

# Hard class predictions (labels, not probabilities) on the held-out split.
y_pred_test = xgb.predict(X_test_dtm)
test_accuracy = accuracy_score(y_test, y_pred_test)
print('Test accuracy is {}'.format(test_accuracy))
print(confusion_matrix(y_test, y_pred_test))
test_f1 = f1_score(y_test, y_pred_test)
print("F1 score on Test data : " , test_f1)

In [None]:
# Fine-tune XGBoost: label a comment as class 1 when its predicted
# probability reaches 0.06 (predict() uses 0.5), then re-score.

block_proba = xgb.predict_proba(X_test_dtm)[:, 1]
y_pred_test = (block_proba >= 0.06).astype(int)

test_accuracy = accuracy_score(y_test, y_pred_test)
print('Test accuracy is {}'.format(test_accuracy))
print(confusion_matrix(y_test, y_pred_test))
test_f1 = f1_score(y_test, y_pred_test)
print("F1 score on Test data : " , test_f1)

### Advanced Models - LightGBM ###

In [None]:
import lightgbm

# Booster configuration for binary classification.
parameters = {
    # 'application' is only an alias of 'objective'; passing both makes
    # LightGBM warn about the duplicate, so just the canonical key is kept.
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

train_data = lightgbm.Dataset(X_train_dtm, label=y_train)
test_data = lightgbm.Dataset(X_test_dtm, label=y_test)

# NOTE(review): the test split doubles as the early-stopping validation set,
# so the test metrics reported afterwards are optimistic; consider carving a
# separate validation split out of the training data.
#
# Early stopping goes through the callback API: the `early_stopping_rounds`
# keyword was removed from lightgbm.train() in LightGBM 4.0, while the
# callback works on both older and newer releases.
clf = lightgbm.train(
    parameters,
    train_data,
    num_boost_round=500,
    valid_sets=[test_data],
    callbacks=[lightgbm.early_stopping(stopping_rounds=10)],
)






In [None]:
# Fine-tune LightGBM: the booster's predict() output is compared against a
# 0.35 cut-off to obtain 0/1 labels, which are then scored.

block_score = clf.predict(X_test_dtm)
y_pred_test = (block_score >= 0.35).astype(int)

test_accuracy = accuracy_score(y_test, y_pred_test)
print('Test accuracy is {}'.format(test_accuracy))
print(confusion_matrix(y_test, y_pred_test))
test_f1 = f1_score(y_test, y_pred_test)
print("F1 score on Test data : " , test_f1)

## Model Explanation ##

In [None]:
import eli5

eli5.show_weights(logreg,vec = vect, top = 15)  # logistic regression
# Shows the 15 top-weighted features (words) of the logistic-regression model;
# `vec = vect` maps feature indices back to vocabulary terms.
# NOTE(review): presumably positive weights push towards 'block' -- confirm
# against the rendered table.

In [None]:
eli5.show_weights(xgb,vec = vect,top = 15)  # XGBoost
# Shows the 15 top-ranked features (words) of the XGBoost model.
# NOTE(review): for tree models this is presumably an importance score with
# no direction (towards or away from 'block') -- confirm against the output.

## Tweets Explanation ##

In [None]:
# The held-out comment at position 718, explained in the next cell.
X_test.iloc[718]

In [None]:
# Explain the logistic-regression prediction for the comment at position 718;
# `vec = vect` lets eli5 vectorize the raw text passed as `doc`.
eli5.show_prediction(logreg, vec = vect, doc =  X_test.iloc[718]) 