In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.stem.wordnet import WordNetLemmatizer 
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
import os

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Toxic Comment Classification Challenge
**Identify and classify toxic online comments**

The data is a collection of comment texts, each labelled against six classes.
The competition consists in predicting negative online behaviours, such as toxic comments (i.e. comments that are rude, disrespectful or otherwise likely to make someone leave a discussion). So the goal is to build a classification model that achieves the highest possible accuracy.

## Analyse the data

In [None]:
# Read the labelled training comments into a DataFrame and preview the first rows
train_csv = '../input/train.csv'
data = pd.read_csv(train_csv)
data.head()

In [None]:
# Report how many rows (comments) were loaded
n_messages = len(data)
print("There is {} messages.".format(n_messages))

**Comment classes**

Let's look at the different classes and how many comments fall into each. It is clear that the data is imbalanced across classes. When we encounter such problems, we are bound to have difficulties solving them with standard algorithms. Conventional algorithms are often biased towards the majority class, not taking the data distribution into consideration. In the worst case, minority classes are treated as outliers and ignored. For some cases, such as fraud detection or cancer prediction, we would need to carefully configure our model or artificially balance the dataset, for example by undersampling or oversampling each class.

However, in our case of learning from imbalanced data, the majority classes may be of great interest to us. It is desirable to have a classifier that gives high prediction accuracy over the majority class, while maintaining reasonable accuracy for the minority classes. Therefore, we will leave the data as it is.

In [None]:
classes = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Count the positively-labelled comments once per class; the same number feeds
# the printed table and is kept in `occurence` for later cells.
# A boolean mask yields 0 for a class without any positive example, where
# `value_counts()[1]` would raise a KeyError.
occurence = [int((data[clas] == 1).sum()) for clas in classes]

print("\n{:^15} | {:^15} | {:^5}".format("Class", "Occurrence", "%"))
print("*"*42)
for clas, count in zip(classes, occurence):
    # Columns: class name, absolute count, share of all comments in percent
    print("{:15} | {:>15} | {:^5.2f}".format(clas, count, count*100/len(data)))

In [None]:
# Bar chart of the positive-label count for every class
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(classes, occurence)
ax.set_title("Number of comments per category")
ax.set_ylabel('# of Occurrences', fontsize=12)
ax.set_xlabel('category', fontsize=12)

**Remarks**

9.58% of the messages are considered toxic and 1% are considered severe toxic. 

But a message can belong to more than one class so let's take a look:

In [None]:
# 'all' = number of class labels attached to each comment
data['all'] = data[classes].sum(axis=1)
# 'any' = 1 when the comment carries at least one label, else 0.
# A vectorised comparison replaces the row-by-row Python lambda.
data['any'] = (data['all'] > 0).astype(int)
data.head()

In [None]:
# How many comments carry 0, 1, 2, ... labels (index = number of labels)
in_classes = data['all'].value_counts()
print("\n{:^10} | {:^10} | {:^6}".format("# Classes", "# Comment", "%"))
print("*"*33)
# Walk every possible label count, derived from `classes` instead of a
# hard-coded 7. `.get(idx, 0)` prints a zero row rather than raising KeyError
# when no comment has that many labels.
for idx in range(len(classes) + 1):
    count = in_classes.get(idx, 0)
    print("{:10} | {:>10} | {:>6.2f}".format(idx, count, count*100/len(data)))
print("*"*33)
print("{:^10} | {:>10} | {:>6}".format("", len(data), "100.00"))

**Remarks**

3.99% of messages belong only to one class and 2.18% belong to two classes. 

There are 31 messages that belong to all classes.

In [None]:
# value_counts() orders its result by frequency, not by number of labels, so
# `.values` alone would place the bars in frequency-rank order. Sorting by
# the index and keeping it makes each bar line up with its real label count.
df = pd.DataFrame(in_classes.sort_index())

In [None]:
# Bar chart of the totals held in `df`, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 6))
df.plot.bar(stacked=True, legend=False, ax=ax)
ax.set_xlabel('# of classes', fontsize=12)
ax.set_ylabel('# of Occurrences', fontsize=12)
ax.set_title("# of messages per # of classes associated")

**Examples of toxic message**

In [None]:
# toxic: second flagged row, positional column 1 (presumably the comment text)
toxic_rows = data.loc[data['toxic'] == 1]
toxic_rows.iloc[1, 1]

In [None]:
# severe_toxic: third flagged row, positional column 1 (presumably the comment text)
severe_toxic_rows = data.loc[data['severe_toxic'] == 1]
severe_toxic_rows.iloc[2, 1]

In [None]:
# obscene: fourth flagged row, positional column 1 (presumably the comment text)
obscene_rows = data.loc[data['obscene'] == 1]
obscene_rows.iloc[3, 1]

In [None]:
# threat: fifth flagged row, positional column 1 (presumably the comment text)
threat_rows = data.loc[data['threat'] == 1]
threat_rows.iloc[4, 1]

In [None]:
# insult: sixth flagged row, positional column 1 (presumably the comment text)
insult_rows = data.loc[data['insult'] == 1]
insult_rows.iloc[5, 1]

In [None]:
# identity_hate: seventh flagged row, positional column 1 (presumably the comment text)
identity_hate_rows = data.loc[data['identity_hate'] == 1]
identity_hate_rows.iloc[6, 1]

**Comment text behavior**

Let's look at the length of the comment text

In [None]:
# Length of each comment, computed element-wise with the pandas .str accessor
lens = data['comment_text'].str.len()
lens.head()

In [None]:
# Statistics: smallest, largest and median comment length
length_stats = (
    ('Minimum : ', lens.min()),
    ('Maximum : ', lens.max()),
    ('Median : ', lens.median()),
)
for label, value in length_stats:
    print(label, value)

In [None]:
# Horizontal boxplot of the comment lengths; outliers are drawn as green diamonds
plt.figure(figsize=(15,4))
plt.boxplot(lens, notch=False, sym='gD', vert=False, showmeans=True)
# Comment length varies a lot and there are many outliers.

## Natural Language Processing (NLP)

In [None]:
# Hold out 20% of the comments for evaluation. Stratifying on "all" (the number
# of labels per comment) keeps the class imbalance similar in both splits.
datatrain, datatest = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data["all"],
)

### Text Preprocessing
#### Cleaning data (Noise Removal)
Any piece of text which is not relevant to the context of the data and the end-output can be specified as the noise.

For example – language stopwords (commonly used words of a language – is, am, the, of, in etc), URLs or links, social media entities (mentions, hashtags), punctuations and industry specific words. This step deals with removal of all types of noisy entities present in the text.

A general approach for noise removal is to prepare a dictionary of noisy entities, and iterate the text object by tokens (or by words), eliminating those tokens which are present in the noise dictionary.

In [None]:
# Noise tokens to drop: English stopwords, punctuation marks and a few
# contraction/quote fragments such as "'s" or "n't".
# Stored as a set so the membership test done for every token of every
# comment is a constant-time lookup instead of a scan through a list.
useless_words = set(
    nltk.corpus.stopwords.words("english")
    + list(string.punctuation)
    + ["'m", "'s", "''", "``", "n't", "ca"]
)

#### Lexicon Normalization
* Stemming:  Stemming is a rudimentary rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word.
* Lemmatization: Lemmatization, on the other hand, is an organized & step by step procedure of obtaining the root form of the word, it makes use of vocabulary (dictionary importance of words) and morphological analysis (word structure and grammar relations).

In [None]:
lem = WordNetLemmatizer()
def clean_data(txt):
    """Normalise one comment and return it as a space-separated string.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``; tokens
    found in the module-level ``useless_words`` collection are discarded and
    each remaining token is lemmatised as a verb with the module-level ``lem``.

    Parameters
    ----------
    txt : str
        Raw comment text.

    Returns
    -------
    str
        The kept, lemmatised tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(txt.lower())
    lemmas = []
    for token in tokens:
        if token in useless_words:
            continue  # noise token: skip it
        lemmas.append(lem.lemmatize(token, "v"))
    return ' '.join(lemmas)

In [None]:
# Clean the comment text of both splits.
# `assign` builds new frames instead of writing a column into the frames
# returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning. `clean_data` is passed directly rather than wrapped
# in a pass-through lambda.
datatest = datatest.assign(comment_text=datatest['comment_text'].apply(clean_data))
datatrain = datatrain.assign(comment_text=datatrain['comment_text'].apply(clean_data))

### Text to Features (Feature Engineering on text data)
To analyse preprocessed data, it needs to be converted into features. Depending upon the usage, text features can be constructed using assorted techniques – Syntactical Parsing, Entities / N-grams / word-based features, Statistical features, and word embeddings.
#### Term Frequency – Inverse Document Frequency (TF – IDF)
TF-IDF is a weighted model commonly used for information retrieval problems. It aims to convert the text documents into vector models on the basis of the occurrence of words in the documents, without considering their exact ordering. For example, let's say there is a dataset of N text documents; in any document “D”, TF and IDF will be defined as –

Term Frequency (TF) – TF for a term “t” is defined as the count of a term “t” in a document “D”

Inverse Document Frequency (IDF) – IDF for a term “t” is defined as the logarithm of the ratio of the total number of documents in the corpus to the number of documents containing the term “t”.

In [None]:
def ROC_curve_plot(datatest, prediction, classes, figure_title, scores=None):
    """Print a confusion matrix per class and plot per-class and pooled ROC curves.

    Parameters
    ----------
    datatest : DataFrame
        ``datatest[clas]`` must give the true binary labels of class ``clas``.
    prediction : 2-D array
        Hard predictions; column ``i`` belongs to ``classes[i]``. Always used
        for the confusion matrices.
    classes : sequence of str
        Class names, in the column order of ``prediction``.
    figure_title : str
        Title of the ROC figure.
    scores : 2-D array, optional
        Continuous scores (e.g. probabilities or decision values) laid out
        like ``prediction``. When given, they are used for the ROC curves so
        that the curves are traced over many thresholds. When omitted, the
        hard ``prediction`` values are used, exactly as before.

    Returns
    -------
    None
        Results are printed and drawn with matplotlib.
    """
    classes = list(classes)  # allows tuples etc. in the `["all"] + classes` below
    # Values handed to roc_curve: scores when supplied, hard labels otherwise.
    ranking = prediction if scores is None else scores

    fpr, tpr, roc_auc = {}, {}, {}
    # Per-class pieces, pooled afterwards for the "all" curve.
    true_parts, rank_parts = [], []

    for idx, clas in enumerate(classes):
        print('... Processing {}'.format(clas))
        print('Cofusion Matrix:\n', confusion_matrix(datatest[clas], prediction[:,idx]))
        fpr[clas], tpr[clas], _ = roc_curve(datatest[clas], ranking[:,idx])
        roc_auc[clas] = auc(fpr[clas], tpr[clas])

        true_parts.append(datatest[clas].values)
        rank_parts.append(ranking[:,idx])

    # Pooled curve: every (comment, class) pair is treated as one binary decision.
    fpr["all"], tpr["all"], _ = roc_curve(np.concatenate(true_parts),
                                          np.concatenate(rank_parts))
    roc_auc["all"] = auc(fpr["all"], tpr["all"])

    plt.figure(figsize=(10,10))
    for i in ["all"] + classes:
        plt.plot(fpr[i], tpr[i], label='{0} (area = {1:0.2f})'.format(i, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate' , fontsize=12)
    plt.title(figure_title,           fontsize=12)
    plt.legend(loc="lower right",     fontsize=12)
    plt.show()

### Naive Bayes

In [None]:
# TF-IDF features feeding one binary Multinomial Naive Bayes model per class
nb_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
]
NB_pipeline = Pipeline(nb_steps)

NB_pipeline.fit(datatrain['comment_text'], datatrain[classes])
# NOTE: predict() yields hard labels, so the ROC curves below are built from
# 0/1 values rather than from continuous scores.
prediction = NB_pipeline.predict(datatest['comment_text'])

ROC_curve_plot(datatest, prediction, classes, 'ROC curve : Naive Bayes Classifier')

### LinearSVC

In [None]:
# TF-IDF features feeding one binary linear SVM per class.
# random_state is pinned so that repeated runs of the notebook give the same
# model (the LinearSVC solver shuffles the data internally).
SVC_pipeline = Pipeline([
                         ('tfidf', TfidfVectorizer()),
                         ('clf', OneVsRestClassifier(LinearSVC(random_state=42), n_jobs=1)),
                        ])

SVC_pipeline.fit(datatrain['comment_text'], datatrain[classes])
# NOTE: predict() yields hard labels, so the ROC curves below are built from
# 0/1 values rather than from continuous scores.
prediction = SVC_pipeline.predict(datatest['comment_text'])

ROC_curve_plot(datatest, prediction, classes, 'ROC curve : Linear SVC Classifier')

### Logistic Regression

In [None]:
# TF-IDF features feeding one binary logistic regression per class.
# The 'sag' solver shuffles the data, so random_state is pinned to make
# repeated runs of the notebook reproducible.
LogReg_pipeline = Pipeline([
                            ('tfidf', TfidfVectorizer()),
                            ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=42), n_jobs=1)),
                           ])

LogReg_pipeline.fit(datatrain['comment_text'], datatrain[classes])
# NOTE: predict() yields hard labels, so the ROC curves below are built from
# 0/1 values rather than from continuous scores.
prediction = LogReg_pipeline.predict(datatest['comment_text'])

ROC_curve_plot(datatest, prediction, classes, 'ROC curve : Logistic Regression Classifier')

### Random Forest

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# RandomForest_pipeline = Pipeline([
#                             ('tfidf', TfidfVectorizer()),
#                             ('clf', OneVsRestClassifier(RandomForestClassifier(), n_jobs=1)),
#                            ])

# RandomForest_pipeline.fit(datatrain['comment_text'], datatrain[classes])
# prediction = RandomForest_pipeline.predict(datatest['comment_text'])

# ROC_curve_plot(datatest, prediction, classes, 'ROC curve : Random Forest Classifier')

### XGBoost

In [None]:
from xgboost import XGBClassifier

# TF-IDF features feeding one binary gradient-boosted tree model per class
xgb_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(XGBClassifier(), n_jobs=1)),
]
XGBoost_pipeline = Pipeline(xgb_steps)

XGBoost_pipeline.fit(datatrain['comment_text'], datatrain[classes])
# NOTE: predict() yields hard labels, so the ROC curves below are built from
# 0/1 values rather than from continuous scores.
prediction = XGBoost_pipeline.predict(datatest['comment_text'])

ROC_curve_plot(datatest, prediction, classes, 'ROC curve : XGBoost Classifier')

In [None]:
### Decision tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# TF-IDF features feeding one binary decision tree per class.
# Tree construction permutes features randomly, so random_state is pinned to
# make repeated runs of the notebook reproducible.
DecisionTree_pipeline = Pipeline([
                            ('tfidf', TfidfVectorizer()),
                            ('clf', OneVsRestClassifier(DecisionTreeClassifier(random_state=42))),
                           ])

DecisionTree_pipeline.fit(datatrain['comment_text'], datatrain[classes])
# NOTE: predict() yields hard labels, so the ROC curves below are built from
# 0/1 values rather than from continuous scores.
prediction = DecisionTree_pipeline.predict(datatest['comment_text'])

ROC_curve_plot(datatest, prediction, classes, 'ROC curve : Decision Tree Classifier')

In [None]:
### Multi-layer Perceptron

In [None]:
# from sklearn.neural_network import MLPClassifier
# MLPClassifier_pipeline = Pipeline([
#                             ('tfidf', TfidfVectorizer()),
#                             ('clf', OneVsRestClassifier(MLPClassifier())),
#                            ])

# MLPClassifier_pipeline.fit(datatrain['comment_text'], datatrain[classes])
# prediction = MLPClassifier_pipeline.predict(datatest['comment_text'])

# ROC_curve_plot(datatest, prediction, classes, 'ROC curve : Multi-layer Perceptron Classifier')

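**Area Under the ROC Curve Table**

*Note: every classifier in this notebook is scored from the hard 0/1 output of `predict`, so each AUC below reflects a single operating point rather than a full ranking of scores.*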

| Classifier | ALL   |Toxic  | Severe Toxic   |Obscene  | Threat   |Insult  | Identity Hate   |
|------|------|------|------|------|------|------|------|
|   Naive Bayes | 0.55|   0.58  | 0.50|   0.55  | 0.50|   0.52  | 0.50|
|   LinearSVC  | 0.81|   0.84  | 0.64|   0.86  | 0.57|   0.78  | 0.62|
|   Logistic Regression  | 0.77|   0.80  | 0.62|   0.82  | 0.53|   0.75  | 0.58|
|   Random Forest  | 0.72|   0.74  | 0.53|   0.77  | 0.51|   0.70  | 0.53|
|   XGBoost  | 0.71|   0.71  | 0.55|   0.78  | 0.55|   0.70  | 0.57|
|   Decision tree classifier  | 0.82|   0.83  | 0.61|   0.88  | 0.61|   0.79  | 0.67|
|   Multi-layer Perceptron  | 0.81|   0.84  | 0.64|   0.86  | 0.61|   0.78  | 0.66|