In [1]:
import pandas as pd
import re
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

### Preprocessing

In [None]:
# Load the labelled training comments from the zipped competition CSV.
train_csv_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
train = pd.read_csv(train_csv_path)
train.shape

In [None]:
# Load the test comments from the zipped competition CSV.
test_csv_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
test = pd.read_csv(test_csv_path)
test.shape

In [37]:
# Labels for the test comments; merged with `test` later in the notebook.
test_labels_csv_path = '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'
test_labels = pd.read_csv(test_labels_csv_path)

In [None]:
# Fetch the NLTK resources used by clean_text (stopword list, tokenizer
# models, WordNet) and keep the English stopword list in `stop`.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)
stop = stopwords.words('english')

In [None]:
def clean_text(raw_text):
    """Normalize one raw comment into a space-joined string of lemmas.

    Steps, in order: lowercase, remove every character that is neither a
    word character nor whitespace, remove ASCII digits, replace newlines
    with spaces, tokenize with NLTK, drop English stopwords (module-level
    `stop`) and lemmatize the remaining tokens with WordNet.

    Args:
        raw_text: the comment text. Non-string values (for example NaN from
            a missing CSV field) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; '' when nothing is left.
    """
    if not isinstance(raw_text, str):
        return ''
    text = raw_text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'[0-9]+', '', text)  # remove numbers
    text = text.replace('\n', ' ')
    # `stop` is a list; a set makes each membership test O(1) instead of a
    # scan over the whole stopword list for every token.
    stop_words = set(stop)
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

In [None]:
# Clean every comment and store the result in column 'X'.
# `.apply` walks the column itself, so it does not depend on the frame
# having a default 0..n-1 index the way a positional `.at[i, ...]` loop does.
train['X'] = train['comment_text'].apply(clean_text)
test['X'] = test['comment_text'].apply(clean_text)

# Persist the cleaned frames so the slow preprocessing need not be repeated.
train.to_csv('train_processed.csv')
test.to_csv('test_processed.csv')

### Baseline model: Naive Bayes

In [2]:
# Reload the cached, already-cleaned frames (column 'X' holds the cleaned text).
processed_dir = '../input/toxic-comment-preprocessed-data'
train = pd.read_csv(processed_dir + '/train_processed.csv')
test = pd.read_csv(processed_dir + '/test_processed.csv')

In [3]:
# TF-IDF vectorization
# NOTE: astype('U') turns missing values (NaN) into the literal string 'nan'.
train_texts = train['X'].values.astype('U')
word_vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and builds the matrix in one pass over
# the corpus instead of tokenizing it twice (fit, then transform).
X = word_vectorizer.fit_transform(train_texts)

In [23]:
# Binary classification with Naive Bayes: is a comment toxic or not?
# Hold out 20% of the rows, stratified on the 'toxic' label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    train['toxic'],
    test_size=0.2,
    random_state=42,
    stratify=train['toxic'].values,
)

In [24]:
from imblearn.over_sampling import RandomOverSampler
# Balance the training split by randomly duplicating minority-class rows.
# A fixed random_state makes the resampled set (and every score derived from
# it) reproducible across kernel restarts.
ros = RandomOverSampler(random_state=42)
X_train_oversample, y_train_oversample = ros.fit_resample(X_train, y_train)

In [25]:
# Train the Naive Bayes baseline on the oversampled split and predict the
# held-out 20%.
nb = MultinomialNB().fit(X_train_oversample, y_train_oversample)
nb_prediction = nb.predict(X_test)

In [27]:
import sklearn.metrics as skm
# Per-class precision / recall / F1 for the binary baseline, as a nested dict.
skm.classification_report(
    y_test,
    nb_prediction,
    output_dict=True,
)

In [28]:
# confusion_matrix drawing function retrieved from https://gist.github.com/mesquita/f6beffcc2579c6f3a97c9d93e278a9f1
from matplotlib.ticker import PercentFormatter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def cm_analysis(y_true, y_pred, filename, labels, classes, ymap=None, figsize=(15,10)):
    """
    Draw a confusion matrix as an annotated heatmap.

    Every cell shows the row-normalised percentage and the raw count; diagonal
    cells additionally show the row total as "count/total".

    args:
      y_true:    true label of the data, with shape (nsamples,)
      y_pred:    prediction of the data, with shape (nsamples,)
      filename:  filename of figure file. Currently unused: the figure is only
                 drawn, it is not written to disk. The parameter is kept so
                 existing calls keep working.
      labels:    string array, name the order of class labels in the confusion matrix.
                 use `clf.classes_` if using scikit-learn models.
                 with shape (nclass,).
      classes:   aliases for the labels. String array to be shown in the cm plot.
      ymap:      dict: any -> string, length == nclass.
                 if not None, map the labels & ys to more understandable strings.
                 Caution: original y_true, y_pred and labels must align.
      figsize:   the size of the figure plotted.

    Side effect: calls `sns.set(font_scale=2.8)`, which also affects plots
    drawn after this function returns.
    """
    sns.set(font_scale=2.8)

    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    # A label that never occurs in y_true has a zero row total; divide by 1
    # there so the row reads 0% instead of NaN.
    safe_sum = np.where(cm_sum == 0, 1, cm_sum)
    cm_perc = cm / safe_sum.astype(float) * 100
    # Object dtype stores whole strings; converting the integer matrix with
    # astype(str) gives a fixed-width unicode array that truncates long
    # annotations.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # Index down to a scalar: formatting a 1-element array with
                # %d is deprecated in recent NumPy releases.
                s = cm_sum[i, 0]
                annot[i, j] = '%.2f%%\n%d/%d' % (p, c, s)
            else:
                annot[i, j] = '%.2f%%\n%d' % (p, c)
    # The heatmap colours are the same row percentages used in the
    # annotations, so the matrix does not need to be computed a second time.
    cm = pd.DataFrame(cm_perc, index=labels, columns=labels)
    cm.index.name = 'True Label'
    cm.columns.name = 'Predicted Label'
    fig, ax = plt.subplots(figsize=figsize)
    plt.yticks(va='center')

    sns.heatmap(cm, annot=annot, fmt='', ax=ax, xticklabels=classes, cbar=True, cbar_kws={'format':PercentFormatter()}, yticklabels=classes, cmap="Blues")

In [29]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
# Raw confusion matrix plus the annotated heatmap for the held-out split.
nb_matrix = confusion_matrix(y_test, nb_prediction)
cm_analysis(
    y_true=y_test,
    y_pred=nb_prediction,
    filename='Naive_Bayes_2',
    labels=[0,1],
    classes=['non-toxic','toxic'],
)

In [32]:
### cross validation oversampling
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.model_selection import cross_validate
# The sampler sits inside the pipeline so only the training folds are
# resampled. A fixed random_state keeps the scores reproducible.
oversample_pipe = make_imb_pipeline(RandomOverSampler(random_state=42), MultinomialNB())
scores = cross_validate(oversample_pipe,
                        X_train, y_train, cv=10,
                        scoring=('roc_auc', 'average_precision'))
scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()

In [34]:
### cross validation undersampling
from imblearn.under_sampling import RandomUnderSampler
# Same protocol as the oversampling run, but the majority class is randomly
# down-sampled instead. A fixed random_state keeps the scores reproducible.
undersample_pipe = make_imb_pipeline(RandomUnderSampler(random_state=42), MultinomialNB())
scores = cross_validate(undersample_pipe,
                        X_train, y_train, cv=10,
                        scoring=('roc_auc', 'average_precision'))
scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()

Undersampling performs slightly better than oversampling in cross-validation.

In [38]:
# Attach the labels to the test comments (joined on their shared columns),
# then keep only the rows whose 'toxic' label is not the -1 placeholder.
test_all = test.merge(test_labels)
has_label = test_all['toxic'].ne(-1)
test_score = test_all.loc[has_label]

In [46]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

# Cross-validation favoured undersampling, so the final baseline is trained on
# an undersampled split. `ros` is the RandomOverSampler and must not be used
# here; the seed keeps the resampled set reproducible.
rus = RandomUnderSampler(random_state=42)
X_train_undersample, y_train_undersample = rus.fit_resample(X_train, y_train)
nb = MultinomialNB()
nb.fit(X_train_undersample,y_train_undersample)

# Features and predictions are kept for every test row so later cells that
# pair `test_X` with `test_all` still line up.
test_X = word_vectorizer.transform(test_all['X'].values.astype('U'))
nb_prediction = nb.predict(test_X)

# -1 marks test rows without a real label (the notebook filters them out when
# building `test_score`); they must not be counted as true labels.
scored_mask = (test_all['toxic'] != -1).to_numpy()
y_true_scored = test_all.loc[scored_mask, 'toxic']
nb_prediction_scored = nb_prediction[scored_mask]
nb_matrix = confusion_matrix(y_true_scored, nb_prediction_scored)
cm_analysis(y_true_scored, nb_prediction_scored, 'Naive_Bayes test_data', [0,1], ['non-toxic','toxic'])

In [66]:
# ROC curve of the Naive Bayes baseline on the labelled test rows.
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

# -1 marks test rows without a real label; with pos_label=1 they would be
# silently scored as negatives, so drop them before computing the curve.
scored_mask = (test_all['toxic'] != -1).to_numpy()
y_true_scored = test_all.loc[scored_mask, 'toxic']
y_pred_proba = nb.predict_proba(test_X)[:, 1][scored_mask]
fpr, tpr, _ = metrics.roc_curve(y_true_scored, y_pred_proba, pos_label=1)
roc_auc = auc(fpr, tpr)

# The bare `figure(...)` call was removed: the name is never imported (it
# raises NameError) and the figure is created right here anyway.
plt.figure(figsize=(10,10))
lw = 2
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % roc_auc,
)
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curve: Naive Bayes on test data")
plt.legend(loc="lower right")
plt.show()

In [73]:
# Internal train/validation split for the multi-label task: all six label
# columns are the target, stratified on 'toxic' only.
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    train[categories],
    test_size=0.2,
    random_state=42,
    stratify=train['toxic'],
)

In [None]:
# Multi-label baseline: BinaryRelevance wrapped around a MultinomialNB
# base classifier.
nb_clf = BinaryRelevance(classifier=MultinomialNB())
nb_clf.fit(X_train, y_train)
nb_prediction = nb_clf.predict(X_test)

In [None]:
import sklearn.metrics as skm
# Build the report once and reuse it; the extra call whose result was thrown
# away only repeated the computation.
nb_report_dict = skm.classification_report(y_test, nb_prediction, output_dict=True)
nb_report = pd.DataFrame(nb_report_dict).transpose()
nb_report.to_csv('nb_report.csv')

### MLP-Keras

In [None]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
from ast import literal_eval

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Reload the cached, already-cleaned frames for the MLP experiments.
processed_dir = '../input/toxic-comment-preprocessed-data'
train = pd.read_csv(processed_dir + '/train_processed.csv')
test = pd.read_csv(processed_dir + '/test_processed.csv')

In [None]:
# Drop every training row that has a missing value in any column.
train = train.dropna(axis=0, how='any')

In [None]:
# Keep only rows whose cleaned text is an actual Python string.
is_text = train['X'].map(lambda value: isinstance(value, str))
train = train[is_text]

In [None]:
# Internal train/validation split for the MLP: cleaned text as input, the six
# label columns as a NumPy array target, stratified on 'toxic'.
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_matrix = np.array(train[categories])
X_train, X_test, y_train, y_test = train_test_split(
    train['X'],
    label_matrix,
    test_size=0.2,
    random_state=42,
    stratify=train['toxic'].values,
)


In [None]:
# Dataset configuration shared by the cells below.
# NOTE(review): `max_seqlen` and `padding_token` are not referenced anywhere
# in the visible notebook; "75%" presumably refers to a length percentile --
# TODO confirm or remove.
max_seqlen = 40 # 75%
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE


def make_dataset(dataframe, is_train=True):
    """Build a batched tf.data.Dataset of (text, labels) pairs.

    Args:
        dataframe: frame with a text column "X" and a label column "Y".
            Assumes every "Y" entry is an equal-length label vector so the
            ragged tensor converts to a regular array -- TODO confirm.
        is_train: when True, shuffle with a buffer of `batch_size * 10`
            elements before batching.

    Returns:
        The dataset batched into groups of `batch_size`.
    """
    labels = tf.ragged.constant(dataframe["Y"].values)
    # Leftover from an earlier label-lookup step; `lookup` is not defined here.
    #label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices(
        (dataframe["X"].values, labels.numpy())
    )
    dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
    return dataset.batch(batch_size)

In [None]:
# Wrap the text splits in single-column frames (column 'X').
train_df = pd.DataFrame(dict(X=X_train))
test_df = pd.DataFrame(dict(X=X_test))

In [None]:
# Turn each label matrix into a list of per-row arrays so that one array can
# be stored per DataFrame cell.
y_train = list(map(np.array, y_train))
y_test = list(map(np.array, y_test))

In [None]:
# Add the per-row label vectors as column 'Y'.
train_df = train_df.assign(Y=y_train)
test_df = test_df.assign(Y=y_test)

In [None]:
# Batched tf.data pipelines; only the training pipeline is shuffled.
train_dataset = make_dataset(dataframe=train_df, is_train=True)
test_dataset = make_dataset(dataframe=test_df, is_train=False)

In [None]:
# Peek at the first five examples of one training batch.
# `text_batch` / `label_batch` were never defined in the notebook; take one
# batch from the training pipeline so the cell runs on a fresh kernel.
text_batch, label_batch = next(iter(train_dataset))

for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()[None, ...]
    print(f"Abstract: {text}")
    print(label[0])

In [None]:
# Source: https://stackoverflow.com/a/18937309/7636462
# Count the distinct whitespace-separated, lower-cased tokens in the training text.
vocabulary = set()
for row_tokens in train_df["X"].str.lower().str.split():
    vocabulary.update(row_tokens)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

In [None]:
# TF-IDF encoder over n-grams (ngrams=2), capped at `vocabulary_size` tokens.
text_vectorizer = layers.TextVectorization(
    output_mode="tf_idf", ngrams=2, max_tokens=vocabulary_size
)


def _text_only(text, label):
    """Drop the label so the layer can be adapted on raw text."""
    return text


def _vectorize(text, label):
    """Replace the raw text by its TF-IDF vector and keep the label."""
    return text_vectorizer(text), label


# The layer has to learn its vocabulary from the training text before it can
# be applied; adaptation is pinned to the CPU.
with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(_text_only))

train_dataset = train_dataset.map(_vectorize, num_parallel_calls=auto).prefetch(auto)
test_dataset = test_dataset.map(_vectorize, num_parallel_calls=auto).prefetch(auto)

In [None]:
def make_model():
    """Return an uncompiled MLP: Dense(512) -> Dense(256) -> Dense(6).

    The hidden layers use ReLU. The six output units use a sigmoid, so each
    unit yields its own value in [0, 1] rather than a distribution over units.
    """
    shallow_mlp_model = keras.Sequential()
    shallow_mlp_model.add(layers.Dense(512, activation="relu"))
    shallow_mlp_model.add(layers.Dense(256, activation="relu"))
    shallow_mlp_model.add(layers.Dense(6, activation="sigmoid"))
    return shallow_mlp_model

In [None]:
epochs = 20

shallow_mlp_model = make_model()
# NOTE(review): "categorical_accuracy" compares the arg-max of prediction and
# target, which says little for multi-label targets. It is kept because later
# cells read this metric by name; consider "binary_accuracy" or AUC.
shallow_mlp_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["categorical_accuracy"],
)

In [None]:
# Train for `epochs` epochs, validating on the internal hold-out split.
history = shallow_mlp_model.fit(
    x=train_dataset,
    epochs=epochs,
    validation_data=test_dataset,
)

In [None]:
def plot_result(item):
    """Plot the training and validation curves of one metric from `history`.

    Args:
        item: key in `history.history` (e.g. "loss"); its "val_"-prefixed
            counterpart is drawn on the same axes.
    """
    for series_name in (item, "val_" + item):
        plt.plot(history.history[series_name], label=series_name)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


for metric_name in ("loss", "categorical_accuracy"):
    plot_result(metric_name)

In [None]:
# evaluate() returns [loss, metric]; only the metric is reported.
holdout_loss, categorical_acc = shallow_mlp_model.evaluate(x=test_dataset)
print(f"Categorical accuracy on the test set: {round(categorical_acc * 100, 2)}%.")

In [None]:
# Import from `tensorflow.keras`, the same Keras build the model above was
# created with. `keras.utils.vis_utils` no longer exists in recent Keras
# releases; `plot_model` is exposed under `tensorflow.keras.utils`.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import plot_model

In [None]:
# Render the network architecture to a PNG, showing layer shapes but not names.
plot_model(
    shallow_mlp_model,
    to_file='shallow_mlp_model.png',
    show_shapes=True,
    show_layer_names=False,
)

In [None]:
# Attach the labels to the test comments (joined on their shared columns),
# then keep only the rows whose 'toxic' label is not the -1 placeholder.
test_all = test.merge(test_labels)
has_label = test_all['toxic'].ne(-1)
test_score = test_all.loc[has_label]

#### Performance on test data

In [None]:
# Evaluate the trained MLP on every labelled test row.
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# NOTE(review): the two halves of this split are concatenated again right
# below, so the split only permutes the rows; it is kept so the variables it
# defines stay available to any later cells.
X_train, X_test, y_train, y_test = train_test_split(test_score['X'], np.array(test_score[categories]), test_size=0.2, random_state=42,
                                                   stratify=test_score['toxic'].values)
# Series.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the same row order and index.
X = pd.concat([X_train, X_test])
y = np.concatenate((y_train,y_test))
df = pd.DataFrame({'X': X})
y_ = [np.array(y1) for y1 in y]
df['Y'] = y_
# Rows whose cleaned text is missing (not a string) cannot be vectorized.
df = df[df['X'].apply(lambda x: isinstance(x, str))]
dataset = make_dataset(df, is_train=False)
dataset = dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
_, categorical_acc = shallow_mlp_model.evaluate(dataset)
print(f"Categorical accuracy on the test set: {round(categorical_acc * 100, 2)}%.")