In [None]:
# Kaggle input locations of the preprocessed toxic-comment CSV files.
train_path = '/kaggle/input/cleaned-toxic-comments/train_preprocessed.csv'
# NOTE(review): test_path is defined here but not read by any of the cells visible below.
test_path = '/kaggle/input/cleaned-toxic-comments/test_preprocessed.csv'

import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation notices from pandas / TensorFlow.
warnings.filterwarnings('ignore')

Labels: `toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, `identity_hate`

In [None]:
# Read the training split and discard the three columns that are not used
# in the rest of the notebook.
unused_columns = ['id', 'set', 'toxicity']
df = pd.read_csv(train_path).drop(columns=unused_columns)

print("df.shape =", df.shape)
df.head()

# Explore data

In [None]:
# Every column other than the comment text is treated as a target label.
labels = df.columns.tolist()
labels.remove('comment_text')

In [None]:
# Bar chart of the column sum of each label (the number of flagged comments,
# assuming the labels are 0/1), with the value written above each bar.
label_totals = df[labels].sum(axis=0)
ax = label_totals.plot(kind='bar', title='number in each label')
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(str(int(height)), (bar.get_x() * 1.01, height * 1.001))

# Downsample

In [None]:
# Flag every comment that carries at least one label.
# The label columns are selected by name instead of by position (`iloc[:, 1:]`):
# the positional slice silently relied on `comment_text` being the first column
# and, when this cell is re-run, would also sweep in the `categorized` column itself.
# `.ne(0)` is the vectorized equivalent of applying `bool` to each row sum.
df['categorized'] = df[labels].sum(axis=1).ne(0)
df['categorized'].value_counts()

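There are 143,346 uncategorized rows and 16,225 categorized rows. To reduce this imbalance we keep every categorized row but only a random 30% of the uncategorized rows (about 43,000), i.e. we drop roughly 100,000 uncategorized rows.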

In [None]:
# Keep every categorized row but only a random 30% of the uncategorized ones.
# A fixed seed makes the subsample (and everything trained on it) reproducible.
# NOTE: this cell overwrites `df`; re-running it without re-running the cells
# above would subsample the already reduced frame again.
SEED = 42
categorized_rows = df[df['categorized']]
uncategorized_rows = df[~df['categorized']].sample(frac=0.3, random_state=SEED)

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is its replacement.
# The combined frame is shuffled because `model.fit(..., validation_split=0.2)`
# further down holds out the *last* rows: without shuffling, that tail would
# consist only of uncategorized comments.
df = pd.concat([categorized_rows, uncategorized_rows]).sample(frac=1, random_state=SEED)
df['categorized'].value_counts()

# Baseline Model

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

# Stop-word list: NLTK's English list plus a few extra filler words.
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])

# One case-insensitive alternation over all stop words.
#  * re.escape keeps any regex metacharacter inside a word literal.
#  * Sorting (longest first, then alphabetically) makes the pattern deterministic,
#    since set iteration order varies between runs, and tries a longer word
#    before any shorter word that is its prefix.
#  * `(?:\W|$)` accepts end-of-string as a terminator; with a bare trailing `\W`
#    a stop word that was the last token of a comment was never removed.
_ordered_stop_words = sorted(stop_words, key=lambda w: (-len(w), w))
re_stop_words = re.compile(
    r"\b(?:" + "|".join(re.escape(w) for w in _ordered_stop_words) + r")(?:\W|$)",
    re.I,
)

def removeStopWords(sentence):
    """Return `sentence` with every stop word, together with the single
    non-word character that follows it (if any), replaced by one space."""
    return re_stop_words.sub(" ", sentence)

df['comment_text'] = df['comment_text'].apply(removeStopWords)

stemmer = SnowballStemmer("english")

def stemming(sentence):
    """Stem each whitespace-separated token of `sentence` and return the
    stems joined by single spaces, without leading or trailing whitespace."""
    stems = (stemmer.stem(token) for token in sentence.split())
    # The strip keeps the result free of edge whitespace even if a stem is empty.
    return " ".join(stems).strip()

df['comment_text'] = df['comment_text'].apply(stemming)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Upper bound on the vocabulary size learned by the text encoder.
MAX_VOCAB = 500

# Newer TensorFlow/Keras releases expose TextVectorization directly under
# `tf.keras.layers`; the `experimental.preprocessing` location is deprecated and
# missing from recent versions. Use whichever one this installation provides.
_TextVectorization = getattr(layers, "TextVectorization", None)
if _TextVectorization is None:
    _TextVectorization = layers.experimental.preprocessing.TextVectorization

encoder = _TextVectorization(
    max_tokens=MAX_VOCAB, standardize='lower_and_strip_punctuation'
)
# Raw inputs (comment strings) and the matching label matrix, one column per label.
sequences = df["comment_text"].values
targets = df[labels].values
# Learn the vocabulary from the comments.
encoder.adapt(sequences)

In [None]:
# Show the first 20 entries of the vocabulary learned by `encoder.adapt`.
print(encoder.get_vocabulary()[:20])

In [None]:
# Baseline network: raw text -> token ids -> 64-d embeddings -> LSTM ->
# dense hidden layer -> one sigmoid output per label column.
vocab_size = len(encoder.get_vocabulary())
n_labels = targets.shape[1]

model = tf.keras.Sequential()
model.add(encoder)
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True))
model.add(tf.keras.layers.LSTM(64))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(n_labels, activation='sigmoid'))

In [None]:
# Multi-label setup: each sigmoid output is an independent binary decision, so
# the loss is binary cross-entropy on probabilities (from_logits=False).
# The metrics are multi-label ones as well; categorical cross-entropy assumes a
# single-label probability distribution across the outputs and is not
# meaningful for independent sigmoid outputs.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.AUC(multi_label=True)])

In [None]:
# Train for 10 epochs in mini-batches of 32.
# NOTE: Keras carves `validation_split` off the *end* of the arrays as given,
# before any shuffling, so the row order of `sequences`/`targets` determines
# which comments end up in the validation set.
history = model.fit(sequences, targets, epochs=10,
                    batch_size=32,
                    validation_split=0.2)

In [None]:
# Predicted probabilities for the same `sequences` that were passed to `fit`
# (training and validation rows alike), not for a separate held-out test set.
pred = model.predict(sequences)

In [None]:
from sklearn.metrics import classification_report
# Probability cut-off above which a label counts as predicted.
THRESH = 0.5

# One classification report per label: walk the label names together with the
# matching column of the truth matrix and of the predicted probabilities.
for label, y_true, scores in zip(labels, targets.T, pred.T):
    y_pred = (scores > THRESH).astype(int)
    print(f"======={label}")
    print(classification_report(y_true, y_pred))

### How many labels are incorrectly classified per comment?

In [None]:
from scipy import stats
# Binarise all predicted probabilities at the shared threshold.
y_pred = pred > THRESH

# A (comment, label) cell is wrong when truth and prediction disagree (XOR);
# summing across the label axis counts the wrong labels of each comment.
mismatches = np.logical_xor(targets.astype(bool), y_pred)
difference = mismatches.sum(axis=1)
print("difference stats :\n\t",stats.describe(difference))

### Of all actual positives, <br> how many does the model correctly predict as positive? (recall)

In [None]:
# Recall over all (comment, label) cells at once, using the already
# thresholded boolean predictions in `y_pred`.
m = tf.keras.metrics.Recall()
m.update_state(targets, y_pred)
m.result().numpy()

### Of all predicted positives, <br>how many are actually positive? (precision)

In [None]:
# Precision over all (comment, label) cells at once, using the already
# thresholded boolean predictions in `y_pred`. Reuses the name `m`.
m = tf.keras.metrics.Precision()
m.update_state(targets, y_pred)
m.result().numpy()