# Imports

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import callbacks, models, layers
import matplotlib.pyplot as plt

# tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vocabulary cap; in the visible notebook it is only referenced by the
# commented-out Keras Tokenizer cell.
MAX_WORDS = 25000

# Fetch the NLTK stop-word corpus, then import its reader.
nltk.download("stopwords")
from nltk.corpus import stopwords

# Create train data

The original competition was a multi-label task: each comment carries six toxicity labels.

We collapse it into a binary toxic / non-toxic classification.

In [None]:
# Load the training set and collapse the six label columns into one binary
# target: y is 1 when the row sum of the label columns is positive, else 0.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
df = (
    df.assign(
        y=df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
        .sum(axis=1)
        .gt(0)
        .astype(int)
    )
    .rename(columns={'comment_text': 'text'})
    .loc[:, ['text', 'y']]
)
# Peek at five random rows.
df.sample(5)

# Undersample

The dataset is very imbalanced. Here we undersample the majority class; other strategies might work better.

In [None]:
# Balance the classes: keep every y == 1 row and draw an equally sized,
# fixed-seed random sample from the y == 0 rows.
min_len = df['y'].eq(1).sum()
df_y0_undersample = df.loc[df['y'].eq(0)].sample(n=min_len, random_state=201)
df = pd.concat([df.loc[df['y'].eq(1)], df_y0_undersample])
# Both classes should now report the same count.
df['y'].value_counts()

# transform the data

In [None]:
# stop_words = stopwords.words("english")
# # lemmatizer = nltk.stem.WordNetLemmatizer()

# # def lemmatize_text(text):
# #     return [lemmatizer.lemmatize(w) for w in text]

# def clean(comment):
#     clean_html = BeautifulSoup(comment).get_text()
#     clean_non_letters = re.sub("[^a-zA-Z]", " ", clean_html)
#     cleaned_lowercase = clean_non_letters.lower()
#     words = cleaned_lowercase.split()
#     cleaned_words = [w for w in words if w not in stop_words]
#     return " ".join(cleaned_words)

# df['cleaned'] = df['text'].apply(clean)
# df

In [None]:
# tokenizer = Tokenizer(num_words=MAX_WORDS)
# tokenizer.fit_on_texts(df.cleaned)
# total_words = len(tokenizer.word_index) + 1
# total_words

In [None]:
# sequences = tokenizer.texts_to_sequences(df.cleaned)
# max_sequence_len = max([len(x) for x in sequences])
# padded_sequences = np.array(pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))
# labels = np.array(df.y)
# X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=0)
# print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

In [None]:
# Fit TF-IDF on the raw comment text: X is the resulting document-term
# matrix, y the binary target column.
vec = TfidfVectorizer()
X = vec.fit_transform(df['text'])
y = df['y']
print(X.shape, y.shape)

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(*(part.shape for part in (X_train, y_train, X_val, y_val)))

In [None]:
def to_dataset(data, labels, shuffle=True, batch_size=32):
    """Wrap features and labels in a cached, batched, prefetched ``tf.data.Dataset``.

    Args:
        data: Feature matrix with one row per example. A SciPy sparse matrix
            (anything exposing ``tocoo()``, e.g. ``TfidfVectorizer`` output) is
            converted to a float32 ``tf.sparse.SparseTensor`` first, because
            ``from_tensor_slices`` cannot slice a SciPy matrix directly.
        labels: One label per row of ``data``.
        shuffle: Shuffle the examples with a buffer covering the whole
            dataset. Defaults to True, matching the previous behaviour.
        batch_size: Number of examples per batch (default 32).

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches.

    NOTE(review): sparse inputs yield sparse feature batches, so the model
    consuming this dataset must accept sparse input - confirm against the
    model definition.
    """
    if hasattr(data, 'tocoo'):
        coo = data.tocoo()
        indices = np.stack([coo.row, coo.col], axis=1).astype(np.int64)
        # reorder() puts the indices in the canonical row-major order.
        data = tf.sparse.reorder(
            tf.sparse.SparseTensor(indices, coo.data.astype(np.float32), coo.shape)
        )
    labels = np.asarray(labels)
    dataset = tf.data.Dataset.from_tensor_slices((data, labels)).cache()
    if shuffle:
        # Size the buffer from this dataset, not from the global X_train;
        # shuffle() requires a positive buffer size.
        dataset = dataset.shuffle(max(len(labels), 1))
    return dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
train_ds = to_dataset(X_train, y_train)
# Validation data is only evaluated, so its order does not need shuffling.
val_ds = to_dataset(X_val, y_val, shuffle=False)

# model

In [None]:
LSTM_SIZE = 4

def lstm_model(vocab_size=None, input_length=None, lstm_units=None):
    """Build and compile a small Embedding -> LSTM -> sigmoid binary classifier.

    Args:
        vocab_size: Input dimension of the embedding layer. Defaults to the
            notebook global ``total_words``.
        input_length: Sequence length declared on the embedding layer.
            Defaults to ``max_sequence_len - 1`` (notebook global).
        lstm_units: Width of the LSTM layer. Defaults to ``LSTM_SIZE``.

    Returns:
        A ``(model, name)`` tuple: the compiled Keras model and a run name of
        the form ``lstm_<units>``.

    NOTE(review): in the visible notebook ``total_words`` and
    ``max_sequence_len`` are assigned only inside commented-out tokenizer
    cells, so calling this with the defaults raises NameError unless those
    cells are restored or the values are passed explicitly. An Embedding
    layer also looks up integer token ids, while the datasets in this
    notebook are built from TF-IDF features - confirm which pipeline is
    intended before training.
    """
    # Globals are resolved at call time, as before, so a default call behaves
    # exactly like the original zero-argument function.
    if vocab_size is None:
        vocab_size = total_words
    if input_length is None:
        input_length = max_sequence_len - 1
    if lstm_units is None:
        lstm_units = LSTM_SIZE

    model = models.Sequential([
        layers.Embedding(vocab_size, 2, input_length=input_length),
        layers.LSTM(lstm_units),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    return model, f'lstm_{lstm_units}'

In [None]:
def tokenizer_train(model, name):
    """Fit ``model`` on the notebook's ``train_ds`` and validate on ``val_ds``.

    Training runs for at most 100 epochs with two callbacks watching
    ``val_loss``: the learning rate is halved after 3 epochs without
    improvement (1 epoch cooldown), and training stops after 6 such epochs,
    restoring the best weights. The final validation metrics are printed.

    Args:
        model: Compiled Keras model to train.
        name: Run name; only used by the commented-out ``model.save`` line.

    Returns:
        The object returned by ``model.fit``.
    """
    reducer = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, mode='min', cooldown=1)
    stopper = callbacks.EarlyStopping(monitor='val_loss', patience=6, mode='min', restore_best_weights=True)
    hist = model.fit(
        train_ds,
        epochs=100,
        verbose=1,
        callbacks=[stopper, reducer],
        validation_data=val_ds,
    )
    results = model.evaluate(val_ds)
#     model.save(f'/kaggle/working/{name}')
    print(f"results: {results}, type: {type(results)}")
    return hist

In [None]:
# Build the compiled model together with its run name, then print the layer summary.
model, name = lstm_model()
model.summary()

# Training

In [None]:
# Run training; `hist` is whatever model.fit returned and feeds the plots below.
hist = tokenizer_train(model, name)

In [None]:
# Training curves, one panel each: loss, accuracy and learning rate per epoch.
fig, axs = plt.subplots(3, 1, figsize=(8,8), tight_layout=True)

axs[0].plot(hist.history['loss'])
axs[0].plot(hist.history['val_loss'])
axs[0].set_title('binary_crossentropy Loss')
axs[0].set_ylabel('Loss')
axs[0].set_xlabel('Epoch')
axs[0].legend(['train', 'val'], loc='upper right')

axs[1].plot(hist.history['binary_accuracy'])
axs[1].plot(hist.history['val_binary_accuracy'])
axs[1].set_title('binary_accuracy Metric')
# This panel shows accuracy, so label the axis accordingly.
axs[1].set_ylabel('Accuracy')
axs[1].set_xlabel('Epoch')
axs[1].legend(['train', 'val'], loc='upper left')

# The learning-rate history key is 'lr' in older Keras releases and
# 'learning_rate' in newer ones; if neither exists the panel stays empty.
lr_values = hist.history.get('lr', hist.history.get('learning_rate', []))
axs[2].plot(lr_values)
axs[2].set_title('Learning Rate')
axs[2].set_ylabel('LR')
axs[2].set_xlabel('Epoch')
plt.savefig(f'/kaggle/working/{name}_graphs.png')
plt.show()

# Validate

In [None]:
# df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# X_less_toxic = tokenizer.texts_to_sequences(df_val['less_toxic'].apply(clean))
# X_less_toxic = np.array(pad_sequences(X_less_toxic, maxlen=max_sequence_len, padding='pre'))

# X_more_toxic = tokenizer.texts_to_sequences(df_val['more_toxic'].apply(clean))
# X_more_toxic = np.array(pad_sequences(X_more_toxic, maxlen=max_sequence_len, padding='pre'))

In [None]:
# Load the pairwise validation set and vectorise both comment columns with
# the already-fitted TF-IDF vectoriser.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
X_less_toxic, X_more_toxic = (
    vec.transform(df_val[column]) for column in ('less_toxic', 'more_toxic')
)

In [None]:
# Score each side of every validation pair: p1 for 'less_toxic', p2 for 'more_toxic'.
p1, p2 = (
    model.predict(features) for features in (X_less_toxic, X_more_toxic)
)

In [None]:
# Inspect the model outputs for the 'less_toxic' comments.
p1

In [None]:
# Inspect the model outputs for the 'more_toxic' comments.
p2

In [None]:
# Validation accuracy: fraction of pairs in which the 'less_toxic' comment
# receives a strictly lower score than the 'more_toxic' one.
np.mean(p1 < p2)

# Submission

In [None]:
# df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
# X_test = tokenizer.texts_to_sequences(df_sub['text'].apply(clean))
# X_test = np.array(pad_sequences(X_test, maxlen=max_sequence_len, padding='pre'))
# p3 = model.predict(X_test)

In [None]:
# Score the comments to be ranked for the submission.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
X_test = vec.transform(df_sub['text'])
# Use predict(), as for the validation pairs: Sequential.predict_proba has
# been removed from recent tf.keras releases, and for a sigmoid output layer
# predict() already returns the probabilities.
p3 = model.predict(X_test)

In [None]:
# Store the model outputs as the 'score' column of the submission frame.
df_sub['score'] = p3

In [None]:
# Number of non-null scores.
df_sub['score'].count()

In [None]:
# Number of distinct score values. When this is lower than the number of
# scored comments, some comments share an identical score and cannot be
# ranked against each other (original note: 9 comments will fail if compared
# one with the other).
df_sub['score'].nunique()

In [None]:
# Write the submission file: comment id and score only, without the index.
df_sub.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)