# Imports

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
import os
import time
import matplotlib.pyplot as plt

from gensim.models import KeyedVectors, FastText

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import callbacks, models, layers

# tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from scipy.stats import rankdata

In [None]:
# Number of units handed to SimpleModel, which uses it for both LSTM layers.
LSTM_SIZE =  128
# EMB_SIZE = 4
# MAX_WORDS = 25_000

from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus so stopwords.words("english") can be read later.
nltk.download('stopwords')

In [None]:
# Maximum number of epochs passed to simple_model.fit (early stopping may end sooner).
EPOCHS = 100
# Batch size applied by to_dataset when batching the tf.data pipeline.
BATCH_SIZE = 16
# When True, the training frame is subsampled to 400 rows for a quick run.
DEBUG = False

# Create train data

The original competition was a multi-label task: each comment carries six separate toxicity labels.

We turn it into a binary toxic / non-toxic classification: a comment is labelled toxic if at least one of the six labels is set.

In [None]:
# The six per-comment labels of the original multi-label task.
TOXICITY_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
# A comment counts as toxic (y = 1) as soon as the sum of its labels is positive.
any_label_set = df[TOXICITY_COLUMNS].sum(axis=1).gt(0)
df = (
    df.assign(y=any_label_set.astype(int))
      .loc[:, ['comment_text', 'y']]
      .rename(columns={'comment_text': 'text'})
)
# Notebook display: peek at five random rows.
df.sample(5)

# Undersample

The dataset is heavily imbalanced. Here we undersample the majority (non-toxic) class so that both classes have the same number of rows; other strategies might work better.

In [None]:
# Balance the classes: keep every y == 1 row and draw an equally sized
# random sample (fixed seed) from the y == 0 rows.
is_positive = df['y'] == 1
min_len = is_positive.sum()
df_y0_undersample = df[df['y'] == 0].sample(n=min_len, random_state=201)
df = pd.concat([df[is_positive], df_y0_undersample])

# In debug mode shrink the balanced frame to 400 rows for a fast run.
if DEBUG:
    df = df.sample(n=400, random_state=201)

# Notebook display: class counts after balancing.
df['y'].value_counts()

# transform the data

In [None]:
# English stop words removed during cleaning. Kept as a set because every word
# of every comment is tested for membership, and set lookups are O(1) whereas
# scanning the original list is O(len(list)) per word.
stop_words = set(stopwords.words("english"))
class DataPipeline:
    """Clean raw comments and map each one to a pre-trained FastText vector.

    Nothing is learned from the data: the vectors come from a FastText model
    loaded from disk, so ``fit_transform`` and ``transform`` return the same
    result for the same input.
    """

    # Compiled once; matches every character that is not an ASCII letter.
    _NON_LETTERS = re.compile(r"[^a-zA-Z]")

    def __init__(self):
        """Load the pre-trained FastText model from the Kaggle input directory."""
        self.fmodel = FastText.load('../input/jigsaw-regression-based-data/FastText-jigsaw-100D/Jigsaw-Fasttext-Word-Embeddings.bin')
        # NOTE(review): hard-coded; presumably equals the loaded model's
        # vector size (the directory name says 100D) - confirm.
        self.emb_dims = 100

    def clean(self, comment):
        """Return ``comment`` as lower-case, letters-only, stop-word-free text.

        Steps: strip HTML markup, replace every non-letter with a space,
        lower-case, split on whitespace, drop words found in the module-level
        ``stop_words`` collection and re-join with single spaces.

        Args:
            comment: Raw comment text (may contain HTML).

        Returns:
            The cleaned comment as a single space-separated string.
        """
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns), so the extracted text
        # could differ between environments. "html.parser" ships with Python.
        text = BeautifulSoup(comment, "html.parser").get_text()
        letters_only = self._NON_LETTERS.sub(" ", text).lower()
        return " ".join(w for w in letters_only.split() if w not in stop_words)

    def fit_transform(self, text):
        """Embed ``text``; identical to ``transform`` since nothing is fitted.

        Kept so callers can use the familiar fit_transform/transform pair.
        """
        return self.transform(text)

    def transform(self, text):
        """Embed every comment in ``text``.

        Each cleaned comment is passed as one single key to ``fmodel.wv``,
        i.e. a comment yields one vector rather than one vector per word.

        Args:
            text: Iterable of raw comment strings.

        Returns:
            ``np.ndarray`` built from one ``[vector]`` entry per comment
            (presumably shape ``(n_comments, 1, emb_dims)`` - confirm).
        """
        # The extra list level adds a length-1 axis per comment.
        sequences = [[self.fmodel.wv[self.clean(txt)]] for txt in text]
        return np.array(sequences)

# Shared pipeline instance; constructing it loads the FastText model from disk.
data_pipeline = DataPipeline()

In [None]:
# Embed all comments once, then hold out 20% of the rows for validation.
embedded_comments = data_pipeline.fit_transform(df.text)
X_train, X_val, y_train, y_val = train_test_split(
    embedded_comments,
    df.y,
    test_size=0.2,
    random_state=0,
)
# Sanity check: shapes of the four splits.
print(*(split.shape for split in (X_train, y_train, X_val, y_val)))

In [None]:
def to_dataset(data, labels):
    """Wrap ``data`` and ``labels`` in a cached, shuffled, batched tf.data pipeline.

    Args:
        data: Array of examples; ``data.shape[0]`` is used to size the shuffle buffer.
        labels: Labels aligned with ``data`` along the first axis.

    Returns:
        A ``tf.data.Dataset`` yielding ``(data, labels)`` batches of ``BATCH_SIZE``.
    """
    # Buffer of one more than the number of rows, as in the original pipeline.
    shuffle_buffer = data.shape[0] + 1

    pipeline = tf.data.Dataset.from_tensor_slices((data, labels))
    pipeline = pipeline.cache()
    pipeline = pipeline.shuffle(shuffle_buffer)
    pipeline = pipeline.batch(BATCH_SIZE)
    return pipeline.prefetch(tf.data.experimental.AUTOTUNE)


# Training and validation pipelines for the simple model.
simple_train_ds = to_dataset(X_train, y_train)
simple_val_ds = to_dataset(X_val, y_val)

# Simple Model

In [None]:
class SimpleModel(tf.keras.Model):
    """Binary classifier: two stacked LSTM layers followed by one sigmoid unit."""

    def __init__(self, lstm_size):
        """Create the layers.

        Args:
            lstm_size: Number of units in each of the two LSTM layers.
        """
        super(SimpleModel, self).__init__(name='')
        # The first LSTM must emit its full output sequence so that the second
        # LSTM receives 3-D input. The keyword is `return_sequences`; the
        # previous spelling `return_sequence` is not a valid LSTM argument.
        self.lstm1 = layers.LSTM(lstm_size, return_sequences=True)
        self.lstm2 = layers.LSTM(lstm_size)
        self.dense = layers.Dense(1, activation='sigmoid')

    def call(self, input_tensor, training=False):
        """Run the forward pass.

        Args:
            input_tensor: Input batch fed to the first LSTM layer.
            training: Accepted for Keras API compatibility; not used
                explicitly in this method.

        Returns:
            Output of the sigmoid Dense(1) layer for each example.
        """
        x = self.lstm1(input_tensor)
        x = self.lstm2(x)
        return self.dense(x)

In [None]:
# Instantiate the network and its optimizer (Adam with a 1e-4 step size).
simple_model = SimpleModel(LSTM_SIZE)
simple_optimizer = tf.keras.optimizers.Adam(1e-4)

simple_model.compile(
    optimizer=simple_optimizer,
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

# Build with the expected input shape so summary() can report parameter counts.
simple_input_shape = (None, 1, data_pipeline.emb_dims)
simple_model.build(simple_input_shape)
simple_model.summary()

In [None]:
# Halve the learning rate when validation loss has not improved for 2 epochs.
# The keyword was previously misspelled `monior`, so the monitored quantity
# was never actually passed to the callback.
simple_reducer = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, mode='min', cooldown=1)
# Stop after 4 epochs without validation-loss improvement and roll back to the
# best weights seen.
simple_stopper = callbacks.EarlyStopping(monitor='val_loss', patience=4, mode='min', restore_best_weights=True)
simple_hist = simple_model.fit(simple_train_ds,
                               epochs=EPOCHS,
                               verbose=1,
                               callbacks=[simple_stopper, simple_reducer],
                               validation_data=simple_val_ds)
# Final validation metrics with the restored weights.
results = simple_model.evaluate(simple_val_ds)
print(f"results: {results}, type: {type(results)}")

In [None]:
# Training curves for the simple model: loss, accuracy and learning rate.
fig, axs = plt.subplots(3, 1, figsize=(8,8), tight_layout=True)
simple_history = simple_hist.history

axs[0].plot(simple_history['loss'])
axs[0].plot(simple_history['val_loss'])
axs[0].set_title('binary_crossentropy Loss')
axs[0].set_ylabel('Loss')
axs[0].set_xlabel('Epoch')
axs[0].legend(['train', 'val'], loc='upper right')

axs[1].plot(simple_history['binary_accuracy'])
axs[1].plot(simple_history['val_binary_accuracy'])
axs[1].set_title('binary_accuracy Metric')
# The plotted quantity is an accuracy, not an error (the axis used to say 'Error').
axs[1].set_ylabel('Accuracy')
axs[1].set_xlabel('Epoch')
axs[1].legend(['train', 'val'], loc='upper left')

# The learning-rate series is read from 'lr' as before; if that key is absent,
# fall back to 'learning_rate', which some Keras versions use instead
# (NOTE(review): confirm against the installed Keras version).
lr_key = 'lr' if 'lr' in simple_history else 'learning_rate'
axs[2].plot(simple_history[lr_key])
axs[2].set_title('Learning Rate')
axs[2].set_ylabel('LR')
axs[2].set_xlabel('Epoch')
# plt.savefig(f'/kaggle/working/{name}_graphs.png')
plt.show()

# Complex data generation

In [None]:
# df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# if(DEBUG):
#     df_val = df_val.sample(n=400)
# X2_train, X2_val, y2_train, y2_val = train_test_split(data_pipeline.transform(df_val.less_toxic), data_pipeline.transform(df_val.more_toxic), test_size=0.2, random_state=0)
# print(X2_train.shape, y2_train.shape, X2_val.shape, y2_val.shape)

In [None]:
# Comment pairs from the severity-rating validation file; the two columns of
# each row are embedded separately with the shared pipeline.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
less_toxic = data_pipeline.transform(df_val.less_toxic)
more_toxic = data_pipeline.transform(df_val.more_toxic)

In [None]:
# def to_dataset_complex(data1, data2):
#     dataset = tf.data.Dataset.from_tensor_slices((data1, data2))
#     dataset = dataset.cache().shuffle(data1.shape[0] + 1).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
#     return dataset
# complex_train_ds = to_dataset_complex(X2_train, y2_train)
# complex_val_ds = to_dataset_complex(X2_val, y2_val)

# Complex Model

In [None]:
# def loss_function(lower, upper):
#     return tf.nn.relu(tf.math.subtract(lower, upper))
# #     over_under_err = tf.math.square(tf.nn.relu(tf.math.subtract(lower, upper)))
# #     separation_err = tf.math.square(tf.math.subtract(tf.ones_like(upper), tf.nn.relu(tf.math.subtract(upper, lower))))
# #     return tf.math.add(over_under_err, separation_err)

# def complex_metric(more_severe, less_severe):
#     acc = tf.math.greater(more_severe, less_severe)
#     acc = tf.where(acc, 0.0, 1.0)
#     return acc

In [None]:
# class ComplexModel(tf.keras.Model):
#     def __init__(self, lstm_size):
#         super(ComplexModel, self).__init__(name='')
#         self.lstm1 = layers.LSTM(lstm_size)
#         self.dense = layers.Dense(1, activation='sigmoid')
        
#     def train_step(self, data):
# #         if(len(data) == 3):
# #             X, y, sample_weights = data
# #         else:
# #             X, y = data
# #             sample_weights = None
#         with tf.GradientTape() as tape:
#             less_severe, more_severe = self.call(data, training=True)
#             loss = self.compiled_loss(less_severe, more_severe)
#             grads = tape.gradient(loss, self.trainable_variables)
#         self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
#         self.compiled_metrics.update_state(less_severe, more_severe)
#         return {m.name: m.result() for m in self.metrics}
    
#     def test_step(self, data):
#         less_severe, more_severe = self.call(data, training=False)
#         self.compiled_loss(less_severe, more_severe, regularization_losses=self.losses)        
#         self.compiled_metrics.update_state(less_severe, more_severe)
#         return {m.name: m.result() for m in self.metrics}
    
#     def call(self, input_tensor, training=False):
#         less_severe_tensor, more_severe_tensor = input_tensor
#         less_severe = self.lstm1(less_severe_tensor)
#         less_severe = self.dense(less_severe)
#         more_severe = self.lstm1(more_severe_tensor)
#         more_severe = self.dense(more_severe)
#         return less_severe, more_severe

In [None]:
# Print the shape of every weight tensor held by the simple model.
for weight in simple_model.weights:
    print(weight.shape)

In [None]:
# complex_optimizer = tf.keras.optimizers.Adam(1e-4)
# complex_model = ComplexModel(LSTM_SIZE)
# complex_model.compile(loss=loss_function, optimizer=complex_optimizer, metrics=[complex_metric])
# complex_model.fit(complex_val_ds, epochs=1, verbose=0);

In [None]:
# len(complex_model.weights)
# for i in range(len(complex_model.weights)):
#     print(complex_model.weights[i].shape)

# complex_model.set_weights(simple_model.get_weights())

In [None]:
# complex_model.evaluate(complex_val_ds)

In [None]:
# complex_reducer = callbacks.ReduceLROnPlateau(monior='val_loss', factor=0.5, patience=2, mode='min', cooldown=1)
# complex_stopper = callbacks.EarlyStopping(monitor='val_loss', patience=4, mode='min', restore_best_weights=True)
# complex_hist = complex_model.fit(complex_train_ds,
#                                  epochs=EPOCHS,
#                                  verbose=1,
#                                  callbacks=[complex_stopper, complex_reducer],
#                                  validation_data=complex_val_ds)
# results = complex_model.evaluate(complex_val_ds)
# print(f"results: {results}, type: {type(results)}")

In [None]:
# fig, axs = plt.subplots(3, 1, figsize=(8,8), tight_layout=True)

# axs[0].plot(complex_hist.history['loss'])
# axs[0].plot(complex_hist.history['val_loss'])
# axs[0].set_title('Custom Loss')
# axs[0].set_ylabel('Loss')
# axs[0].set_xlabel('Epoch')
# axs[0].legend(['train', 'val'], loc='upper right')

# axs[1].plot(complex_hist.history['complex_metric'])
# axs[1].plot(complex_hist.history['val_complex_metric'])
# axs[1].set_title('complex_metric')
# axs[1].set_ylabel('Error')
# axs[1].set_xlabel('Epoch')
# axs[1].legend(['train', 'val'], loc='upper left')

# axs[2].plot(complex_hist.history['lr'])
# axs[2].set_title('Learining Rate')
# axs[2].set_ylabel('LR')
# axs[2].set_xlabel('Epoch')
# # plt.savefig(f'/kaggle/working/{name}_graphs.png')
# plt.show()

In [None]:
# Score both sides of every validation pair with the simple model.
val_simple_less = simple_model(less_toxic).numpy()
val_simple_more = simple_model(more_toxic).numpy()
# val_complex_less, val_comples_more = complex_model([X2_val, y2_val])

In [None]:
# Fraction of pairs where the "more toxic" comment received the higher score.
np.mean(val_simple_less < val_simple_more)

In [None]:
# (val_complex_less.numpy() < val_comples_more.numpy()).mean()

In [None]:
# complex_model.evaluate(complex_val_ds)

# Submission

In [None]:
# Comments to be scored for the submission, embedded with the shared pipeline.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
X_test = data_pipeline.transform(df_sub['text'])

In [None]:
# Score every submission comment with the simple model.
p3 = simple_model(X_test)
# Flatten the model output to a plain 1-D NumPy array before storing it.
# Assigning the raw tensor relied on pandas converting a non-1-D, non-NumPy
# object implicitly, which is fragile across pandas versions.
simple_scores = np.asarray(p3).reshape(-1)

df_simple_sub = df_sub.copy()
df_simple_sub['score'] = simple_scores
# Sanity check: number of scores and how many of them are distinct.
print(df_simple_sub['score'].count())
print(df_simple_sub['score'].nunique())
# Replace raw scores by their ordinal rank so every comment gets a unique value.
df_simple_sub['score'] = rankdata(df_simple_sub['score'], method='ordinal')
df_simple_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)

In [None]:
# p3 = complex_model(X_test)
# df_complex_sub = df_sub.copy()
# df_complex_sub['score'] = p3
# print(df_complex_sub['score'].count())
# print(df_complex_sub['score'].nunique())
# df_complex_sub[['comment_id', 'score']].to_csv("complex_submission.csv", index=False)