## Introduction

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

import tensorflow as tf
import keras.backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Concatenate
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers.merge import concatenate

from sklearn.model_selection import train_test_split

## Data Wrangling

In [None]:
# Competition input directory. The trailing slash matters because file names
# are appended by plain string concatenation.
PATH = '/kaggle/input/jigsaw-toxic-severity-rating/'

# Load the three competition tables: annotated comment pairs, the comments
# that must be scored, and the submission template.
valid_data, comment_data, sub = (
    pd.read_csv(PATH + csv_name)
    for csv_name in (
        'validation_data.csv',
        'comments_to_score.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Reorder the annotated pairs by the 'worker' column (in place; the original
# index labels are kept) and preview the first five rows.
valid_data.sort_values(by='worker', inplace=True)
valid_data.head(n=5)

In [None]:
# Preview the last five rows of the worker-sorted pairs.
valid_data.tail(n=5)

In [None]:
# Show the 'less_toxic' text of the row whose index *label* is 0. After the
# sort above this is not necessarily the first row of the frame.
valid_data.loc[0, 'less_toxic']

In [None]:
#df['less_toxic'][-3:]

In [None]:
#df.drop_duplicates(['less_toxic'], ignore_index=True)

## Data Preprocessing

In [None]:
# Work on a copy of the annotated pairs with a constant regression target.
# NOTE(review): every pair receives the same target (0.9); with the 'mse' loss
# used at compile time the model can minimise the loss without looking at its
# inputs — confirm this is the intended training signal.
df = valid_data.assign(target=0.9)

In [None]:
# Sequence-length settings for the tokenised comments.
MAX_FEATURES = 512
MAX_LENGTH = MAX_FEATURES   # length every encoded comment is padded/truncated to
MAX_TEXT_LENGTH = 2 * 1024  # not referenced by any other visible cell

In [None]:
# Split the annotated pairs into their two text columns.
less_toxic = df['less_toxic']
more_toxic = df['more_toxic']

# All comments from both columns stacked into one Series; this is what the
# tokenizer vocabulary is fitted on.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the result is the same.
toxic_text = pd.concat([less_toxic, more_toxic])

# Regression target, one value per pair.
target = df['target']

### Text Preprocessing

In [None]:
# Fit a lower-casing tokenizer on every comment from both columns.
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(toxic_text)

# Encode each column as lists of token ids with the shared vocabulary.
less_text_seq, more_text_seq = (
    tokenizer.texts_to_sequences(column)
    for column in (less_toxic, more_toxic)
)

# Bring every encoded comment to exactly MAX_LENGTH token ids.
less_text_vec, more_text_vec = (
    pad_sequences(encoded, maxlen=MAX_LENGTH)
    for encoded in (less_text_seq, more_text_seq)
)

# Cell output: shapes of the two padded matrices.
less_text_vec.shape, more_text_vec.shape

In [None]:
# Report the vocabulary size next to the largest token id that actually
# occurs in each padded matrix (less-toxic first, then more-toxic).
n_tokens = len(tokenizer.word_index)
less_max_id = less_text_vec.max()
more_max_id = more_text_vec.max()

print('Number of Tokens:', n_tokens)
print("Max Token Index:", less_max_id)
print("Max Token Index:", more_max_id, "\n")

## Loss Metrics 

In [None]:
# https://github.com/keras-team/keras/issues/910#issuecomment-218748553
def margin_ranking_loss(y_true, y_pred, margin=0.9): # change to 1.0?  makes more sense for normalized cosine distance [-1,1]
    '''Pairwise hinge (margin ranking) loss for a batch stacked in two halves.

    This only works when the batch is laid out so that the positive examples
    take up the first n/2 rows and the corresponding negative examples take
    up the last n/2 rows. The batch size must therefore be even.

    Args:
        y_true: per-row labels, multiplied element-wise into the scores.
            The hinge below expects the negative half to contribute
            ``-score``, which requires labels of +1 / -1; a label of 0
            removes that row's score from the loss.
            NOTE(review): the previous docstring said "ones or zeros" —
            confirm the intended label encoding.
        y_pred: model scores (e.g. inner products), one per row.
        margin: minimum gap required between a positive score and its
            paired negative score before the loss reaches zero.

    Returns:
        A tensor with one loss value per input row: the per-pair hinge loss
        repeated for both halves of the batch.
    '''
    # Use the dynamic shape instead of len(): when Keras traces the loss the
    # batch dimension is symbolic and len() on such a tensor raises TypeError.
    n = K.shape(y_true)[0] // 2

    # Multiplying by the labels flips the sign of the negative scores and
    # keeps y_true part of the computational graph.
    signed = y_pred * y_true
    pos = signed[:n]
    neg = signed[n:]

    # relu(margin - s_pos + s_neg): zero once the positive score beats the
    # negative score by at least `margin`.
    hinge_loss = K.relu(margin - pos - neg)

    # Duplicate so the output has one entry per input row.
    loss_vec = K.concatenate([hinge_loss, hinge_loss], axis=0)
    return loss_vec


## Model

In [None]:
# Two-input network: each comment of a pair is embedded separately, the two
# embedded sequences are joined along the feature axis, and a stacked LSTM
# followed by a small dense head produces a single score.
x1_input = Input(shape=(MAX_LENGTH,))
x2_input = Input(shape=(MAX_LENGTH,))

# One more row than there are known tokens, so the largest token id is a
# valid embedding index (presumably id 0 is the padding value — verify).
vocab_size = len(tokenizer.word_index) + 1

# Separate (non-shared) embedding tables for the two inputs.
x1 = Embedding(input_dim=vocab_size, output_dim=100)(x1_input)
x2 = Embedding(input_dim=vocab_size, output_dim=100)(x2_input)

# Use the Concatenate layer already imported from tensorflow.keras.layers
# instead of the functional helper from keras.layers.merge, a module path
# that no longer exists in newer Keras releases. Default axis is the last one.
x = Concatenate()([x1, x2])

x = LSTM(units=128, return_sequences=True)(x)
x = Dropout(0.2)(x)

# Second LSTM collapses the sequence to its final state.
x = LSTM(units=64, return_sequences=False)(x)
x = Dropout(0.2)(x)

x = Dense(64, activation='relu')(x)
x = Dropout(0.25)(x)

# Single unbounded score per input pair.
outputs = Dense(1)(x)

model = Model(inputs=[x1_input, x2_input], outputs=outputs)

model.compile(loss='mse', optimizer='adam') #, metrics=[margin_ranking_loss])

In [None]:
# Render the layer graph to model.png (top-to-bottom layout, layer names
# only) and show it as the cell output.
plot_options = dict(
    to_file="model.png",
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)
tf.keras.utils.plot_model(model, **plot_options)

In [None]:
# Training hyperparameters.
EPOCHS = 10
BATCH_SIZE = 256

# Stop when the training loss has not improved for 5 consecutive epochs.
# restore_best_weights is deliberately left at its default here.
# NOTE(review): both callbacks watch the training 'loss' although fit() below
# holds out a 10% validation split — confirm that monitoring the training
# loss rather than the validation loss is intended.
es = EarlyStopping(monitor='loss', mode='min', patience=5, verbose=1)

# Keep only the lowest-loss model on disk.
cp_file = './lstm_model.h5'
cp = ModelCheckpoint(
    filepath=cp_file,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=0,
)

# Fit on the (less_toxic, more_toxic) pairs against the constant target.
history = model.fit(
    x=[less_text_vec, more_text_vec],
    y=target,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    shuffle=True,
    callbacks=[es, cp],
)

In [None]:
#pd.DataFrame(history.history).plot(figsize=(12, 6));

## Prediction

In [None]:
# Comments that have to be scored, plus their ids.
test_ids, test_text = comment_data['comment_id'], comment_data['text']

# Encode with the vocabulary fitted on the training pairs, then pad/truncate
# to the same length the model was built for.
test_text_seq = tokenizer.texts_to_sequences(test_text)
test_text_vec = pad_sequences(sequences=test_text_seq, maxlen=MAX_LENGTH)

In [None]:
# Number of comments to score.
test_length = test_text_vec.shape[0]

# Training comments used as the "other side" of each test pair.
# NOTE(review): the slices only line up with the test matrix if the training
# matrices have at least test_length rows, and the pairing of test comments
# with these training comments is purely positional — confirm both.
more_reference = more_text_vec[:test_length]
less_reference = less_text_vec[:test_length]

# Three scorings of the test comments: paired with themselves, placed in the
# first slot against a more-toxic comment, and placed in the second slot
# against a less-toxic comment.
preds0 = model.predict([test_text_vec, test_text_vec])
preds1 = model.predict([test_text_vec, more_reference])
preds2 = model.predict([less_reference, test_text_vec])

In [None]:
# Overlay the score distributions of the three pairings on one set of axes.
labelled_scores = (
    (preds0, 'test-test'),
    (preds1, 'test-more'),
    (preds2, 'less-test'),
)
for scores, tag in labelled_scores:
    plt.hist(scores, label=tag)

plt.legend();

In [None]:
# Element-wise average of the three pairings, then its distribution.
preds = np.stack([preds0, preds1, preds2]).mean(axis=0)
plt.hist(preds, label='test-less');

In [None]:
# Count the predictions and how many distinct values each scoring produced.
# Few distinct values means the ranking below is decided largely by
# tie-breaking rather than by the scores themselves.
print(f"Total Predictions: {preds.shape[0]}")

# Tag each line with the pairing it belongs to (same tags as the histogram
# legend) so the four counts can be told apart in the output.
for tag, scores in (
    ('test-test', preds0),
    ('test-more', preds1),
    ('less-test', preds2),
    ('mean', preds),
):
    print(f"Total Unique Predictions ({tag}): {np.unique(scores).shape[0]}")

In [None]:
# Turn the raw test-test scores into ranks; ties are broken by row order
# (method='first') so every comment gets a distinct rank.
# NOTE(review): the averaged `preds` computed earlier is not used here, only
# preds0 — confirm that this is intended.
raw_scores = pd.Series(preds0.ravel(), index=sub.index)
sub['score'] = raw_scores.rank(method='first')

In [None]:
# Cell output: the submission frame with the ranked 'score' column.
sub

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv(path_or_buf='submission.csv', index=False)

**<center>This notebook is still under modification, so stay tuned.<br><span style='color:red'>Upvote</span> if you found it interesting; I am looking forward to your feedback.</center>**