# Jigsaw Rate Severity - Simple LSTM

**Work:**
 - Forked https://www.kaggle.com/elcaiseri/jigsaw-keras-embedding-lstm
 - Revised data prep and model architecture to take a single input (text) and produce a single score (relative severity of toxicity)
     - The target is created from the (less) and (more) annotations: each text is assigned a value that is consistent with all of the (less) and (more) comparisons
 - Revised optimizer and manually tuned learning rate for better performance
 - Added text augmentation

**References and Acknowledgements:**
 - https://www.kaggle.com/elcaiseri/jigsaw-keras-embedding-lstm
 - https://www.kaggle.com/elcaiseri
 - https://www.kaggle.com/c/jigsaw-toxic-severity-rating/overview
 - https://github.com/tensorflow/tensorflow/issues/38613
 - https://www.kaggle.com/yeayates21/commonlit-text-augmentation-eng-to-fre-to-eng/notebook

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from random import sample
import time

import os
from tqdm.notebook import tqdm

import tensorflow as tf
import keras.backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Concatenate
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers.merge import concatenate

from sklearn.model_selection import train_test_split
from textblob import TextBlob

## Data Wrangling

In [None]:
# Read the three competition CSV files from the Kaggle input directory.
PATH = '/kaggle/input/jigsaw-toxic-severity-rating/'
valid_data, comment_data, sub = (
    pd.read_csv(f'{PATH}{csv_name}')
    for csv_name in (
        'validation_data.csv',
        'comments_to_score.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Order the validation rows by the 'worker' column, then preview the first rows.
valid_data = valid_data.sort_values('worker')
valid_data.head()

In [None]:
# (rows, columns) of the validation frame.
valid_data.shape

## Quick EDA

Can the same text be found more than once, in either column? - Answer: Yes (the cell below looks up a `more_toxic` text in the `less_toxic` column)

In [None]:
# Take the text in the first row, third column (the more_toxic example) ...
txteg = valid_data.iloc[0, 2]
# ... and list rows where that exact text also shows up as the less_toxic comment.
is_less_toxic_match = valid_data['less_toxic'] == txteg
valid_data.loc[is_less_toxic_match].head()

## Get Training Data

 - Data created here: https://www.kaggle.com/yeayates21/jigsaw-rate-severity-text-augmentation/notebook
 - Creating the data outside this notebook reduces runtime and allows internet access for the language augmentation

In [None]:
# Load the pre-built (augmented) training set and pull out the model input and target arrays.
training_data = pd.read_csv(
    "../input/jigsaw-rate-severity-text-augmentation/jigsaw_rate_severity_training_data.csv"
)
toxic_text, target = (training_data[column].values for column in ('text', 'target'))

In [None]:
# Sanity check: the text and target arrays should report the same length.
for caption, items in (("Text list length: ", toxic_text), ("Target list length: ", target)):
    print(caption, len(items))

In [None]:
# Histogram of the training target on the current axes.
target_axes = plt.gca()
target_axes.hist(target, label='training target distribution')
target_axes.legend();

### Text Preprocessing for Deep Learning

In [None]:
# Fixed width of every token-id sequence fed to the model.
MAX_LENGTH = 512

# Fit the vocabulary on the (lower-cased) training texts.
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(toxic_text)

# Map each text to its list of integer token ids, then pad/truncate every
# list to MAX_LENGTH entries using pad_sequences' default settings.
text_seq = tokenizer.texts_to_sequences(toxic_text)
text_vec = pad_sequences(sequences=text_seq, maxlen=MAX_LENGTH)

# Show the resulting (n_texts, MAX_LENGTH) matrix shape.
text_vec.shape

In [None]:
# Size of the vocabulary learned by the tokenizer.
vocab_size = len(tokenizer.word_index)
print('Number of Tokens:', vocab_size)

## Model

In [None]:
# Single-input regression network:
#   token ids -> embedding -> LSTM -> dense head -> one unbounded score.
#
# Every layer is taken from `tf.keras.layers` so that the whole graph is built
# with the same Keras implementation as the `Input` and `Model` classes
# (imported from tensorflow.keras). Mixing layers from the standalone `keras`
# package with tensorflow.keras models is fragile across library versions.
x_input = Input(shape=(MAX_LENGTH,))

# input_dim is vocabulary size + 1: Tokenizer indices start at 1 and index 0
# is the value pad_sequences pads with by default.
x = tf.keras.layers.Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=100,
)(x_input)

# Disabled optional stacked LSTM layer:
#x = LSTM(units=128, return_sequences=True)(x)
#x = Dropout(0.2)(x)

# Only the final LSTM state is kept (return_sequences=False).
x = tf.keras.layers.LSTM(units=64, return_sequences=False)(x)
x = tf.keras.layers.Dropout(0.2)(x)

x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dropout(0.25)(x)

# Linear single-unit output, trained with mean squared error.
outputs = tf.keras.layers.Dense(1)(x)

model = Model(inputs=x_input, outputs=outputs)

model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0025))

In [None]:
# Render the architecture diagram to model.png; every option is spelled out
# explicitly so the picture does not depend on library defaults.
plot_options = dict(
    to_file="model.png",
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=96,
    layer_range=None,
)
tf.keras.utils.plot_model(model, **plot_options)

In [None]:
# Model hyperparameters 
BATCH_SIZE = 256
EPOCHS = 20

# model drive
cp_file = './lstm_model.h5'
cp = ModelCheckpoint(cp_file, 
                     monitor='loss', 
                     verbose=0, 
                     save_best_only=True, mode='min')

es = EarlyStopping(patience=3, 
                   monitor='loss', 
                   #restore_best_weights=True, 
                   mode='min', 
                   verbose=1)

# model train
history = model.fit(text_vec, target,
                    batch_size=BATCH_SIZE, 
                    epochs=EPOCHS,
                    validation_split=0.1,
                    callbacks=[es, cp],
                    shuffle=True,
                    )

In [None]:
# Plot every metric recorded per epoch during training.
history_frame = pd.DataFrame(history.history)
history_frame.plot(figsize=(12, 6));

## Prediction

In [None]:
# Columns of the comments that have to be scored.
test_ids, test_text = comment_data['comment_id'], comment_data['text']

# Encode with the tokenizer fitted on the training texts, then pad/truncate
# to the same MAX_LENGTH the model was built for.
test_text_seq = tokenizer.texts_to_sequences(test_text)
test_text_vec = pad_sequences(sequences=test_text_seq, maxlen=MAX_LENGTH)

In [None]:
# Number of comments to score (rows of the padded matrix).
test_length = test_text_vec.shape[0]

# Score every test comment with the trained model.
preds = model.predict(x=test_text_vec)

In [None]:
# Histogram of the raw model outputs on the current axes.
preds_axes = plt.gca()
preds_axes.hist(preds, label='test prediction distribution')
preds_axes.legend();

In [None]:
# The model's single output unit yields one prediction per row; flatten it
# into a column and replace the raw scores by their rank. Ties are broken by
# order of appearance (method='first'), so every comment gets a distinct value.
raw_scores = pd.Series(preds.ravel(), index=sub.index)
sub['score'] = raw_scores.rank(method='first')

In [None]:
# Write the submission file without the DataFrame index column.
submission_file = 'submission.csv'
sub.to_csv(submission_file, index=False)

In [None]:
# Display the final submission frame as the cell output for a visual check.
sub