# LSTM model for NLP

For a given text, we want to rate its reading complexity. This is a regression problem.

In this kernel, we are going to use an LSTM model to predict the complexity of a given text.

Don't hesitate to comment if you have a question or if you see improvements that could be made.

In [None]:
import numpy as np 
import pandas as pd 
import tensorflow as tf

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Input, Dropout, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.layers.wrappers import TimeDistributed
from tensorflow.python.keras.layers.recurrent import LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
from sklearn.model_selection import train_test_split
from os import path

In [None]:
# Read the competition's training set, test set and sample submission file.
df_train, df_test, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/commonlitreadabilityprize/train.csv",
        "../input/commonlitreadabilityprize/test.csv",
        "../input/commonlitreadabilityprize/sample_submission.csv",
    )
)

# Visualize some data

In [None]:
# Display the excerpt stored under index label 0 of the training set.
df_train.loc[0, 'excerpt']

In [None]:
# Drop the 'url_legal' and 'license' columns, then preview the first rows.
df_train = df_train.drop(['url_legal', 'license'], axis="columns")
df_train.head()

In [None]:
# Drop the 'url_legal' and 'license' columns, then preview the first rows.
df_test = df_test.drop(['url_legal', 'license'], axis="columns")
df_test.head()

In [None]:
# Longest excerpt, counted in characters, in the training and test sets.
max_length_training = int(df_train["excerpt"].map(len).max())
max_length_testing = int(df_test["excerpt"].map(len).max())

print(max_length_training, max_length_testing)

# Preprocess the data

In order to preprocess the data, we are going to:

- Word tokenize: break each sentence down into the words that compose it.
- Lowercase: normalize each word to lower case.
- Remove punctuation and digits.
- (optional) Remove stopwords: remove very common words that carry little meaning.
- (optional) Stemming: reduce each word to its stem, i.e. its root form. (Example: fishing, fished, fisher => fish)
- Lemmatize: replace each word with its lemma.

In this approach, I wanted to keep the context of the phrase and the sequence of words used.
I think the complexity of a phrase is also reflected by all of the words it uses.
For this reason, I didn't remove the stopwords, but I may be wrong on this one.
Also, I prefer lemmatization over stemming.

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# NLTK helpers for text preprocessing: a WordNet lemmatizer, a Porter stemmer
# and the set of English stop words.
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Normalize an excerpt into a space-separated string of lemmas.

    The text is split into tokens with ``word_tokenize``; each token is
    lower-cased, tokens that are not purely alphabetic (punctuation, digits,
    ...) are discarded, and every remaining word is passed through the
    module-level ``lemmatizer``. Stop words are kept and no stemming is
    applied.

    Args:
        text: The raw excerpt to clean.

    Returns:
        The lemmatized words joined by single spaces.
    """
    lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Skip anything that is not made of letters only.
        if not lowered.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))

    return " ".join(lemmas)

In [None]:
# Store the cleaned version of every excerpt in a new 'preprocess_text' column.
df_train['preprocess_text'] = df_train['excerpt'].map(preprocess_text)
df_test['preprocess_text'] = df_test['excerpt'].map(preprocess_text)

In [None]:
# Distinct words of the preprocessed training texts, in order of first
# appearance. Each text is split into a list of words and the lists are
# flattened with explode(), which avoids building a dense
# (number of texts x longest text) frame padded with missing values.
unique_words = list(df_train['preprocess_text'].str.split(' ').explode().unique())

In [None]:
# Number of distinct words found in the preprocessed training texts.
print(len(unique_words))

In [None]:
# Preview the training frame, which now includes the 'preprocess_text' column.
df_train.head()

## Tokenize our text

In [None]:
# Length, in word tokens, to which every sequence is padded. It is derived
# from the word counts of the preprocessed texts rather than from character
# counts: the tokenizer produces at most one index per word, so this is long
# enough for every text while avoiding a large number of pure-padding time
# steps in the LSTM.
MAX_INPUT_LENGTH = int(max(
    df_train['preprocess_text'].str.split().str.len().max(),
    df_test['preprocess_text'].str.split().str.len().max(),
))

# Vocabulary size shared by the Tokenizer (num_words) and the Embedding layer
# (input dimension). Keras word indices start at 1 and index 0 is reserved
# for padding, so one extra slot is needed to keep every training word.
DICTIONARY_SIZE = len(unique_words) + 1

# Dimension of each learned word-embedding vector.
EMBEDDING_SIZE = 100

In [None]:
# Build the word -> integer index mapping from the preprocessed training texts.
tokenizer = Tokenizer(
    num_words=DICTIONARY_SIZE,
    # The backslash is escaped explicitly: "\]" is not a valid escape
    # sequence and triggers a warning on recent Python versions. The
    # resulting filter string is unchanged.
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(df_train['preprocess_text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode the training texts as sequences of word indices, then pad (or
# truncate) them all to MAX_INPUT_LENGTH.
X = pad_sequences(
    tokenizer.texts_to_sequences(df_train['preprocess_text'].values),
    maxlen=MAX_INPUT_LENGTH,
)
print('Shape of data tensor:', X.shape)

# Encode the test texts with the same tokenizer and the same padded length.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(df_test['preprocess_text'].values),
    maxlen=MAX_INPUT_LENGTH,
)

# Regression targets, taken from the 'target' column of the training set.
Y = df_train['target'].to_numpy()

# Hold out 10% of the training data for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

# Build a simple LSTM model

With this LSTM model, we are going to stack two bidirectional LSTM layers.
Regarding the output, as we want to return a single score, we use a single neuron with a linear activation function.

In [None]:
def create_lstm_model():
    """Build and compile the stacked bidirectional LSTM regressor.

    The network takes sequences of ``MAX_INPUT_LENGTH`` word indices, embeds
    them into ``EMBEDDING_SIZE``-dimensional vectors, runs them through two
    bidirectional LSTM layers of 128 units (each followed by dropout) and
    ends with a single linear unit that outputs the predicted score.

    Returns:
        The model, compiled with a mean-squared-error loss, the Adam
        optimizer with its default settings, and MSE / MAE / RMSE metrics.
    """
    layers = [
        Input(shape=(MAX_INPUT_LENGTH,)),
        # Word indices -> dense vectors, with whole embedding channels dropped.
        Embedding(DICTIONARY_SIZE, EMBEDDING_SIZE),
        SpatialDropout1D(0.2),
        # First recurrent layer returns the full sequence for the second one.
        Bidirectional(LSTM(128, recurrent_dropout=0.2, return_sequences=True)),
        Dropout(0.5),
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        # Single linear neuron: the regression output.
        Dense(1, activation='linear'),
    ]
    model = Sequential(layers)

    # The learning rate of Adam could be tuned here.
    model.compile(
        loss="mean_squared_error",
        optimizer=Adam(),
        metrics=['mse', 'mae', RootMeanSquaredError()],
    )

    return model

# Instantiate the network and print its layer-by-layer summary.
model = create_lstm_model()
model.summary()

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler

# Location where the trained model is saved, and from which it is reloaded
# when it already exists.
MODEL_PATH = './bi_lstm_stack'

def lr_time_based_decay(epoch, lr, decay_rate=0.1):
    """Return the learning rate for ``epoch`` using time-based decay.

    The ``(epoch, lr)`` signature matches what Keras'
    ``LearningRateScheduler`` callback expects. When used with that callback,
    ``lr`` is the rate currently in effect, so the reduction compounds from
    one epoch to the next.

    Args:
        epoch: Index of the epoch about to start.
        lr: Learning rate to decay.
        decay_rate: Strength of the decay; ``0`` leaves ``lr`` unchanged.

    Returns:
        ``lr / (1 + decay_rate * epoch)``.
    """
    return lr / (1 + decay_rate * epoch)

# Train and save the model unless a saved copy already exists at MODEL_PATH,
# in which case that copy is loaded instead.
if not path.exists(MODEL_PATH):
    print("[*] Train the model !")

    model.fit(
        x=X_train,
        y=y_train,
        batch_size=16,
        epochs=8,
        verbose=1,
        validation_data=(X_val, y_val),
        # Update the learning rate at the start of every epoch.
        callbacks=[LearningRateScheduler(lr_time_based_decay, verbose=1)],
    )

    model.save(MODEL_PATH)
else:
    print("[*] Load pretrained model !")

    model = keras.models.load_model(MODEL_PATH)

In [None]:
# Predict a score for every test excerpt. The model ends in a single-unit
# Dense layer, so the predictions come back with one column per row.
y_pred = model.predict(X_test)

# Flatten to 1-D so the values map onto a single DataFrame column.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv - confirm.
df_sample['target'] = y_pred.reshape(-1)

df_sample.to_csv("submission.csv", index=False)