# Machine Learning Project - Inappropriate Language Classification - LSTM

This notebook is separate from the rest because the embedding layers are integrated directly in the model. That is because the model adds its own embeddings for the count vectorizer. Furthermore, the LSTM model has a fixed input size; as such, inputs longer than that size are truncated from the start and shorter inputs are zero-padded at the start (the default behaviour of Keras `pad_sequences`).

This Jupyter Notebook contains the following features:
1. Model Choice
    1. Using the Base Embedding Layer
        - Data Tokenisation
        - Model building with embedding layer
    2. Using the GloVe embeddings
        - Data Embedding
        - Model building without embedding layer
2. Model Training
3. Model Testing

In [None]:
#Parameters
# Fixed sequence length used throughout the notebook: `pad_sequences` pads or
# truncates every input to this many steps, and the model's Input layer uses it
# as the time dimension.
max_input_size = 120

## 1. Model choice

### 1. Default Embeddings

In [None]:
# Create the tokenizer
from experiment_baseplate import get_text_data
from tensorflow.keras.preprocessing.text import Tokenizer

max_words = 10000  # Max number of words to use in the tokenizer

# Fit the tokenizer on the texts returned by get_text_data()
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(get_text_data())

# Vocabulary built during fitting
word_index = tokenizer.word_index
print("Number of known words: ", len(word_index))

In [None]:
from experiment_baseplate import load_split_data
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train, y_train, X_validate, y_validate, X_test, y_test = load_split_data()


def post_process(X_values):
    """Convert raw texts into fixed-length integer sequences.

    Each text is mapped to word indices by the fitted `tokenizer`, then padded
    or truncated to `max_input_size` entries with `pad_sequences` defaults.
    """
    sequences = tokenizer.texts_to_sequences(X_values)
    padded = pad_sequences(sequences, maxlen=max_input_size)
    return padded


# Tokenize every split with the same tokenizer and sequence length
X_train, X_test, X_validate = map(post_process, (X_train, X_test, X_validate))

In [None]:
# Define layers: learned embedding -> LSTM -> dropout -> 2-unit softmax
import tensorflow.keras.layers as tfl

embedding_dim = 200  # size of each learned word vector

lstm_layers = [
    tfl.Input(shape=(max_input_size,)),
    tfl.Embedding(input_dim=max_words, output_dim=embedding_dim),
    tfl.LSTM(units=64),
    tfl.Dropout(rate=0.2),
    tfl.Dense(units=2, activation="softmax"),
]

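### 2. GloVe Embeddings

This section is an alternative to the Default Embeddings section above: both assign `X_train` / `X_validate` / `X_test`, `post_process` and `lstm_layers`, so the model built in Model Training uses whichever section was run last.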

In [None]:
# If needed, download the weights
from experiment_baseplate import get_glove_model

get_glove_model()

In [None]:
from experiment_baseplate import get_split_glove_embedding

# Take the train / validation / test splits from get_split_glove_embedding()
(
    X_train, y_train,
    X_validate, y_validate,
    X_test, y_test,
) = get_split_glove_embedding()

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def post_process(X_values):
    """Pad or truncate each sequence of embedding vectors to `max_input_size` steps.

    `pad_sequences` defaults to dtype="int32", which would cast floating-point
    embedding vectors to integers and discard their fractional part, so float32
    is requested explicitly.
    NOTE(review): assumes each element of X_values is a sequence of float
    vectors, one per token -- confirm against get_split_glove_embedding.

    Returns a NumPy array whose second axis has length `max_input_size`.
    """
    padded = pad_sequences(X_values, maxlen=max_input_size, dtype="float32")
    # pad_sequences already returns an ndarray; asarray avoids an extra copy
    return np.asarray(padded)

X_train = post_process(X_train)
X_test = post_process(X_test)
X_validate = post_process(X_validate)

In [None]:
# Define layers: pre-computed embedding vectors -> LSTM -> dropout -> 2-unit softmax
import tensorflow.keras.layers as tfl

# Width of one embedding vector, read from the third axis of the padded training array
glove_embedding_dim = X_train.shape[2]

lstm_layers = [
    tfl.Input(shape=(max_input_size, glove_embedding_dim)),
    tfl.LSTM(units=64),
    tfl.Dropout(rate=0.2),
    tfl.Dense(units=2, activation="softmax"),
]

## 2. Model Training

In [None]:
#Build the model
from tensorflow.keras.models import Model

# An Input tensor plus at least one layer is required to form a model.
if len(lstm_layers) < 2:
    # Raise instead of calling exit(): this stops the cell with a clear error
    # and leaves the kernel and its state intact.
    raise ValueError("Not enough layers in your model!")

# Chain the layers functionally WITHOUT overwriting the entries of `lstm_layers`.
# Replacing list items with their output tensors would make a second run of this
# cell try to call tensors as if they were layers.
model_input = lstm_layers[0]
model_output = model_input
for layer in lstm_layers[1:]:
    model_output = layer(model_output)

model = Model(inputs=model_input, outputs=model_output)
# NOTE(review): categorical_crossentropy expects one-hot encoded labels --
# confirm that the y_* arrays are one-hot with 2 columns.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train the model
epochs = 10
batch_size = 64

# Fit on the training split and evaluate on the validation split after each epoch
model.fit(
    x=X_train,
    y=y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_validate, y_validate),
)

## 3. Model Testing

In [None]:
from experiment_baseplate import score

print("LSTM Model")

# Score the model's predictions on the validation split, then on the test split
for split_label, split_inputs, split_targets in (
    ("Validate values -> ", X_validate, y_validate),
    ("Test values -> ", X_test, y_test),
):
    print(split_label + score(model.predict(split_inputs), split_targets))