# Version 2 Neural Network (Regularization & Early Stopping)

In [1]:
# imports 
import os
import joblib
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, Bidirectional, LSTM, BatchNormalization
from keras.callbacks import EarlyStopping

In [4]:
# Read the preprocessed train / validation splits from disk.
train_df = pd.read_csv(filepath_or_buffer="../../clean_data/train.csv")
val_df = pd.read_csv(filepath_or_buffer="../../clean_data/val.csv")

# Inputs come from the "cleaned_synopsis" column, targets from the "genre" column.
X_train, y_train = train_df["cleaned_synopsis"].values, train_df["genre"].values
X_val, y_val = val_df["cleaned_synopsis"].values, val_df["genre"].values

In [7]:
# Tokenize: turn each synopsis into a fixed-length sequence of integer word ids.
max_words = 10000  # vocabulary size passed to the tokenizer (num_words)
max_len = 200      # max sequence length after padding / truncation

# The vocabulary is fitted on the training text only; the validation text is
# only transformed with it further down.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Persist the fitted tokenizer. Create the output directory first so this cell
# also succeeds on a fresh checkout (Restart & Run All) where ../../models
# does not exist yet.
os.makedirs("../../models", exist_ok=True)
joblib.dump(tokenizer, "../../models/tokenizer.pkl")

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

# "post" padding/truncating: pad at the end and, for long texts, keep the
# beginning of the synopsis rather than the end.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post", truncating="post")
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding="post", truncating="post")

In [5]:
# Encode the data: genre strings -> integer ids -> one-hot rows.

label_encoder = LabelEncoder()
y_train_ids = label_encoder.fit_transform(y_train)
y_val_ids = label_encoder.transform(y_val)

# Fix the one-hot width from the fitted encoder. Without an explicit
# num_classes, to_categorical infers the width from the largest id present,
# so a validation split that lacks the last genre would yield fewer columns
# than the training targets.
n_genres = len(label_encoder.classes_)
y_train_enc = to_categorical(y_train_ids, num_classes=n_genres)
y_val_enc = to_categorical(y_val_ids, num_classes=n_genres)

# Create the output directory before writing so the cell works on a fresh
# checkout where ../../models does not exist yet.
os.makedirs("../../models", exist_ok=True)
joblib.dump(label_encoder, "../../models/label_encoder.pkl")

['../../models/label_encoder.pkl']

In [8]:
# Build the version-2 classifier: Embedding -> BiLSTM -> regularized dense head -> softmax.
vocab_size = max_words   # embedding input_dim; tied to the tokenizer's num_words so the two cannot drift apart
embedding_dim = 128      # size of word vector embeddings
num_classes = y_train_enc.shape[1]  # number of genres (width of the one-hot targets)

model = Sequential()
# NOTE(review): `input_length` is reported as deprecated in newer Keras releases — confirm against the installed version.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model.add(Bidirectional(LSTM(32)))  # reduced LSTM size from 64 -> 32
model.add(Dropout(0.35))
# Added L2 regularization and reduced the layer size from 64 -> 32.
model.add(Dense(32, activation="relu", kernel_regularizer=regularizers.l2(0.02)))
model.add(BatchNormalization())
model.add(Dropout(0.35))
model.add(Dense(num_classes, activation='softmax'))

# categorical_crossentropy matches the one-hot encoded targets.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])



In [None]:
# Early stopping (added in version 2):
#   - watch the validation loss,
#   - stop once it has not improved for 3 consecutive epochs,
#   - then roll the model back to the best weights seen.
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

In [None]:
# Train on the padded sequences and evaluate on the validation split each epoch.
model.fit(
    x=X_train_pad,
    y=y_train_enc,
    batch_size=16,
    epochs=20,  # upper bound, lowered from 30; early stopping may end training sooner
    callbacks=[early_stop],  # early stopping added in version 2
    validation_data=(X_val_pad, y_val_enc),
)


In [None]:
# Write the trained network to an .h5 file, creating the models directory if needed.
os.makedirs(name="../../models", exist_ok=True)
model.save(filepath="../../models/nn_genre_model_2.h5")