# Using Keras Model

In [1]:
# imports 
import os
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, Bidirectional, LSTM  

In [None]:
# load preprocessed data 
train_df = pd.read_csv("../clean_data/train.csv")
val_df = pd.read_csv("../clean_data/val.csv")

X_train = train_df["cleaned_synopsis"].values
y_train = train_df["genre"].values
X_val = val_df["cleaned_synopsis"].values
y_val = val_df["genre"].values

In [3]:
#Tokenize
max_words = 10000  # vocabulary size
max_len = 200      # max sequence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post", truncating="post") #post to keep beginning of sentence rather than end 
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding="post", truncating="post")

In [4]:
# Encode the data 

label_encoder = LabelEncoder()
y_train_enc = to_categorical(label_encoder.fit_transform(y_train))
y_val_enc = to_categorical(label_encoder.transform(y_val))

In [5]:
print(X_train_pad.shape)      # Should be (43200, max_len)
print(type(X_train_pad[0][0]))  # Should be <class 'int'>
print(X_train_pad[0][:20])    # First 20 indices of first sample


(43200, 200)
<class 'numpy.int32'>
[ 905    1    2  152 1292 3380  622    1  905    2 1040 3464  523 1351
  255   20   39 1790    1  174]


In [None]:
vocab_size = 10000   # how many unique words you let tokenizer keep
embedding_dim = 64   # size of word vector embeddings
num_classes = y_train_enc.shape[1] # number of genres

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
#smodel.add(GlobalAveragePooling1D())  # reduce sequence to fixed-size vector
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])



In [7]:
model.fit(X_train_pad, y_train_enc, validation_data=(X_val_pad, y_val_enc), epochs=1)  # just to initialize
model.summary()

[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 51ms/step - accuracy: 0.1620 - loss: 2.1988 - val_accuracy: 0.3191 - val_loss: 1.8948


In [8]:
model.fit(
    X_train_pad, y_train_enc,
    validation_data=(X_val_pad, y_val_enc),
    epochs=30,          
    batch_size=32
)

Epoch 1/30
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 51ms/step - accuracy: 0.3608 - loss: 1.7673 - val_accuracy: 0.3301 - val_loss: 1.8576
Epoch 2/30
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 53ms/step - accuracy: 0.4266 - loss: 1.6054 - val_accuracy: 0.3282 - val_loss: 1.8761
Epoch 3/30
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 56ms/step - accuracy: 0.4724 - loss: 1.4878 - val_accuracy: 0.3270 - val_loss: 1.9420
Epoch 4/30
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 57ms/step - accuracy: 0.5116 - loss: 1.3875 - val_accuracy: 0.3130 - val_loss: 2.0201
Epoch 5/30
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 60ms/step - accuracy: 0.5458 - loss: 1.2896 - val_accuracy: 0.3101 - val_loss: 2.1818
Epoch 6/30
[1m1350/1350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 62ms/step - accuracy: 0.5707 - loss: 1.1993 - val_accuracy: 0.2963 - val_loss: 2.2645
Epoc

<keras.src.callbacks.history.History at 0x1e3cce92390>

In [None]:
#save model
os.makedirs("../models", exist_ok=True)
model.save("../models/keras_genre_model.h5")

