# Import the libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

# Step 1:  Load The Data

In [None]:
# Read the raw training lines; the file handle is only needed for the read itself.
with open('/content/drive/MyDrive/movie genere/Genre Classification Dataset/train_data.txt', 'r', encoding='utf-8') as train_file:
    train_lines_new = list(train_file)
# Each record is the stripped line split into its ' ::: '-separated fields.
train_data = [raw_line.strip().split(' ::: ') for raw_line in train_lines_new]

# Same procedure for the test file.
with open('/content/drive/MyDrive/movie genere/Genre Classification Dataset/test_data.txt', 'r', encoding='utf-8') as test_file:
    test_lines_new = list(test_file)
test_data = [raw_line.strip().split(' ::: ') for raw_line in test_lines_new]


In [None]:
# Load description.txt as a list of raw lines (line endings are kept as read).
with open('/content/drive/MyDrive/movie genere/Genre Classification Dataset/description.txt', 'r', encoding='utf-8') as desc_file:
    descriptions = list(desc_file)

Combine the train_data and test_data descriptions (followed by the lines read from description.txt) into a single list


In [None]:
# The description text is taken from field index 3 of each train record
# and from field index 2 of each test record.
train_descriptions = [record[3] for record in train_data]
test_descriptions = [record[2] for record in test_data]

# Order matters for later slicing: train first, then test, then the description.txt lines.
all_descriptions = train_descriptions + test_descriptions + descriptions

Combine train_data and test_data labels

In [None]:
# Train labels are read from field index 2 of each record; every test record
# contributes a None placeholder so the list lines up with train + test rows.
labels = [record[2] for record in train_data] + [None] * len(test_data)

# Step 2:  Data pre-processing

Tokenize text data

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Vocabulary cap handed to the tokenizer (the Embedding layer below uses the same 10000).
vocabulary_size = 10000
tokenizer = Tokenizer(num_words=vocabulary_size)

# Build the word index from every collected description.
tokenizer.fit_on_texts(all_descriptions)

Convert text data to sequences. Converting text to integer sequences gives the model a numerical representation of the text that preserves word order, supports fixed-length inputs (after padding), and enables efficient processing and feature extraction.

In [None]:
# Map every description to its list of integer word indices using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(texts=all_descriptions)

In [None]:
# Pad (or truncate) every sequence to one fixed length so they can be batched.
max_sequence_length = 100  # adjust this based on the data
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)
sequences = padded_sequences


Encode labels

In [None]:
# Fit the encoder on real genre labels only.
# `labels` also carries one None placeholder per test record. Passing those to
# LabelEncoder either raises a TypeError (mixed str/None input) or, depending on
# the scikit-learn version, registers None as an extra class -- which would add
# a spurious output unit to the classifier sized from `label_encoder.classes_`.
label_encoder = LabelEncoder()
known_labels = [label for label in labels if label is not None]

# `encoded_labels` now has exactly one entry per labelled (training) record,
# in the same order as the training records.
encoded_labels = label_encoder.fit_transform(known_labels)

Split sequences back into train and test parts

In [None]:
# The padded array is ordered train rows, then test rows, then description.txt rows;
# slice the first two groups back out by their record counts.
n_train = len(train_data)
n_test = len(test_data)
train_sequences = sequences[:n_train]
test_sequences = sequences[n_train:n_train + n_test]

# Step 3:  Model training

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable.
train_label_ids = encoded_labels[:len(train_data)]
X_train, X_val, y_train, y_val = train_test_split(
    train_sequences,
    train_label_ids,
    test_size=0.2,
    random_state=42,
)

Constructing a sequential model

In [None]:
# Embedding -> bidirectional LSTM -> global max pooling -> dense classifier head.
# The final layer has one softmax unit per class known to the label encoder.
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_sequence_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

Compiling the model

In [None]:
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Defining callbacks to save the best model (highest validation accuracy)

In [None]:
# Write the model to disk only when validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    'movie_genre_classifier_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]


Training the model

In [None]:
# Training hyper-parameters.
batch_size = 32
epochs = 10

# Train on the training split and evaluate on the validation split after every epoch;
# the checkpoint callback runs as part of `callbacks_list`.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
)

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.50890, saving model to movie_genre_classifier_model.h5
Epoch 2/10
  11/1356 [..............................] - ETA: 15s - loss: 1.6147 - accuracy: 0.5170

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.50890 to 0.55132, saving model to movie_genre_classifier_model.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.55132 to 0.55261, saving model to movie_genre_classifier_model.h5
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.55261
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.55261
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.55261
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.55261
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.55261
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.55261
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.55261


In [None]:
from tensorflow.keras.models import load_model

# Export the best checkpoint rather than the in-memory `model`.
# After `fit` finishes, `model` holds the weights of the LAST epoch, while the
# ModelCheckpoint callback (save_best_only=True, monitor='val_accuracy') kept the
# best-scoring weights in 'movie_genre_classifier_model.h5'. The training log shows
# validation accuracy stopped improving after epoch 3, so the last-epoch weights
# are not the best ones.
best_model = load_model('movie_genre_classifier_model.h5')
best_model.save('/content/drive/MyDrive/SAVED MODELS 1/moviegenere.h5')

NameError: ignored

In [None]:
from tensorflow.keras.models import load_model

# Step 4: Load the Trained Model and Make Predictions

In [None]:
# Restore the exported model from Drive for inference.
saved_model_path = '/content/drive/MyDrive/SAVED MODELS 1/moviegenere.h5'
loaded_model = load_model(saved_model_path)

In [None]:
# Sample movie summary used to sanity-check the loaded model.
movie_summary = [" In The Secret Garden, a young girl named Mary, who is orphaned and lonely, discovers a hidden, neglected garden on her uncles estate. As she works to revive the garden, she also begins to heal emotionally and make new friends. This heartwarming story explores themes of growth, friendship, and the transformative power of nature."]

# Apply the same preprocessing as for training: word indices from the fitted
# tokenizer, then padding to the training sequence length.
movie_summary_sequence = tokenizer.texts_to_sequences(movie_summary)
movie_summary_padded = pad_sequences(movie_summary_sequence, maxlen=max_sequence_length)

# Class scores for the single input.
predicted_label = loaded_model.predict(movie_summary_padded)

# Highest-scoring class index, mapped back to its genre name.
best_class_index = np.argmax(predicted_label)
predicted_genre = label_encoder.inverse_transform([best_class_index])
print(f"Predicted Genre: {predicted_genre[0]}")

Predicted Genre: short


In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model
loaded_model = load_model('/content/drive/MyDrive/SAVED MODELS 1/moviegenere.h5')

# Define a sample movie summary for testing
movie_summary = ["A young wizard learns magic and battles dark forces."]

# Tokenize and pad the movie summary.
# Reuse the `tokenizer` that was fitted on the training descriptions: the model's
# embedding rows are tied to that word index. Fitting a fresh Tokenizer on this one
# sentence would assign unrelated indices to its words (and overwrite the shared
# `tokenizer`), so the model would see meaningless input.
# `max_sequence_length` is likewise the value defined during preprocessing.
movie_summary_sequence = tokenizer.texts_to_sequences(movie_summary)
movie_summary_padded = pad_sequences(movie_summary_sequence, maxlen=max_sequence_length)

# Make predictions with the loaded model
predicted_label = loaded_model.predict(movie_summary_padded)

# Inverse transform the label to get the predicted genre
predicted_genre = label_encoder.inverse_transform([np.argmax(predicted_label)])

print(f"Predicted Genre: {predicted_genre[0]}")


Predicted Genre: short
