In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load data
df1 = pd.read_csv("./assignment_1.4.csv")



In [3]:
df1.head()

Unnamed: 0,genre,description
0,horror,When six friends fly off on a weekend getaway...
1,horror,The story is about a young girl who was touch...
2,romance,A young woman named Anna has always longed fo...
3,horror,A London couple moves to a large country hous...
4,horror,"In a small college in North Carolina, only a ..."


In [4]:
# Strip leading and trailing spaces from the genre column
df1['genre'] = df1['genre'].str.strip()

# Map genre to numerical values
df1['genre_numerical'] = df1.genre.map({'horror': 1, 'romance': 0})

# Display the first few rows of the DataFrame to verify the mapping
df1.head()


Unnamed: 0,genre,description,genre_numerical
0,horror,When six friends fly off on a weekend getaway...,1
1,horror,The story is about a young girl who was touch...,1
2,romance,A young woman named Anna has always longed fo...,0
3,horror,A London couple moves to a large country hous...,1
4,horror,"In a small college in North Carolina, only a ...",1


In [5]:

# Load the Word2Vec model
repo_id = "NathaNn1111/word2vec-google-news-negative-300-bin"
filename = "GoogleNews-vectors-negative300.bin"
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)


In [7]:

# Load spacy model
nlp = spacy.load("en_core_web_lg")


In [8]:

# Function to create mean vector for a description
def description_to_vector(description):
    tokens = [token.text.lower() for token in nlp(description) if token.is_alpha]
    vectors = [word2vec[word] for word in tokens if word in word2vec]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec.vector_size)


In [9]:

# Generate mean vectors for all descriptions
tqdm.pandas()
df1['vector'] = df1['description'].progress_apply(description_to_vector)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1344/1344 [00:39<00:00, 34.23it/s]


In [10]:

# Prepare data for training
X = np.stack(df1['vector'].values)
y = df1['genre_numerical'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=424)

# Build a simple neural network
def create_model():
    model_new = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model_new.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model_new

model_new = create_model()

# Use early stopping to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with validation set
history = model_new.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
y_pred = (model_new.predict(X_test) > 0.5).astype(int).flatten()
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/500
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.5515 - loss: 0.6857 - val_accuracy: 0.8651 - val_loss: 0.6425
Epoch 2/500
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7537 - loss: 0.6317 - val_accuracy: 0.9209 - val_loss: 0.5218
Epoch 3/500
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.8390 - loss: 0.5114 - val_accuracy: 0.9116 - val_loss: 0.3674
Epoch 4/500
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9097 - loss: 0.3610 - val_accuracy: 0.9209 - val_loss: 0.2585
Epoch 5/500
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9203 - loss: 0.2575 - val_accuracy: 0.9209 - val_loss: 0.2376
Epoch 6/500
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9167 - loss: 0.2232 - val_accuracy: 0.9302 - val_loss: 0.2036
Epoch 7/500
[1m27/27[0m [32m━━

In [11]:

# Function to preprocess and predict genre for a new description
def predict_genre(description, model_new, word2vec, nlp):
    # Tokenize and create a mean vector for the description
    tokens = [token.text.lower() for token in nlp(description) if token.is_alpha]
    vectors = [word2vec[word] for word in tokens if word in word2vec]
    if vectors:
        mean_vector = np.mean(vectors, axis=0)
    else:
        mean_vector = np.zeros(word2vec.vector_size)
    
    # Predict genre
    prediction = model_new.predict(mean_vector.reshape(1, -1))[0][0]
    genre = "horror" if prediction > 0.5 else "romance"
    confidence = prediction if genre == "horror" else 1 - prediction
    return genre, confidence

# Example reviews for inference
example_description = [
    "A couple’s dream home hides a ghostly bride who longs to experience love once more, pulling them into her unfinished story.",
    "A woman falls for a charming man, only to discover he harbors a dark secret—his love for her might be the only thing keeping his curse at bay.",
    "Newlyweds take shelter in a forgotten mansion, where a lonely ghost tries to relive her love through them. But will she ever let them go?",
    "A grieving woman begins receiving love letters from her late husband, leading her to a bittersweet choice—hold onto his spirit or move on.",
    "A musician falls for a mysterious woman who only appears at night, their love growing stronger even as he senses she’s slipping away.",
    "A man is drawn to a mesmerizing woman, unaware that loving her means embracing an eternal, intoxicating darkness."
]

# Run inference on example reviews
for description in example_description:
    genre, confidence = predict_genre(description, model_new, word2vec, nlp)
    print(f"description: {description}\nPredicted genre: {genre} (Confidence: {confidence:.2f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
description: A couple’s dream home hides a ghostly bride who longs to experience love once more, pulling them into her unfinished story.
Predicted genre: horror (Confidence: 0.69)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
description: A woman falls for a charming man, only to discover he harbors a dark secret—his love for her might be the only thing keeping his curse at bay.
Predicted genre: horror (Confidence: 0.57)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
description: Newlyweds take shelter in a forgotten mansion, where a lonely ghost tries to relive her love through them. But will she ever let them go?
Predicted genre: horror (Confidence: 0.71)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
description: A grieving woman begins receiving love letters from her late husband, leading her to a bittersweet choice—hold onto his spi

In [12]:
model_new.save('C:/Users/saima/OneDrive/Desktop/Genre_Prediction/prediction.keras')

In [13]:
import pickle

# Save Word2Vec
word2vec.save("word2vec.model")

# Save Tokenizer
nlp.to_disk("spacy_model")
