## Follow the similar logic as Movie Classifer Demo done in the class.

- Tokenize text using spacy.
- Download the Word2Vec Model
- Vectorize all words in each review.
- Calculate mean vector of the reviews
- Train a Neural Network for classification
- Test the trained neural network with few examples.

In [1]:
import pandas as pd
df = pd.read_csv('Data/data.csv')
df.head()

Unnamed: 0,genre,description
0,horror,When six friends fly off on a weekend getaway...
1,horror,The story is about a young girl who was touch...
2,romance,A young woman named Anna has always longed fo...
3,horror,A London couple moves to a large country hous...
4,horror,"In a small college in North Carolina, only a ..."


In [2]:
import string

def clean_text(text):
    # Convert to lowercase and remove punctuation
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    return text
df['clean_description'] = df['description'].apply(clean_text)
df.head()

Unnamed: 0,genre,description,clean_description
0,horror,When six friends fly off on a weekend getaway...,when six friends fly off on a weekend getaway...
1,horror,The story is about a young girl who was touch...,the story is about a young girl who was touch...
2,romance,A young woman named Anna has always longed fo...,a young woman named anna has always longed fo...
3,horror,A London couple moves to a large country hous...,a london couple moves to a large country hous...
4,horror,"In a small college in North Carolina, only a ...",in a small college in north carolina only a s...


In [3]:
genre_counts = df['genre'].value_counts()
print("Genre counts:")
print(genre_counts)

Genre counts:
genre
horror      672
romance     672
Name: count, dtype: int64


In [4]:
import spacy

nlp = spacy.load('en_core_web_sm')
def spacy_tokenizer(text):
    doc = nlp(clean_text(text))
    # Remove stop words and keep only meaningful tokens
    tokens = [token.text for token in doc if not token.is_stop and not token.is_space and len(token.text) > 1]
    return tokens

# Apply tokenizer to the 'cleaned_description' column
df['tokenized_text'] = df['clean_description'].apply(spacy_tokenizer)

df.head()




Unnamed: 0,genre,description,clean_description,tokenized_text
0,horror,When six friends fly off on a weekend getaway...,when six friends fly off on a weekend getaway...,"[friends, fly, weekend, getaway, suddenly, pla..."
1,horror,The story is about a young girl who was touch...,the story is about a young girl who was touch...,"[story, young, girl, touch, spirit, caused, de..."
2,romance,A young woman named Anna has always longed fo...,a young woman named anna has always longed fo...,"[young, woman, named, anna, longed, love, fail..."
3,horror,A London couple moves to a large country hous...,a london couple moves to a large country hous...,"[london, couple, moves, large, country, house,..."
4,horror,"In a small college in North Carolina, only a ...",in a small college in north carolina only a s...,"[small, college, north, carolina, select, stud..."


In [5]:
type(df['tokenized_text'].tolist())

list

In [6]:
import torch
import torch.nn as nn
import numpy as np
from gensim.models import Word2Vec
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from sklearn.preprocessing import LabelEncoder

vector_size = 100  # Define embedding size

# Ensure tokenized_text column is a list of lists
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: x if isinstance(x, list) else [])

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=df['tokenized_text'].tolist(), vector_size=vector_size, window=5, min_count=1, workers=4)

# Convert tokens to vectors
review_word_vectors = []
sequence_lengths = []

for tokens in df['tokenized_text']:
    vectors = []
    for token in tokens:
        if token in word2vec_model.wv:
            vectors.append(word2vec_model.wv[token])
        else:
            vectors.append(np.zeros(vector_size))
    
    if not vectors:  # Handle empty sequences
        vectors.append(np.zeros(vector_size)) 
    
    sequence_lengths.append(len(vectors))
    review_word_vectors.append(torch.FloatTensor(vectors))

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['genre'])

# Pad sequences in batch
X = pad_sequence(review_word_vectors, batch_first=True)
y = torch.LongTensor(encoded_labels)

print("Number of unique classes:", len(label_encoder.classes_))
print("Classes:", label_encoder.classes_)


  review_word_vectors.append(torch.FloatTensor(vectors))


Number of unique classes: 2
Classes: [' horror ' ' romance ']


In [7]:
class ImprovedTextClassifier(nn.Module):
    def __init__(self, input_size=100, hidden_size=128, num_classes=2):
        super(ImprovedTextClassifier, self).__init__()
        
        # Add dropout for regularization
        self.dropout = nn.Dropout(0.3)
        
        # Bi-directional LSTM layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=2, 
                           batch_first=True, bidirectional=True, dropout=0.2)
        
        # Attention mechanism
        self.attention = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
            nn.Softmax(dim=1)
        )
        
        # Output layers with batch normalization
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x, lengths):
        # Apply initial dropout
        x = self.dropout(x)
        
        # Pack padded sequence for LSTM
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        
        # Apply attention
        attention_weights = self.attention(output)
        context = torch.sum(attention_weights * output, dim=1)
        
        # Apply batch normalization and final layers
        context = self.bn(context)
        x = torch.relu(self.fc1(context))
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

In [8]:
from sklearn.model_selection import train_test_split

train_idx, temp_idx = train_test_split(range(len(y)), test_size=0.3, random_state=42, stratify=y)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42, stratify=y[temp_idx])

# Split data
X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
X_test, y_test = X[test_idx], y[test_idx]

train_lengths = [sequence_lengths[i] for i in train_idx]
val_lengths = [sequence_lengths[i] for i in val_idx]
test_lengths = [sequence_lengths[i] for i in test_idx]


In [9]:
import torch.optim.lr_scheduler as lr_scheduler  # Add this import

# Initialize model and training components
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ImprovedTextClassifier(input_size=vector_size, hidden_size=128, num_classes=2).to(device)

# Simple cross entropy loss without weights
criterion = nn.CrossEntropyLoss()

# Optimizer with weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

# Learning rate scheduler - fixed import
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# Training batch function
def train_batch(X_batch, y_batch, lengths_batch):
    model.train()
    optimizer.zero_grad()
    
    X_batch = X_batch.to(device)
    y_batch = y_batch.to(device)
    
    outputs = model(X_batch, lengths_batch)
    loss = criterion(outputs, y_batch)
    
    loss.backward()
    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    
    return loss.item()

# Evaluation function
def evaluate(X_data, y_data, lengths_data):
    model.eval()
    total_loss = 0
    predictions = []
    
    with torch.no_grad():
        for i in range(0, len(X_data), batch_size):
            batch_X = X_data[i:i+batch_size].to(device)
            batch_y = y_data[i:i+batch_size].to(device)
            batch_lengths = lengths_data[i:i+batch_size]
            
            outputs = model(batch_X, batch_lengths)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy())
    
    return total_loss / (len(X_data) // batch_size + 1), np.array(predictions)

# Training parameters
num_epochs = 30
batch_size = 32
best_val_loss = float('inf')
patience = 5
patience_counter = 0

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    # Shuffle training data
    indices = torch.randperm(len(X_train))
    
    for i in range(0, len(X_train), batch_size):
        batch_indices = indices[i:i+batch_size]
        batch_X = X_train[batch_indices]
        batch_y = y_train[batch_indices]
        batch_lengths = [train_lengths[j] for j in batch_indices]
        
        loss = train_batch(batch_X, batch_y, batch_lengths)
        total_loss += loss
    
    # Validation phase
    val_loss, val_predictions = evaluate(X_val, y_val, val_lengths)
    val_accuracy = (val_predictions == y_val.numpy()).mean()
    
    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Training Loss: {total_loss/(len(X_train)//batch_size):.4f}')
    print(f'Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}')
    
    # Learning rate scheduling
    scheduler.step(val_loss)
    
    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Save best model
        torch.save(model.state_dict(), 'best_model.pth', _use_new_zipfile_serialization=True)

    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping triggered')
            break

# Load best model for testing
model.load_state_dict(torch.load('best_model.pth', weights_only=True, map_location=device))

# Final evaluation on test set
test_loss, test_predictions = evaluate(X_test, y_test, test_lengths)
test_accuracy = (test_predictions == y_test.numpy()).mean()

print(f'\nTest Results:')
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

# Print detailed classification report
from sklearn.metrics import classification_report
print('\nClassification Report:')
print(classification_report(y_test.numpy(), test_predictions, 
                          target_names=label_encoder.classes_))

Epoch 1/30:
Training Loss: 0.7388
Validation Loss: 0.6935, Accuracy: 0.5000
Epoch 2/30:
Training Loss: 0.7123
Validation Loss: 0.6903, Accuracy: 0.5248
Epoch 3/30:
Training Loss: 0.6627
Validation Loss: 0.5966, Accuracy: 0.7327
Epoch 4/30:
Training Loss: 0.6181
Validation Loss: 0.5984, Accuracy: 0.7277
Epoch 5/30:
Training Loss: 0.6161
Validation Loss: 0.5499, Accuracy: 0.7772
Epoch 6/30:
Training Loss: 0.6021
Validation Loss: 0.6196, Accuracy: 0.7426
Epoch 7/30:
Training Loss: 0.6047
Validation Loss: 0.5533, Accuracy: 0.7624
Epoch 8/30:
Training Loss: 0.5897
Validation Loss: 0.4999, Accuracy: 0.7772
Epoch 9/30:
Training Loss: 0.5721
Validation Loss: 0.5176, Accuracy: 0.7871
Epoch 10/30:
Training Loss: 0.5751
Validation Loss: 0.5697, Accuracy: 0.7871
Epoch 11/30:
Training Loss: 0.5682
Validation Loss: 0.5438, Accuracy: 0.7871
Epoch 12/30:
Training Loss: 0.5529
Validation Loss: 0.7104, Accuracy: 0.6931
Epoch 13/30:
Training Loss: 0.5661
Validation Loss: 0.5106, Accuracy: 0.7723
Early st

In [12]:
def predict_example(text, model, word2vec_model, device):
    # Process text with spaCy
    doc = nlp(text)
    token_list = [token.text for token in doc if not token.is_space]
    
    # Vectorize each word
    vectors = []
    for token in token_list:
        if token in word2vec_model.wv:
            vectors.append(word2vec_model.wv[token])
        else:
            # Use mean vector instead of zeros for unknown words
            vectors.append(word2vec_model.wv.vectors.mean(axis=0))
    
    if not vectors:
        vectors.append(np.zeros(vector_size))
    
    # Convert to tensor and move to device
    input_tensor = torch.FloatTensor(vectors).unsqueeze(0).to(device)
    sequence_length = [len(vectors)]
    
    # Model prediction
    model.eval()
    with torch.no_grad():
        output = model(input_tensor, sequence_length)
        probabilities = torch.softmax(output, dim=1)
        confidence, predicted = torch.max(probabilities, 1)
        
        # Get prediction and confidence
        predicted_genre = label_encoder.inverse_transform(predicted.cpu().numpy())
        confidence_score = confidence.item()
    
    return predicted_genre[0], confidence_score

In [13]:
horror_text1 = "In a remote mountain cabin, Sarah discovers an ancient diary that speaks of a malevolent entity lurking in the surrounding woods. As winter storms trap her inside, she begins experiencing terrifying visions and mysterious scratching sounds from within the walls. Each night, the scratching gets closer to her bedroom. The diary reveals a dark history of disappearances spanning centuries, all occurring during the winter solstice - which is just days away. Sarah notices shadowy figures in her peripheral vision and finds strange symbols carved into the cabin's foundation. The local townspeople refuse to speak about the cabin's history, crossing themselves whenever it's mentioned. As the solstice approaches, Sarah uncovers the horrifying truth: the cabin itself is a gateway, and something ancient and hungry is preparing to cross through. With no escape possible, she must unravel the cabin's secrets before she becomes its next victim."
horror_text2 = "Deep beneath the city, a team of urban explorers stumbles upon a network of forgotten Victorian-era tunnels. The discovery seems like an urban explorer's dream until they find disturbing evidence of recent activity - fresh scratch marks on the walls, discarded modern clothing covered in dark stains, and strange symbols painted in what appears to be blood. Their helmet cameras capture glimpses of something moving in the darkness, something that doesn't look entirely human. The deeper they venture, the more they realize they're not alone. The tunnels seem to shift and change behind them, cutting off their escape routes. Their lights begin to fail one by one, and the air grows thick with the scent of decay. The team's excitement turns to terror as they realize they've awakened something that has been waiting in the darkness for over a century. The scratching sounds growing closer are just the beginning."
romance_text1 = "Emma returns to her coastal hometown to sell her late grandmother's flower shop, expecting to stay only a few weeks. But when she runs into her high school sweetheart, Oliver, now a marine biologist studying local tide pools, old feelings begin to resurface. As they work together to save the shop's historic garden from being demolished for a new development, they rediscover their shared love of nature and each other. During late nights restoring the garden's century-old greenhouse, they slowly open up about their paths not taken and dreams deferred. Oliver shows Emma the magic of bioluminescent waves and hidden tidal caves, while she helps him see the beauty in putting down roots. With the garden's spring fundraiser approaching, they must decide if their rekindled connection is strong enough to withstand the forces pulling them in different directions."
romance_text2 = "Chef Sofia's carefully ordered life in Manhattan is thrown into chaos when a charming food critic, James, gives her innovative restaurant a mixed review. Determined to prove him wrong, she challenges him to spend a week cooking alongside her in the kitchen. As they work elbow to elbow creating new dishes, their initial antagonism gives way to mutual respect and attraction. James helps Sofia rediscover the joy of cooking without pretense, while she shows him the passion and artistry behind every dish she creates. During long nights perfecting recipes and early morning visits to farmers' markets, they find themselves sharing more than just cooking techniques. But when a prestigious opportunity in Paris threatens to separate them, they must decide what matters most: their careers or their growing love for each other."

print("Predicted genre for horror_text1:", predict_example(horror_text1, model, word2vec_model, device))
print("Predicted genre for horror_text2:", predict_example(horror_text2, model, word2vec_model, device))
print("Predicted genre for romance_text1:", predict_example(romance_text1, model, word2vec_model, device))
print("Predicted genre for romance_text2:", predict_example(romance_text2, model, word2vec_model, device))

Predicted genre for horror_text1: (' horror ', 0.6898594498634338)
Predicted genre for horror_text2: (' horror ', 0.6314055323600769)
Predicted genre for romance_text1: (' romance ', 0.9601119756698608)
Predicted genre for romance_text2: (' romance ', 0.9418725371360779)


In [14]:
import joblib
import os

def save_model_components(model, word2vec_model, label_encoder, save_dir='model_files'):
    
    # Create directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Save the PyTorch model
    torch.save(model.state_dict(), 
              os.path.join(save_dir, 'model.pth'), 
              _use_new_zipfile_serialization=True)
    
    # Save the Word2Vec model
    word2vec_model.save(os.path.join(save_dir, 'word2vec.model'))
    
    # Save the label encoder
    joblib.dump(label_encoder, os.path.join(save_dir, 'label_encoder.joblib'))
    
    # Save model configuration (if needed)
    model_config = {
        'input_size': vector_size,
        'hidden_size': 128,
        'num_classes': 2
    }
    joblib.dump(model_config, os.path.join(save_dir, 'model_config.joblib'))

# Call this after training
save_model_components(model, word2vec_model, label_encoder)