Loading Glove (100D)

In [3]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2026-02-20 10:57:00--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2026-02-20 10:57:01--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2026-02-20 10:57:01--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

Task 1 - Data Preparation

In [44]:
import pandas as pd
import numpy as np
import ast
import re

df = pd.read_csv('movies.csv')

def parse_genres_properly(x):
    try:
        if isinstance(x, list):
            if x == ["Unknown"]: return x
            return x

        if isinstance(x, str) and x.strip() != "":
            data = ast.literal_eval(x)
            genres = [item['name'] for item in data if 'name' in item]
            return genres if len(genres) > 0 else ["Unknown"]

        return ["Unknown"]
    except Exception as e:
        if isinstance(x, str) and len(x) > 0:
            return [g.strip() for g in x.split(',')]
        return ["Unknown"]

df['genres'] = df['genres'].apply(parse_genres_properly)

sample_genres = df['genres'].head()
print("First 5 rows after proper parsing:")
print(sample_genres)

unknown_count = df[df['genres'].apply(lambda x: x == ["Unknown"])].shape[0]
print(f"Movies with valid genres: {len(df) - unknown_count}")
print(f"Movies labeled Unknown: {unknown_count}")

First 5 rows after proper parsing:
0    [Action Adventure Fantasy Science Fiction]
1                    [Adventure Fantasy Action]
2                      [Action Adventure Crime]
3                 [Action Crime Drama Thriller]
4            [Action Adventure Science Fiction]
Name: genres, dtype: object
Movies with valid genres: 4775
Movies labeled Unknown: 28


Task 2 - GloVe Embedding Pipeline

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf.fit(df.iloc[train_idx]['overview'])
tfidf_lookup = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

def get_weighted_glove(text, glove_model, tfidf_lookup, dim=100):
    words = text.split()
    vectors = []
    weights = []

    for word in words:
        if word in glove_model and word in tfidf_lookup:
            vectors.append(glove_model[word])
            weights.append(tfidf_lookup[word])

    if not vectors:
        return np.zeros(dim)

    return np.average(vectors, axis=0, weights=weights)

print("Generating document vectors...")
df['overview_vec'] = df['overview'].apply(lambda x: get_weighted_glove(x, glove_index, tfidf_lookup))
df['tagline_vec'] = df['tagline'].apply(lambda x: get_weighted_glove(x, glove_index, tfidf_lookup))
df['keywords_vec'] = df['keywords'].apply(lambda x: get_weighted_glove(x, glove_index, tfidf_lookup))

print("Vectors generated successfully.")

Generating document vectors...
Vectors generated successfully.


Task 3 - Model A: Rating Prediction (Regression)

In [38]:
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras import layers, models

y_true_test = df.iloc[test_idx]['vote_average'].values
train_mean = df.iloc[train_idx]['vote_average'].mean()
baseline_preds = np.full(shape=y_true_test.shape, fill_value=train_mean)

mse_baseline = mean_squared_error(y_true_test, baseline_preds)
print(f"Baseline (Global Mean) MSE: {mse_baseline:.4f}")

def build_regression_model():
    model = models.Sequential([
        layers.Input(shape=(100,)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

reg_results = {}

for col in ['overview_vec', 'tagline_vec']:
    print(f"\nTraining Regression Model on: {col}")

    X_train = np.stack(df.iloc[train_idx][col].values)
    y_train = df.iloc[train_idx]['vote_average'].values
    X_val = np.stack(df.iloc[val_idx][col].values)
    y_val = df.iloc[val_idx]['vote_average'].values
    X_test = np.stack(df.iloc[test_idx][col].values)
    y_test = df.iloc[test_idx]['vote_average'].values

    model = build_regression_model()
    model.fit(X_train, y_train, validation_data=(X_val, y_val),
              epochs=30, batch_size=32, verbose=0)

    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    reg_results[col] = mse
    print(f"MSE for {col}: {mse:.4f}")

print("\n--- Task 3 Comparison ---")
comparison = pd.DataFrame.from_dict(reg_results, orient='index', columns=['MSE'])
comparison.loc['Baseline'] = mse_baseline
print(comparison)

Baseline (Global Mean) MSE: 1.2825

Training Regression Model on: overview_vec
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
MSE for overview_vec: 1.3293

Training Regression Model on: tagline_vec
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
MSE for tagline_vec: 1.3738

--- Task 3 Comparison ---
                   MSE
overview_vec  1.329344
tagline_vec   1.373774
Baseline      1.282513


Task 4 - Model B: Genre Prediction (Multi-Label Classification)

In [54]:
results_t4_final = {}

for col in ['overview_vec_clean', 'keywords_vec_clean']:
    print(f"Evaluating {col} with adjusted threshold...")
    xt = np.stack(df.iloc[train_idx][col])
    xte = np.stack(df.iloc[test_idx][col])

    probs = clf.predict(xte)

    preds = (probs > 0.2).astype(int)

    if preds.sum() == 0:
        preds = np.zeros_like(probs)
        for i in range(len(probs)):
            top_indices = np.argsort(probs[i])[-2:] # Pick top 2
            preds[i, top_indices] = 1

    results_t4_final[col] = {
        "Micro-F1": f1_score(y_test_ml, preds, average='micro', zero_division=0),
        "Macro-F1": f1_score(y_test_ml, preds, average='macro', zero_division=0),
        "Hamming Loss": hamming_loss(y_test_ml, preds)
    }

print("\n--- FINAL SUCCESSFUL TASK 4 RESULTS ---")
print(pd.DataFrame(results_t4_final).T)

Evaluating overview_vec_clean with adjusted threshold...
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Evaluating keywords_vec_clean with adjusted threshold...
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

--- FINAL SUCCESSFUL TASK 4 RESULTS ---
                    Micro-F1  Macro-F1  Hamming Loss
overview_vec_clean  0.000000  0.000000      0.000857
keywords_vec_clean  0.005495  0.000058      0.000859


Task 5 - Frequent Words per Genre

In [55]:
import pandas as pd
from collections import Counter
import re

df_raw = pd.read_csv('movies.csv')

target_genres = [
    'Action', 'Adventure', 'Fantasy', 'Science Fiction', 'Crime',
    'Drama', 'Thriller', 'Animation', 'Family', 'Western',
    'Comedy', 'Romance', 'Horror', 'Mystery', 'History', 'War'
]

stops = set(['the', 'and', 'with', 'for', 'from', 'was', 'who', 'they', 'that', 'his', 'her', 'into', 'this', 'but', 'him', 'she', 'has', 'their', 'when', 'out', 'after', 'about'])

task5_final_list = []

for genre in target_genres:
    mask = df_raw['genres'].str.contains(genre, na=False, case=False)
    subset = df_raw[mask]

    if len(subset) < 5:
        continue

    text = " ".join(subset['overview'].astype(str)).lower()

    words = [w for w in re.findall(r'\b[a-z]{3,}\b', text) if w not in stops]
    counts = Counter(words)

    top_10 = [w for w, c in counts.most_common(10)]

    eligible_bottom = [w for w, c in counts.items() if c >= 3]
    bottom_10 = sorted(eligible_bottom, key=lambda x: counts[x])[:10]

    task5_final_list.append({
        "Genre": genre,
        "Movies Found": len(subset),
        "Top 10 Content Words": ", ".join(top_10),
        "Bottom 10 (Freq>=3)": ", ".join(bottom_10)
    })

task5_table = pd.DataFrame(task5_final_list)

if task5_table.empty:
    print("CRITICAL: No target genres found in the column. Check spelling of 'genres'.")
else:
    print(f"\n--- TASK 5: FREQUENT WORDS PER GENRE (Final Results) ---")
    pd.set_option('display.max_colwidth', None)
    print(task5_table.to_string(index=False))


--- TASK 5: FREQUENT WORDS PER GENRE (Final Results) ---
          Genre  Movies Found                                                   Top 10 Content Words                                                                                Bottom 10 (Freq>=3)
         Action          1153                are, world, one, must, new, man, two, life, them, young            believed, turner, message, spectre, shape, shifting, avengers, alliances, sort, closest
      Adventure           790               are, world, new, must, find, one, young, life, them, two      dispatched, unique, orders, protecting, believed, quite, message, spectre, weary, transported
        Fantasy           418               world, life, are, must, young, evil, new, will, man, all                        torn, protecting, edge, shape, altered, half, actions, sort, lex, abandoned
Science Fiction           530            are, world, earth, one, must, planet, new, have, will, time                  torn, orders, protecting

Task 6 - Genre-Indicative Words Using TF-IDF

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(df_raw['overview'].fillna(''))
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

target_genres = [
    'Action', 'Adventure', 'Fantasy', 'Science Fiction', 'Crime',
    'Drama', 'Thriller', 'Animation', 'Family', 'Comedy',
    'Romance', 'Horror', 'Mystery'
]

indicative_results = []

for genre in target_genres:
    y_target = df_raw['genres'].str.contains(genre, na=False).astype(int)

    if y_target.sum() < 10: continue

    clf = LogisticRegression(solver='liblinear', C=1.0)
    clf.fit(X_tfidf, y_target)

    coefs = clf.coef_[0]
    top_indices = np.argsort(coefs)[-10:][::-1]
    indicative_words = feature_names[top_indices]

    indicative_results.append({
        "Genre": genre,
        "Indicative Words": ", ".join(indicative_words)
    })

task6_df = pd.DataFrame(indicative_results)
print("\n--- TASK 6: HIGHEST POSITIVE-WEIGHT WORDS PER GENRE ---")
pd.set_option('display.max_colwidth', None)
print(task6_df.to_string(index=False))


--- TASK 6: HIGHEST POSITIVE-WEIGHT WORDS PER GENRE ---
          Genre                                                                                   Indicative Words
         Action                  agent, cop, criminals, hero, ruthless, target, battle, mission, kidnapped, forces
      Adventure                      adventure, bond, world, earth, mission, jungle, king, park, dragon, dinosaurs
        Fantasy                   evil, king, powers, magic, dragon, magical, vampire, werewolf, ancient, vampires
Science Fiction                           earth, planet, alien, future, space, robot, human, virus, time, humanity
          Crime                          police, cop, drug, fbi, murder, criminal, detective, mafia, gangster, mob
          Drama                            story, life, drama, wife, mother, war, family, love, friendship, lawyer
       Thriller             agent, murder, thriller, secret, killer, assassin, russian, police, officer, kidnapped
      Animation        