# Import data

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
import re

# Download the NLTK packages this notebook relies on (a no-op when they are
# already up to date locally).
for nltk_package in ('punkt_tab', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_package)



# Only the title and the overview are used in this notebook, so ask pandas to
# parse just those two columns instead of loading the whole metadata file and
# discarding the rest afterwards.
TEXT_COLUMNS = ["original_title", "overview"]
df = pd.read_csv("movies_metadata.csv", usecols=TEXT_COLUMNS, low_memory=False)
# `usecols` does not control column order, so select explicitly; then drop every
# row where the title or the overview is missing.
df = df[TEXT_COLUMNS].dropna()
df.head()


[nltk_data] Downloading package punkt_tab to /home/sara/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,original_title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


# BoW

In [6]:
# NLTK's English stop-word list, held as a set.
stop_words = {*stopwords.words('english')}

class StemTokenizer:
    """Callable that lower-cases, cleans, tokenizes and stems a document.

    An instance can be passed as the ``tokenizer`` argument of a scikit-learn
    text vectorizer: calling it with a string returns a list of stemmed tokens.
    """

    # Tokens discarded after tokenization.
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']

    # Matches every character that is not a lowercase ASCII letter, an
    # apostrophe or a space. Compiled once instead of on every call.
    _non_word_chars = re.compile("[^a-z' ]")

    def __init__(self):
        self.stemmer = SnowballStemmer('english')

    def __call__(self, doc):
        """Return the stemmed tokens of ``doc``.

        Args:
            doc: the text to tokenize.

        Returns:
            A list of stemmed tokens, excluding anything in ``ignore_tokens``.
        """
        # Replace unwanted characters with a space rather than deleting them:
        # deleting fuses neighbouring words ("well-known" -> "wellknown",
        # "end.\nNext" -> "endnext") and creates tokens that never existed.
        cleaned = self._non_word_chars.sub(" ", doc.lower())
        return [self.stemmer.stem(t) for t in word_tokenize(cleaned) if t not in self.ignore_tokens]

tokenizer = StemTokenizer()
# Run the stop words through the same tokenizer so they are in the same
# (stemmed) form as the tokens the vectorizers will see.
token_stop = tokenizer(' '.join(stop_words))

# token_pattern=None: a custom tokenizer is supplied, so the default token
# pattern is never used; passing None avoids scikit-learn's "token_pattern will
# not be used" warning.
vectorizer = CountVectorizer(
    stop_words=token_stop,
    max_features=500,
    ngram_range=(1, 2),
    tokenizer=tokenizer,
    token_pattern=None,
)
bow_matrix_count = vectorizer.fit_transform(df['overview'])
# Densify the sparse matrix once and convert it to one list per document,
# instead of converting row by row in a Python loop.
df['bow_embedding_count'] = bow_matrix_count.toarray().tolist()


tfidf = TfidfVectorizer(
    stop_words=token_stop,
    max_features=500,
    ngram_range=(1, 2),
    tokenizer=tokenizer,
    token_pattern=None,
)
bow_matrix_tfidf = tfidf.fit_transform(df['overview'])
df['bow_embedding_tfidf'] = bow_matrix_tfidf.toarray().tolist()



# GloVe word embeddings

In [4]:
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip > /dev/null 2>&1

--2025-05-27 12:39:35--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-27 12:39:37--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-05-27 12:42:23 (4,97 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [7]:
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Shared resources read by better_tokenizer below.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def better_tokenizer(text):
    """Tokenize ``text`` into lemmatized, purely alphabetic, non-stop-word tokens.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that appear in ``stop_words`` are discarded, and the
    survivors are lemmatized.

    Args:
        text: the text to tokenize; anything that is not a ``str`` is accepted
            and treated as empty.

    Returns:
        A list of lemmatized tokens (empty for non-string input).
    """
    if isinstance(text, str):
        return [
            lemmatizer.lemmatize(word)
            for word in nltk.word_tokenize(text.lower())
            if word.isalpha() and word not in stop_words
        ]
    return []


def _parses_as_float(text):
    """Return True when ``text`` can be converted with ``float()``."""
    try:
        float(text)
    except ValueError:
        return False
    return True


def load_glove_embeddings(path, vocab, embedding_dim):
    """Read a GloVe-style text file and build an embedding matrix for ``vocab``.

    Each non-blank line is expected to hold a token followed by
    ``embedding_dim`` whitespace-separated numbers. The vector is taken from
    the *last* ``embedding_dim`` fields, so tokens that themselves contain
    spaces are read correctly instead of breaking the float conversion.

    Args:
        path: path of the UTF-8 text file to read.
        vocab: mapping ``token -> row index`` into the returned matrix.
        embedding_dim: number of components in every vector.

    Returns:
        A tuple ``(glove_embeddings, weights_matrix)``:
        ``glove_embeddings`` maps every token in the file to a float32 tensor;
        ``weights_matrix`` is a ``(len(vocab), embedding_dim)`` tensor whose
        rows are copied from the file for tokens found there, while the other
        rows keep their random normal initialisation (unseeded here).

    Raises:
        ValueError: if ``embedding_dim`` is not positive, if a non-blank line
            has too few fields, or if the first entry shows that the file's
            vectors are longer than ``embedding_dim``.
    """
    if embedding_dim < 1:
        raise ValueError(f"embedding_dim must be positive, got {embedding_dim}")

    glove_embeddings = {}
    with open(path, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            if len(fields) <= embedding_dim:
                raise ValueError(
                    f"{path}, line {line_no}: expected a token followed by "
                    f"{embedding_dim} values, found {len(fields)} field(s)"
                )
            word_parts = fields[:-embedding_dim]
            # Sanity check on the first entry only: if everything after the
            # first field is numeric, the file's vectors are longer than
            # embedding_dim and the "token" would silently swallow numbers.
            if (
                not glove_embeddings
                and len(word_parts) > 1
                and all(_parses_as_float(part) for part in word_parts[1:])
            ):
                raise ValueError(
                    f"{path}, line {line_no}: vectors appear to have "
                    f"{len(fields) - 1} components, not embedding_dim={embedding_dim}"
                )
            word = ' '.join(word_parts)
            vector = torch.tensor(
                [float(val) for val in fields[-embedding_dim:]], dtype=torch.float32
            )
            glove_embeddings[word] = vector

    weights_matrix = torch.randn(len(vocab), embedding_dim)
    for word, idx in vocab.items():
        if word in glove_embeddings:
            weights_matrix[idx] = glove_embeddings[word]
    return glove_embeddings, weights_matrix

# Tokenize every overview once; the result is reused for the vocabulary and
# for the per-document embeddings.
tokenized_overviews = df['overview'].dropna().map(better_tokenizer)

# Corpus-wide token frequencies.
counter = Counter(token for doc_tokens in tokenized_overviews for token in doc_tokens)

# Vocabulary: the special token first, then every token seen at least twice.
specials = ["<unk>"]
itos = list(specials)
itos.extend(word for word, freq in counter.items() if freq >= 2)
stoi = dict(zip(itos, range(len(itos))))

# Load the 100-dimensional GloVe vectors; the weight matrix is not needed here.
embedding_dim = 100
glove_path = "glove.6B.100d.txt"
glove_embeddings, _ = load_glove_embeddings(glove_path, stoi, embedding_dim)

def get_glove_embedding(text_tokens):
    """Average the GloVe vectors of the tokens that have one.

    Args:
        text_tokens: iterable of tokens for a single document.

    Returns:
        The element-wise mean of the vectors found in ``glove_embeddings``, or
        a zero tensor of length ``embedding_dim`` when no token is found.
    """
    known_vectors = []
    for token in text_tokens:
        if token in glove_embeddings:
            known_vectors.append(glove_embeddings[token])
    if known_vectors:
        return torch.stack(known_vectors).mean(dim=0)
    return torch.zeros(embedding_dim)

# One averaged GloVe vector per overview, stored as a plain Python list.
embeddings = [
    get_glove_embedding(doc_tokens).tolist()
    for doc_tokens in tqdm(tokenized_overviews, desc="Computing GloVe embeddings")
]

df['glove_embedding'] = embeddings


  return torch._C._cuda_getDeviceCount() > 0
Computing GloVe embeddings: 100%|██████████| 44512/44512 [00:00<00:00, 60898.51it/s]


In [20]:
# df.head()

# DistilBERT

In [8]:
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained tokenizer and model; the model is moved to the chosen device and
# switched to evaluation mode.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device).eval()

def get_distilbert_embedding(text, tokenizer, model, device):
    """Embed one text as the mean of the model's last-layer token states.

    Args:
        text: the text to embed; non-string or blank input yields a zero vector.
        tokenizer: callable turning the text into a dict of tensors
            (called with ``return_tensors="pt"`` and truncation to 512 tokens).
        model: callable taking those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        device: device the input tensors are moved to before the forward pass.

    Returns:
        A NumPy array: the mean over the sequence axis of
        ``last_hidden_state`` with the leading axis squeezed out, or a float32
        zero vector for non-string / blank input.
    """
    if not isinstance(text, str) or not text.strip():
        # Size the fallback from the model's config when it exposes one,
        # keeping 768 as the default. float32 is used so the placeholder does
        # not introduce float64 rows next to the model's outputs
        # (assumes the model produces float32 - TODO confirm for other models).
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        return np.zeros(hidden_size, dtype=np.float32)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Average over the sequence axis, then drop the leading axis
    # (assumed to be a batch of size 1 for a single text).
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return embedding.cpu().numpy()

# Register tqdm's progress_apply on pandas objects, then embed every overview.
tqdm.pandas(desc="Computing DistilBERT embeddings")
df['distilbert_embedding'] = df['overview'].progress_apply(
    get_distilbert_embedding, args=(tokenizer, model, device)
)

Computing DistilBERT embeddings: 100%|██████████| 44512/44512 [21:36<00:00, 34.33it/s]


# Save

In [9]:
# Preview the first rows of the frame with the embedding columns attached.
df.head(n=5)

Unnamed: 0,original_title,overview,bow_embedding_count,bow_embedding_tfidf,glove_embedding,distilbert_embedding
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.020005550235509872, 0.21014796197414398, 0...","[-0.25018153, 0.21354215, 0.24297297, -0.03745..."
1,Jumanji,When siblings Judy and Peter discover an encha...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.031650930643081665, 0.23956623673439026, 0....","[-0.23026538, 0.1257016, 0.22299434, -0.009155..."
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.07700946927070618, 0.17933276295661926, 0....","[-0.074448496, -0.116385005, 0.2242134, -0.036..."
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.08693083375692368, 0.11770206689834595, 0.2...","[0.12376749, 0.08408527, 0.4186853, 0.09127326..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.05611945316195488, 0.0037858595605939627, 0...","[-0.1709905, -0.09648897, 0.42880845, 0.032393..."


In [10]:
# df.to_csv("embeddings.csv", index=False)