In [58]:
import torch

# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE  # last expression of the cell, so the notebook displays the selected device

device(type='cuda')

In [59]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

# Both objects are loaded from the same checkpoint name. The tokenizer is kept as a
# separate object so token counts can be taken without going through the model.
# NOTE(review): the model card for this checkpoint describes task prefixes
# ("search_query: " / "search_document: ") — confirm whether inputs passed to
# `encode` elsewhere in this notebook are expected to carry them.
_BERT_CHECKPOINT = "nomic-ai/modernbert-embed-base"

bert_tokenizer = AutoTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = SentenceTransformer(_BERT_CHECKPOINT, device=DEVICE)

In [60]:
import pandas as pd

file_path = "wiki_movie_plots_deduped.csv"
df = pd.read_csv(file_path)

# Keep only the rows whose 'Release Year' is 1980 or later. The original row
# labels are retained (the frame is not re-indexed).
released_since_1980 = df["Release Year"] >= 1980
filtered_df = df.loc[released_since_1980]
filtered_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
9670,1980,Airplane!,American,"Jim Abrahams, Jerry Zucker","Robert Hays, Julie Hagerty, Leslie Nielsen, Pe...",unknown,https://en.wikipedia.org/wiki/Airplane!,"As a parody film, Airplane! tells its story in..."
9671,1980,Alien Dead,American,Fred Olen Ray,Buster Crabbe,unknown,https://en.wikipedia.org/wiki/Alien_Dead,A meteor strikes a houseboat in the swamps nea...
9672,1980,Alligator,American,Lewis Teague,"Robert Forster, Robin Riker",unknown,https://en.wikipedia.org/wiki/Alligator_(film),A teenage girl purchases a baby American allig...
9673,1980,Altered States,American,Ken Russell,"William Hurt, Blair Brown",unknown,https://en.wikipedia.org/wiki/Altered_States,"Edward Jessup is an abnormal psychologist who,..."
9674,1980,American Gigolo,American,Paul Schrader,"Richard Gere, Lauren Hutton, Héctor Elizondo",unknown,https://en.wikipedia.org/wiki/American_Gigolo,"Julian Kaye is a male escort in Los Angeles, w..."


## 1A: Embed the first 10 plots

In [61]:
# Slice the first ten filtered rows once, then pull out the two columns of interest.
first_ten_rows = filtered_df.head(10)
plots_top10 = first_ten_rows["Plot"].tolist()
titles = first_ten_rows["Title"].tolist()

print("10 titles to test:")
titles

10 titles to test:


['Airplane!',
 'Alien Dead',
 'Alligator',
 'Altered States',
 'American Gigolo',
 'Animalympics',
 'Antropophagus[citation needed]',
 'Any Which Way You Can',
 'Atlantic City',
 'Death Watch']

In [62]:
# Encode the ten sample plots. `normalize_embeddings=False` leaves the vectors
# exactly as the model returns them (no L2 normalization at encode time).
bert_embeddings_top10 = bert_model.encode(
    plots_top10,
    show_progress_bar=True,
    normalize_embeddings=False,
)

print("Embedding shape:", bert_embeddings_top10.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding shape: (10, 768)


## 1B: Count plots that fit without truncation

In [63]:
# Count how many sample plots tokenize to no more than the tokenizer's reported
# maximum length.
# NOTE(review): the limit is read from the stand-alone tokenizer, not from
# `bert_model` — confirm it matches the sequence length the SentenceTransformer
# actually applies when encoding.
bert_max_length = bert_tokenizer.model_max_length
bert_tokens = [len(token_ids) for token_ids in map(bert_tokenizer.encode, plots_top10)]
bert_no_trunc = sum(1 for n_tokens in bert_tokens if n_tokens <= bert_max_length)

print("Plots embedded without truncation:")
print("Bert:", bert_no_trunc)

Plots embedded without truncation:
Bert: 10


## 1C: Word-count statistics for the first 10 plots

In [64]:
import numpy as np

# Whitespace-delimited word count for each of the ten sample plots.
word_counts = list(map(lambda plot_text: len(plot_text.split()), plots_top10))

longest_plot = max(word_counts)
shortest_plot = min(word_counts)
average_plot = np.mean(word_counts)

print("Word lengths of first 10 plots:", word_counts)
print("Max words:", longest_plot)
print("Min words:", shortest_plot)
print("Average words:", average_plot)

Word lengths of first 10 plots: [355, 50, 559, 519, 400, 419, 669, 389, 381, 601]
Max words: 669
Min words: 50
Average words: 434.2


## 2: Embed all plots and build a FAISS index

In [65]:
# Embed every plot in the filtered frame; vectors are not normalized at encode time.
plots = filtered_df["Plot"].tolist()

bert_embeddings = bert_model.encode(
    plots,
    show_progress_bar=True,
    normalize_embeddings=False,
)

Batches:   0%|          | 0/633 [00:00<?, ?it/s]

In [66]:
!pip install faiss-cpu



In [67]:
import faiss

# Flat L2 index sized to the embedding width; all plot vectors are added at once.
embedding_dim = bert_embeddings.shape[1]
bert_index = faiss.IndexFlatL2(embedding_dim)
bert_index.add(bert_embeddings)
print(f"Bert index: {bert_index.ntotal} vectors added")

Bert index: 20247 vectors added


In [76]:
# Save the flat L2 index to a file relative to the current working directory.
index_path = "BERT.index"
faiss.write_index(bert_index, index_path)

## 3: Query the index with exact and approximate search

In [68]:
# Free-text queries used to probe the plot embeddings.
queries = [
    "Romantic comedies set in New York City",
    "Action movies set in the future",
    "Family-friendly movies about animals",
]

# Title and genre lists taken from the same frame, so position i in each list
# refers to the same row of `filtered_df`.
titles, genres = (filtered_df[column].tolist() for column in ("Title", "Genre"))

In [69]:
def build_faiss_indices(embeddings, use_ivf=False, nlist=100, nprobe=1):
  """Build an inner-product FAISS index over L2-normalized copies of `embeddings`.

  Because the stored vectors are unit length, inner-product scores returned by
  the index are cosine similarities (provided queries are normalized as well).

  Args:
    embeddings: 2-D array-like of shape (n_vectors, dim). It is NOT modified;
      a float32, C-contiguous copy is normalized and indexed instead.
    use_ivf: If False, build an exhaustive `IndexFlatIP`. If True, build an
      `IndexIVFFlat` with an `IndexFlatIP` coarse quantizer.
    nlist: Number of inverted lists (clusters) for the IVF index. Ignored when
      `use_ivf` is False.
    nprobe: Number of inverted lists visited per query for the IVF index. The
      default of 1 is FAISS's own default, so existing callers see no change.
      Ignored when `use_ivf` is False.

  Returns:
    The populated FAISS index.
  """
  # Private copy: faiss.normalize_L2 rescales rows in place, and the caller's
  # raw embeddings must stay untouched (they may be reused for other indices).
  vectors = np.array(embeddings, dtype=np.float32, order="C")
  faiss.normalize_L2(vectors)

  # Dimension comes from the argument itself, not from any notebook global.
  d = vectors.shape[1]

  if not use_ivf:
    index = faiss.IndexFlatIP(d)
  else:
    quantizer = faiss.IndexFlatIP(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
    # IVF indices must learn their cluster centroids before vectors are added.
    index.train(vectors)
    index.nprobe = nprobe

  index.add(vectors)
  return index

In [70]:
# Two indices over the same plot embeddings: an exhaustive one (default mode)
# and an IVF one partitioned into 100 inverted lists.
bert_index_exact = build_faiss_indices(bert_embeddings)
bert_index_ivf = build_faiss_indices(bert_embeddings, nlist=100, use_ivf=True)

In [71]:
def search_top_k(index, query_embedding, k=7):
  """Search `index` for the `k` best matches of each query vector.

  Args:
    index: A FAISS index exposing `search(queries, k)`.
    query_embedding: A single vector of shape (dim,) or a batch of shape
      (n_queries, dim). It is NOT modified.
    k: Number of results to return per query.

  Returns:
    A `(D, I)` tuple as returned by `index.search`: scores and row ids, each of
    shape (n_queries, k).
  """
  # Normalize a private float32, C-contiguous, 2-D copy. faiss.normalize_L2
  # works in place, and the argument is often a view into a larger query matrix
  # that the caller does not expect to change.
  query = np.array(query_embedding, dtype=np.float32, order="C", ndmin=2)
  faiss.normalize_L2(query)
  D, I = index.search(query, k)
  return D, I

In [72]:
# Embed all queries in one call and hand FAISS a float32 matrix (one row per query).
# NOTE(review): the model card for this checkpoint describes task prefixes
# ("search_query: " / "search_document: "); none are applied here — confirm
# whether that is intended.
bert_query_embedding = np.array(bert_model.encode(queries, convert_to_tensor=False), dtype=np.float32)
topk = 7
results = {}

def _hits_for(row_ids):
  """Map FAISS row ids to (title, genre, plot) tuples, skipping padding ids."""
  # FAISS fills unused result slots with -1 when fewer than k neighbours are
  # found (possible with an IVF index that probes few lists). Without this
  # guard, Python's negative indexing would silently report the LAST movie.
  return [(titles[idx], genres[idx], plots[idx]) for idx in row_ids if idx >= 0]

for i, query in enumerate(queries):
  results[query] = {}

  # Each search receives a one-row slice so the index sees shape (1, dim).
  D_exact, I_exact = search_top_k(bert_index_exact, bert_query_embedding[i:i+1], k=topk)
  D_ivf, I_ivf = search_top_k(bert_index_ivf, bert_query_embedding[i:i+1], k=topk)
  results[query]["BERT_Exact"] = _hits_for(I_exact[0])
  results[query]["BERT_IVF"] = _hits_for(I_ivf[0])

## Query 1 BERT results

In [73]:
# Uncomment to inspect the retrieved (title, genre, plot) tuples for the first query:
# results[queries[0]]
# The figures printed below are hard-coded strings; they are not computed from `results`.
print(
    "Query 1 Exact NN Precision: 5/7",
    "Query 1 ANN Precision: 4/7",
    sep="\n",
)

Query 1 Exact NN Precision: 5/7
Query 1 ANN Precision: 4/7


## Query 2 BERT results

In [74]:
# Uncomment to inspect the retrieved (title, genre, plot) tuples for the second query:
# results[queries[1]]
# The figures printed below are hard-coded strings; they are not computed from `results`.
print(
    "Query 2 Exact NN Precision: 5/7",
    "Query 2 ANN Precision: 3/7",
    sep="\n",
)

Query 2 Exact NN Precision: 5/7
Query 2 ANN Precision: 3/7


## Query 3 BERT results

In [75]:
# This lookup is evaluated but not displayed: only the last expression of a cell
# is rendered, and here the cell ends with print calls.
results[queries[2]]
# The figures printed below are hard-coded strings; they are not computed from `results`.
print(
    "Query 3 Exact NN Precision: 4/7",
    "Query 3 ANN Precision: 7/7",
    sep="\n",
)

Query 3 Exact NN Precision: 4/7
Query 3 ANN Precision: 7/7
