In [1]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the corpus and the search queries.
# The checkpoint name lives in one place so it is easy to spot and swap.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
# Load the cleaned movie table; row order here defines the ids used by the index.
CLEANED_CSV_PATH = "df1_cleaned.csv"
df_processed = pd.read_csv(CLEANED_CSV_PATH)

In [3]:
# One plot string per row, in the same order as df_processed, so that position i
# in the embedding matrix always refers to df_processed.iloc[i].
# read_csv parses empty fields as NaN (a float), which the encoder cannot handle;
# fill them with "" instead of dropping rows so the positions stay aligned.
texts = df_processed["clean_plot"].fillna("").astype(str).tolist()


In [4]:
# Embed every plot in batches of 64 with a progress bar.
# normalize_embeddings=True asks the model for unit-length vectors, which is what
# lets an inner-product index behave as cosine similarity later on.
embeddings = model.encode(texts, batch_size=64, normalize_embeddings=True, show_progress_bar=True)


Batches: 100%|██████████| 546/546 [11:49<00:00,  1.30s/it]


In [5]:
# Sanity check: (number of encoded texts, embedding width).
embeddings.shape


(34884, 384)

In [6]:
import faiss

# Build an exact inner-product index over all plot embeddings.
# The embeddings were encoded with normalize_embeddings=True, so the inner
# product of two vectors is their cosine similarity.
dimension = embeddings.shape[1]  # embedding width (second axis of the matrix)
index = faiss.IndexFlatIP(dimension)  # Inner Product (cosine)
index.add(embeddings)  # vector i in the index corresponds to row i of `embeddings`


In [7]:
# semantic_search maps FAISS result positions back to rows with df_processed.iloc,
# so the index must hold exactly one vector per DataFrame row. Fail loudly here
# rather than return wrong titles later.
assert index.ntotal == len(df_processed), (
    f"index holds {index.ntotal} vectors but df_processed has {len(df_processed)} rows"
)
index.ntotal


34884

In [8]:
def semantic_search(query, top_k=5):
    """Return the movies whose plot embeddings best match a free-text query.

    The query is embedded with the notebook-level ``model`` (normalised, like
    the indexed vectors), looked up in the notebook-level FAISS ``index``, and
    each hit position is mapped back to a row of ``df_processed``.

    Parameters
    ----------
    query : str
        Free-text description of what to look for.
    top_k : int, default 5
        Maximum number of results. Values larger than the number of indexed
        vectors are clamped; a non-positive value yields an empty list.

    Returns
    -------
    list of dict
        One dict per hit, best match first, with keys ``"title"``, ``"year"``
        and ``"score"`` (inner product between query and plot embeddings).
        Numeric values are plain Python scalars so the result is
        JSON-serialisable.
    """
    # Never ask FAISS for more neighbours than it holds, and short-circuit
    # requests that cannot produce any result.
    top_k = min(top_k, index.ntotal)
    if top_k <= 0:
        return []

    query_vec = model.encode(
        [query],
        normalize_embeddings=True
    )

    scores, indices = index.search(query_vec, top_k)

    results = []
    for idx, score in zip(indices[0], scores[0]):
        # FAISS pads missing neighbours with label -1; .iloc[-1] would silently
        # return the last row of the table, so skip those slots explicitly.
        if idx < 0:
            continue
        movie = df_processed.iloc[int(idx)]
        year = movie["Release Year"]
        results.append({
            "title": movie["Title"],
            # Unwrap numpy scalars (e.g. np.int64) into native Python values.
            "year": year.item() if hasattr(year, "item") else year,
            "score": float(score)
        })

    return results


In [9]:
# Smoke test: run one free-text query and display the five best matches.
semantic_search("movies about war and friendship", top_k=5)


[{'title': 'A Soldier Speaks after Death',
  'year': np.int64(1966),
  'score': 0.5737587213516235},
 {'title': 'The Good Wife',
  'year': np.int64(1987),
  'score': 0.5450994968414307},
 {'title': 'Bless Me, Ultima',
  'year': np.int64(2013),
  'score': 0.5444862842559814},
 {'title': 'Once Upon a Dream',
  'year': np.int64(1949),
  'score': 0.5386888384819031},
 {'title': 'Brothers', 'year': np.int64(2016), 'score': 0.5273308753967285}]

In [11]:
from pathlib import Path

import numpy as np

# np.save does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path("artifacts").mkdir(parents=True, exist_ok=True)
np.save("artifacts/movie_embeddings.npy", embeddings)


In [12]:
import os

import faiss

# faiss.write_index fails if the target directory is missing; create it first
# so this cell also works on a fresh checkout (no-op when it already exists).
os.makedirs("artifacts", exist_ok=True)
faiss.write_index(index, "artifacts/movie_faiss.index")
