In [1]:
# Use the %pip magic rather than !pip: it installs into the environment of the
# running kernel instead of whichever `pip` happens to be first on the shell PATH.
%pip install --upgrade sentence-transformers faiss-cpu scikit-learn pandas numpy

Collecting numpy
  Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)




In [11]:
import os
import time

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Optional backends for fast search (use either FAISS or sklearn NearestNeighbors)
try:
    import faiss  # faiss-cpu
    FAISS_AVAILABLE = True
except Exception:
    FAISS_AVAILABLE = False

from sklearn.neighbors import NearestNeighbors

# Paths
DATA_PATH = "movies.csv"
EMB_PATH = "plot_embeddings.npz"

# Load CSV
df = pd.read_csv(DATA_PATH)

# Model
MODEL_NAME = "all-MiniLM-L6-v2"  # lightweight and fast
model = SentenceTransformer(MODEL_NAME)

# Generate embeddings.
# - Missing plots are encoded as empty strings so a NaN cell cannot reach the encoder.
# - Vectors are L2-normalised before caching: the cache loader's own compute path
#   normalises, and the FAISS inner-product index only equals cosine similarity on
#   unit-length vectors, so the file on disk must follow the same convention.
print("Generating embeddings...")
plot_texts = df['plot'].fillna("").astype(str).tolist()
embeddings = model.encode(
    plot_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Save embeddings
np.savez_compressed(EMB_PATH, embeddings=embeddings)
print(f"Embeddings saved to {EMB_PATH}")


Generating embeddings...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Embeddings saved to plot_embeddings.npz


In [12]:
# Re-read the dataset and fail fast unless both columns used downstream
# ('title' and 'plot') are present.
df = pd.read_csv(DATA_PATH)
assert {'title', 'plot'} <= set(df.columns), "CSV must contain 'title' and 'plot' columns"

# Preview the first rows, then report the corpus size.
print(df.head())
print(f"\nTotal movies: {len(df)}")


              title                                               plot
0         Spy Movie  A spy navigates intrigue in Paris to stop a te...
1  Romance in Paris  A couple falls in love in Paris under romantic...
2      Action Flick  A high-octane chase through New York with expl...
3     Space Odyssey  A crew journeys through space to explore a dis...
4     Haunted Manor   A family moves into a mansion haunted by ghosts.

Total movies: 54


In [13]:
# Instantiate the sentence encoder with the device pinned explicitly to CPU.
model = SentenceTransformer(
    MODEL_NAME,
    device="cpu",
)

def build_or_load_embeddings(df: pd.DataFrame, emb_path: str = EMB_PATH, batch_size: int = 64):
    """Return L2-normalised plot embeddings for ``df``, using an on-disk cache.

    If ``emb_path`` exists and its row count equals ``len(df)``, the cached array
    is returned (re-normalised, see below). Otherwise the plots are encoded with
    the module-level ``model``, written to ``emb_path`` and returned.

    Args:
        df: Frame with a 'plot' column; missing plots are encoded as "".
        emb_path: Location of the ``.npz`` cache; the array is stored under the
            key "embeddings".
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        A 2-D array of unit-length embedding vectors.

    Note:
        Cache validity is decided by row count only; a CSV whose plots changed
        but whose length did not will still hit the cache.
    """
    if os.path.exists(emb_path):
        # The context manager closes the underlying .npz file once the array is read.
        with np.load(emb_path) as data:
            cached = data["embeddings"]
        if cached.shape[0] == len(df):
            print(f"Loaded cached embeddings from {emb_path} -> shape={cached.shape}")
            # The cache may have been written without normalisation, so enforce
            # unit length here; this is a no-op for already-normalised vectors.
            # The floor on the norm avoids dividing by zero for all-zero rows.
            norms = np.linalg.norm(cached, axis=1, keepdims=True)
            return cached / np.maximum(norms, 1e-12)
        print("Cache exists but row count mismatch. Recomputing...")

    texts = df['plot'].fillna("").astype(str).tolist()
    started = time.perf_counter()
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    elapsed = time.perf_counter() - started
    print(f"Encoded {len(texts)} plots in {elapsed:.2f}s. Shape: {embeddings.shape}")

    # Save
    np.savez_compressed(emb_path, embeddings=embeddings)
    print(f"Saved embeddings to {emb_path}")
    return embeddings

# Fetch (or compute) the plot embeddings and cast them to float32 in one step;
# the cast is for FAISS compatibility.
embeddings = build_or_load_embeddings(df).astype('float32')


Loaded cached embeddings from plot_embeddings.npz -> shape=(54, 384)


In [14]:
# Build the nearest-neighbour index: sklearn NearestNeighbors when FAISS could
# not be imported, otherwise a flat FAISS inner-product index.
index, search_backend = None, None

if not FAISS_AVAILABLE:
    # Cosine *distance* backend; callers convert it to a similarity later.
    nn = NearestNeighbors(
        algorithm="auto",
        metric="cosine",
        n_neighbors=min(20, len(df)),
    )
    index = nn.fit(embeddings)  # fit() returns the estimator itself
    search_backend = "sklearn"
    print("Built sklearn NearestNeighbors index (cosine distance).")
else:
    d = embeddings.shape[1]
    # Inner product over normalised vectors is the cosine similarity.
    index = faiss.IndexFlatIP(d)
    index.add(embeddings)
    search_backend = "faiss"
    print(f"Built FAISS index (dim={d}, size={index.ntotal}).")


Built sklearn NearestNeighbors index (cosine distance).


In [15]:
def semantic_search(query: str, top_k: int = 5):
    """Return the ``top_k`` plots most similar to ``query``.

    The query is encoded with the module-level ``model`` and matched against the
    module-level ``index`` (FAISS or sklearn, selected by ``search_backend``).

    Args:
        query: Free-text search query.
        top_k: Maximum number of results. Values larger than the number of rows
            in ``df`` are clamped; values <= 0 yield an empty result.

    Returns:
        DataFrame with columns ``rank`` (1-based), ``score_cosine`` (rounded to
        4 decimals), ``title`` and ``plot``, ordered as returned by the backend.
        The columns are present even when there are no results.
    """
    columns = ["rank", "score_cosine", "title", "plot"]

    # Nothing requested: return a well-formed empty table without encoding anything.
    if top_k <= 0:
        return pd.DataFrame(columns=columns)

    # sklearn raises when asked for more neighbours than fitted samples, and FAISS
    # pads with -1, so never request more results than there are rows.
    top_k = min(int(top_k), len(df))
    if top_k == 0:
        return pd.DataFrame(columns=columns)

    q_emb = model.encode([query], normalize_embeddings=True)
    q_emb = q_emb.astype('float32')

    if search_backend == "faiss":
        # inner product on normalized vectors equals cosine similarity
        scores, idx = index.search(q_emb, top_k)
        # FAISS marks missing neighbours with index -1; drop those slots.
        pairs = [
            (i, float(s))
            for i, s in zip(idx[0].tolist(), scores[0].tolist())
            if i != -1
        ]
    else:
        # sklearn returns cosine distance; convert to similarity
        distances, idx = index.kneighbors(q_emb, n_neighbors=top_k, return_distance=True)
        pairs = [
            (i, float(1.0 - dist))
            for i, dist in zip(idx[0].tolist(), distances[0].tolist())
        ]

    # One positional lookup for all hits instead of a per-row iloc inside a loop.
    hits = df.iloc[[i for i, _ in pairs]]
    return pd.DataFrame(
        {
            "rank": range(1, len(pairs) + 1),
            "score_cosine": [round(sim, 4) for _, sim in pairs],
            "title": hits["title"].tolist(),
            "plot": hits["plot"].tolist(),
        },
        columns=columns,
    )

# Smoke test: run one query end to end and render the result table.
display(semantic_search(query="spy thriller in Paris", top_k=5))


Unnamed: 0,rank,score_cosine,title,plot
0,1,0.7697,Spy Movie,A spy navigates intrigue in Paris to stop a te...
1,2,0.4561,Crime City,A gritty crime story set in the underworld of ...
2,3,0.4543,Detective Chronicles,A detective solves a complex murder mystery in...
3,4,0.4393,Cyber Detective,A detective solves crimes in a futuristic digi...
4,5,0.4107,Supernatural Investigation,A detective investigates unexplained supernatu...


In [16]:
def search(query: str, k: int = 5):
    """Run ``semantic_search`` for ``query`` and display its top ``k`` rows."""
    hits = semantic_search(query, top_k=k)
    display(hits)

# Demo queries as (heading, query) pairs; each heading is printed verbatim
# before its result table.
examples = [
    ("Example 1: 'spy thriller in Paris'", "spy thriller in Paris"),
    ("\nExample 2: 'romantic drama in Paris'", "romantic drama in Paris"),
    ("\nExample 3: 'intelligence officer uncovering a mole'", "intelligence officer uncovering a mole"),
]

for heading, example_query in examples:
    print(heading)
    search(example_query, 5)


Example 1: 'spy thriller in Paris'


Unnamed: 0,rank,score_cosine,title,plot
0,1,0.7697,Spy Movie,A spy navigates intrigue in Paris to stop a te...
1,2,0.4561,Crime City,A gritty crime story set in the underworld of ...
2,3,0.4543,Detective Chronicles,A detective solves a complex murder mystery in...
3,4,0.4393,Cyber Detective,A detective solves crimes in a futuristic digi...
4,5,0.4107,Supernatural Investigation,A detective investigates unexplained supernatu...



Example 2: 'romantic drama in Paris'


Unnamed: 0,rank,score_cosine,title,plot
0,1,0.6242,Romance in Paris,A couple falls in love in Paris under romantic...
1,2,0.4473,Spy Movie,A spy navigates intrigue in Paris to stop a te...
2,3,0.4088,Crime City,A gritty crime story set in the underworld of ...
3,4,0.4005,Love Triangle,Romantic complications arise between three peo...
4,5,0.3773,Historical Romance,Love blossoms amidst historical turmoil and war.



Example 3: 'intelligence officer uncovering a mole'


Unnamed: 0,rank,score_cosine,title,plot
0,1,0.4195,Supernatural Investigation,A detective investigates unexplained supernatu...
1,2,0.3975,Spy Movie,A spy navigates intrigue in Paris to stop a te...
2,3,0.3639,Cyber Hack,A hacker uncovers a conspiracy involving a pow...
3,4,0.3358,Cyber Detective,A detective solves crimes in a futuristic digi...
4,5,0.3078,Detective Chronicles,A detective solves a complex murder mystery in...
