In [1]:
import pandas as pd
import faiss
import pickle
from sentence_transformers import SentenceTransformer
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BASE_DIR = os.path.abspath('..')
CSV_PATH = os.path.join(BASE_DIR, 'data', 'processed_data', 'movie', 'movie.csv')
ARTIFACT_DIR = os.path.join(BASE_DIR, 'artifacts', 'movie')
os.makedirs(ARTIFACT_DIR, exist_ok=True)

In [3]:
df = pd.read_csv(CSV_PATH)
df

Unnamed: 0,title,genres,release_year,summary
0,The Yards,"Crime,Drama",2000,"Ex-con Leo tries to go straight, but his plans..."
1,Next Friday,Comedy,2000,A streetwise man flees South Central Los Angel...
2,Supernova,"Adventure,Sci-Fi,Thriller",2000,A deep space rescue and recovery spaceship wit...
3,Down to You,"Comedy,Romance",2000,College students Al and Imogen meet and fall i...
4,Wirey Spindell,Comedy,2000,A whimsical look back at the life of a man who...
...,...,...,...,...
51441,End of the Season,Drama,2017,Becker is a German ex-con trying to hold down ...
51442,The Monroy Affaire,Drama,2022,
51443,Shelter in Solitude,"Comedy,Drama",2023,A death row prisoner with 10 days left to live...
51444,Orca,Drama,2023,"A young Iranian woman, having reached the lowe..."


In [31]:
df = df.drop_duplicates(subset='title')

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48600 entries, 0 to 51445
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         48600 non-null  object
 1   genres        48600 non-null  object
 2   release_year  48600 non-null  int64 
 3   summary       47976 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.9+ MB


In [33]:
df['tags'] = df['title'].fillna('') + ' ' + df['genres'].fillna('') + ' ' + df['summary'].fillna('') + ' ' + df['release_year'].fillna(0).astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['title'].fillna('') + ' ' + df['genres'].fillna('') + ' ' + df['summary'].fillna('') + ' ' + df['release_year'].fillna(0).astype(str)


In [34]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['tags'].tolist(), show_progress_bar=True, convert_to_numpy=True)

Batches: 100%|█████████████████████████████████████████████████████████████████████| 1519/1519 [51:14<00:00,  2.02s/it]


In [35]:
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [36]:
pickle.dump(df, open(os.path.join(ARTIFACT_DIR, 'data.pkl'), 'wb'))
pickle.dump(model, open(os.path.join(ARTIFACT_DIR, 'sbert_model.pkl'), 'wb'))
faiss.write_index(index, os.path.join(ARTIFACT_DIR, 'faiss_index.index'))

In [7]:
import os
import pickle
import faiss
import numpy as np
from rapidfuzz import process

# Set global BASE_DIR
BASE_DIR = os.path.abspath('..')  # adjust if needed

def recommend_movie(title_query, top_k=5):
    ARTIFACT_DIR = os.path.join(BASE_DIR, 'artifacts', 'movie')
    
    # Load necessary files
    df = pickle.load(open(os.path.join(ARTIFACT_DIR, 'data.pkl'), 'rb'))
    model = pickle.load(open(os.path.join(ARTIFACT_DIR, 'sbert_model.pkl'), 'rb'))
    index = faiss.read_index(os.path.join(ARTIFACT_DIR, 'faiss_index.index'))
    
    # STEP 1: Try exact match
    matches = df[df['title'].str.lower() == title_query.lower()]
    if not matches.empty:
        idx = matches.index[0]
        query = df.loc[idx, 'tags']
        query_vec = model.encode([query], convert_to_numpy=True)
        _, I = index.search(query_vec, top_k + 1)
        return df.iloc[I[0][1:]][['title', 'genres', 'summary']]
    
    # STEP 2: Fallback to RapidFuzz for close match
    all_titles = df['title'].fillna('').tolist()
    best_match, score, idx = process.extractOne(title_query, all_titles)
    print(f"⚠️ Title not found. Using closest match: '{best_match}' (Score: {score:.2f})")
    
    # Re-run SBERT search using the best match
    query = df.loc[idx, 'tags']
    query_vec = model.encode([query], convert_to_numpy=True)
    _, I = index.search(query_vec, top_k + 1)
    return df.iloc[I[0][1:]][['title', 'genres', 'summary']]

In [8]:
recommend_movie("Yards")

⚠️ Title not found. Using closest match: 'The Yards' (Score: 90.00)


Unnamed: 0,title,genres,summary
138,Circus,"Crime,Drama,Thriller",Conman Leo Garfield is in hot water after acce...
3105,The Contract,"Crime,Drama,Thriller",A man and his son encounter an assassin in the...
18817,The Yard,(no genres listed),A poet and writer gets fired from his job at a...
9045,The Bag Man,"Crime,Drama,Thriller","A criminal bides his time at a seedy motel, wa..."
2709,Waist Deep,"Action,Crime,Drama,Thriller",An ex-convict (Tyrese) gets tangled up with a ...
