In [13]:
import pandas as pd
import faiss
import pickle
from sentence_transformers import SentenceTransformer
import os

In [14]:
# All paths are resolved from the parent of the notebook's working directory.
BASE_DIR = os.path.abspath('..')

# Input: the processed web-series CSV.
CSV_PATH = os.path.join(
    BASE_DIR, 'data', 'processed_data', 'web_series', 'web_series.csv'
)

# Output: folder for the saved artifacts; created up front so later writes
# cannot fail on a missing directory.
ARTIFACT_DIR = os.path.join(BASE_DIR, 'artifacts', 'web_series')
os.makedirs(ARTIFACT_DIR, exist_ok=True)

In [15]:
# Load the processed web-series CSV into a DataFrame (default integer index).
df = pd.read_csv(CSV_PATH)

In [16]:
# Show the frame (Jupyter truncates the middle rows) to check the columns
# used below: title, release year, genre, synopsis.
df

Unnamed: 0,title,release year,genre,synopsis
0,Breaking Bad,2008,"Crime,Drama","When Walter White, a New Mexico chemistry teac..."
1,Game of Thrones,2011,"Action & Adventure,Drama",Seven noble families fight for control of the ...
2,Rick and Morty,2013,"Animation,Comedy",Rick is a mentally-unbalanced but scientifical...
3,Stranger Things,2016,"Drama,Fantasy","When a young boy vanishes, a small town uncove..."
4,The Boys,2019,"Action & Adventure,Comedy",A group of vigilantes known informally as “The...
...,...,...,...,...
21798,The Rag Trade,1975,Comedy,Add a Plot
21799,Die Real Housewives van Pretoria,2022,Reality-TV,Focuses on the personal and professional lives...
21800,Abandoned Engineering,2016,Documentary,Structures of steel and iron eerily stand unfi...
21801,"Diomedes, el Cacique de La Junta",2015,Drama,The life and work of the vallenato musician Di...


In [17]:
# Build one free-text "tags" string per row; this is the text that gets embedded.
# Missing values are replaced first so a NaN in any field cannot turn the whole
# concatenated string into NaN.
title_text = df['title'].fillna('')
year_text = df['release year'].fillna(0).astype(str)  # missing year becomes 0
genre_text = df['genre'].fillna('')
synopsis_text = df['synopsis'].fillna('')

# Fields are joined with single spaces: "<title> <year> <genre> <synopsis>".
df['tags'] = title_text + ' ' + year_text + ' ' + genre_text + ' ' + synopsis_text

In [18]:
# Sentence-embedding model used for both indexing and querying.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every row's tags in DataFrame order, so row i of `embeddings`
# corresponds to row position i of `df`.
tag_texts = df['tags'].tolist()
embeddings = model.encode(
    tag_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches: 100%|███████████████████████████████████████████████████████████████████████| 682/682 [10:25<00:00,  1.09it/s]


In [19]:
# Flat L2 index sized to the embedding width (second axis of `embeddings`).
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

# Vectors are added in row order, so FAISS ids equal DataFrame row positions.
index.add(embeddings)

In [20]:
# Persist the three artifacts that recommend_web_series() reloads:
# the DataFrame (with its 'tags' column), the embedding model, and the index.
# Context managers guarantee each file is flushed and closed even if
# pickling raises, instead of leaving the handle open until garbage collection.
with open(os.path.join(ARTIFACT_DIR, 'data.pkl'), 'wb') as data_file:
    pickle.dump(df, data_file)

# NOTE(review): pickling the model ties the file to the installed library
# versions; kept as pickle because the loader below expects 'sbert_model.pkl'.
with open(os.path.join(ARTIFACT_DIR, 'sbert_model.pkl'), 'wb') as model_file:
    pickle.dump(model, model_file)

faiss.write_index(index, os.path.join(ARTIFACT_DIR, 'faiss_index.index'))

In [21]:
from rapidfuzz import process
import os
import pickle
import faiss
import numpy as np

def recommend_web_series(title_query, top_k=5):
    """Recommend web series similar to the one named by ``title_query``.

    The title is resolved to a row of the saved DataFrame, first by a
    case-insensitive exact match and otherwise by the closest RapidFuzz
    match (a warning is printed in that case). That row's ``tags`` text is
    embedded and its nearest neighbours are looked up in the saved FAISS index.

    Parameters
    ----------
    title_query : str
        Title to look up.
    top_k : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_k`` rows with columns ``title``, ``genre`` and
        ``synopsis``, nearest first, never including the query row itself.
        Empty if ``top_k`` is not positive or there is nothing to search.
    """
    ARTIFACT_DIR = os.path.join(BASE_DIR, 'artifacts', 'web_series')
    result_columns = ['title', 'genre', 'synopsis']

    # Load data, model, and index. `with` closes each file handle promptly.
    # SECURITY: pickle.load can execute arbitrary code; only load artifacts
    # that this notebook produced itself.
    with open(os.path.join(ARTIFACT_DIR, 'data.pkl'), 'rb') as data_file:
        df = pickle.load(data_file)
    with open(os.path.join(ARTIFACT_DIR, 'sbert_model.pkl'), 'rb') as model_file:
        model = pickle.load(model_file)
    index = faiss.read_index(os.path.join(ARTIFACT_DIR, 'faiss_index.index'))

    # Everything below works with row POSITIONS, because FAISS ids are the
    # positions at which the vectors were added, not DataFrame index labels.

    # Step 1: Try exact (case-insensitive) match
    is_exact = (df['title'].str.lower() == title_query.lower()).to_numpy()
    exact_positions = np.flatnonzero(is_exact)
    if exact_positions.size:
        row_pos = int(exact_positions[0])
    else:
        # Step 2: Fallback to closest match using RapidFuzz
        all_titles = df['title'].fillna('').tolist()
        fuzzy_hit = process.extractOne(title_query, all_titles)
        if fuzzy_hit is None:
            # No titles to compare against (empty data set).
            return df.iloc[0:0][result_columns]
        # The third element is the position of the match within `all_titles`.
        best_match, score, row_pos = fuzzy_hit
        print(f"⚠️ Title not found. Using closest match: '{best_match}' (Score: {score:.2f})")

    if top_k <= 0 or index.ntotal == 0:
        return df.iloc[0:0][result_columns]

    query = df['tags'].iloc[row_pos]
    query_vec = model.encode([query], convert_to_numpy=True)

    # Ask for one extra neighbour because the query row is itself in the index;
    # never ask for more vectors than the index holds.
    n_neighbors = min(top_k + 1, index.ntotal)
    _, I = index.search(query_vec, n_neighbors)

    # Drop the query row wherever it ranks (rows with identical tags can tie
    # with it) and any -1 padding FAISS emits when it runs out of results.
    neighbor_positions = [
        int(pos) for pos in I[0] if pos != -1 and pos != row_pos
    ][:top_k]
    return df.iloc[neighbor_positions][result_columns]

In [22]:
# Example call with a lower-case query (the exact-title comparison is
# case-insensitive) and the default top_k of 5.
recommend_web_series('wednesday')

Unnamed: 0,title,genre,synopsis
14254,Adult Wednesday Addams,Comedy,"The chronicles of Wednesday, from ""The Addams ..."
15491,Now and Then,"Crime, Drama, Mystery",A multi-layered thriller that explores the dif...
19790,More Than This,Drama,Explore the real lives of five 17-year-old tee...
13502,High School,Comedy,Based on the autobiographical book High School...
16228,Krakowskie potwory,"Adventure, Drama, Fantasy",A young woman haunted by her past joins a myst...
