In [None]:
# Cell 1: Setup & Imports
!pip install sentence-transformers pandas numpy scikit-learn torch

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import torch




In [None]:
# Cell 2: Load and preprocess the dataset

# Read the catalogue CSV and replace every missing value with an empty string,
# so the string concatenation below never meets a NaN.
df = pd.read_csv("netflix_titles.csv").fillna("")

# Build one free-text field per row by chaining the descriptive columns,
# left to right, with ". " between consecutive columns.
text_columns = ["title", "description", "cast", "listed_in", "director"]
content_text = df[text_columns[0]]
for column_name in text_columns[1:]:
    content_text = content_text + ". " + df[column_name]
df["content_text"] = content_text

# Quick sanity check: row count, then a preview of the first rows
print("Dataset size:", len(df))
df.head()


Dataset size: 8807


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,content_text
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Dick Johnson Is Dead. As her father nears the ...
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Blood & Water. After crossing paths at a party...
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,Ganglands. To protect his family from a powerf...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...","Jailbirds New Orleans. Feuds, flirtations and ..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,Kota Factory. In a city of coaching centers kn...


In [None]:
# Cell 3: Encode textual content using SentenceTransformer

# `model` is the sentence encoder; it is reused later to embed search queries.
model = SentenceTransformer("all-MiniLM-L6-v2")

# One text per catalogue row, in DataFrame row order, so that embedding row k
# corresponds to df.iloc[k].
content_texts = df["content_text"].tolist()

# Generate 384-d embeddings
item_embeddings = model.encode(content_texts, show_progress_bar=True)

# Persist the embedding matrix next to the notebook for later use
np.save("item_embeddings.npy", item_embeddings)

print("Embeddings shape:", item_embeddings.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/276 [00:00<?, ?it/s]

Embeddings shape: (8807, 384)


In [None]:
# Cell 4: Define hybrid novelty-aware recommender

import torch.nn as nn
import torch.nn.functional as F

class HybridNoveltyRecommender(nn.Module):
    """Scores (item, user) pairs with a relevance logit and a novelty value.

    Three inputs are each mapped to a ``hidden``-sized vector: the item's
    content embedding (linear + ReLU), the item's id (learned embedding table)
    and the user's feature vector (linear + ReLU). Their concatenation feeds a
    small MLP that emits one relevance value per pair. A second MLP, ending in
    a sigmoid, reads only the projected content vector and emits a novelty
    value in (0, 1).

    Args:
        num_items: Number of rows in the item-id embedding table.
        item_emb_dim: Width of the incoming item content embedding.
        user_feat_dim: Width of the incoming user feature vector.
        hidden: Width of every internal representation.
    """

    def __init__(self, num_items, item_emb_dim=384, user_feat_dim=384, hidden=128):
        super().__init__()

        # Input encoders (one per signal), all landing in the same width.
        self.item_proj = nn.Linear(item_emb_dim, hidden)
        self.item_id_emb = nn.Embedding(num_items, hidden)
        self.user_proj = nn.Linear(user_feat_dim, hidden)

        # Relevance head: consumes the three encoded signals side by side.
        fused_width = 3 * hidden
        self.fusion = nn.Sequential(
            nn.Linear(fused_width, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

        # Novelty head: content-only, squashed into (0, 1) by the sigmoid.
        bottleneck = hidden // 2
        self.novelty_head = nn.Sequential(
            nn.Linear(hidden, bottleneck),
            nn.ReLU(),
            nn.Linear(bottleneck, 1),
            nn.Sigmoid(),
        )

    def forward(self, item_content_emb, item_id, user_feat):
        """Return ``(relevance, novelty)`` with the trailing size-1 axis removed.

        ``relevance`` is the raw output of the fusion MLP (no activation);
        ``novelty`` is the sigmoid output of the novelty head.
        """
        content_vec = torch.relu(self.item_proj(item_content_emb))
        id_vec = self.item_id_emb(item_id)
        user_vec = torch.relu(self.user_proj(user_feat))

        fused = torch.cat((content_vec, id_vec, user_vec), dim=-1)
        relevance = self.fusion(fused).squeeze(-1)
        novelty = self.novelty_head(content_vec).squeeze(-1)
        return relevance, novelty


In [None]:
# Cell 5: Create mock user-item interaction matrix (for demo)
# In real use, replace with real watch history data

num_items = len(df)
num_users = 500

np.random.seed(42)
item_pool = np.arange(num_items)
interactions = []

for u in range(num_users):
    # Between 5 and 19 distinct liked items per user (label 1).
    liked = np.random.choice(num_items, size=np.random.randint(5, 20), replace=False)

    # Draw the same number of disliked items (label 0), but only from items the
    # user did NOT like. Sampling from the whole catalogue could hand the same
    # (user, item) pair both label 1 and label 0.
    not_liked = np.setdiff1d(item_pool, liked, assume_unique=True)
    disliked = np.random.choice(not_liked, size=len(liked), replace=False)

    # Rows are [user_id, item_id, label]; likes first, then dislikes.
    interactions.extend([u, int(item), 1] for item in liked)
    interactions.extend([u, int(item), 0] for item in disliked)

interactions = pd.DataFrame(interactions, columns=["user_id", "item_id", "label"])
print(interactions.head())


   user_id  item_id  label
0        0     7661      1
1        0     6494      1
2        0     3887      1
3        0     7492      1
4        0     5107      1


In [None]:
# Persist the mock interactions to disk as CSV; index=False keeps the
# DataFrame's row index out of the file.
interactions.to_csv("interactions.csv", index=False)

In [None]:
# Cell 6: Train the model

device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed torch so the weight initialisation, the random demo user features and
# the per-epoch batch order are repeatable across kernel restarts.
torch.manual_seed(42)

model_nn = HybridNoveltyRecommender(num_items=num_items).to(device)
optimizer = torch.optim.Adam(model_nn.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

# random user features (demo)
user_features = torch.randn(num_users, 384).to(device)
item_emb_tensor = torch.tensor(item_embeddings, dtype=torch.float32).to(device)

# Convert the interaction table to tensors once, instead of re-slicing the
# DataFrame and rebuilding tensors for every batch of every epoch.
train_user_ids = torch.tensor(interactions["user_id"].values, dtype=torch.long).to(device)
train_item_ids = torch.tensor(interactions["item_id"].values, dtype=torch.long).to(device)
train_labels = torch.tensor(interactions["label"].values, dtype=torch.float32).to(device)

epochs = 100
batch_size = 512
num_rows = len(interactions)

model_nn.train()
for epoch in range(epochs):
    total_loss = 0

    # `interactions` is stored user by user, each user's likes before their
    # dislikes, so contiguous slices are badly mixed batches. Visit the rows in
    # a fresh random order every epoch.
    shuffled_rows = torch.randperm(num_rows, device=device)

    for start in range(0, num_rows, batch_size):
        rows = shuffled_rows[start:start + batch_size]
        user_ids = train_user_ids[rows]
        item_ids = train_item_ids[rows]
        labels = train_labels[rows]

        user_feat = user_features[user_ids]
        item_content = item_emb_tensor[item_ids]

        relevance, novelty = model_nn(item_content, item_ids, user_feat)
        # The combined score is treated as a logit by BCEWithLogitsLoss.
        score = relevance + 0.5 * novelty
        loss = loss_fn(score, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # total_loss is the SUM of the per-batch mean losses for this epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")


Epoch 1/100, Loss: 16.7046
Epoch 2/100, Loss: 16.2448
Epoch 3/100, Loss: 15.7884
Epoch 4/100, Loss: 15.0609
Epoch 5/100, Loss: 14.0003
Epoch 6/100, Loss: 12.6243
Epoch 7/100, Loss: 11.0236
Epoch 8/100, Loss: 9.6596
Epoch 9/100, Loss: 8.9565
Epoch 10/100, Loss: 7.6714
Epoch 11/100, Loss: 6.9014
Epoch 12/100, Loss: 6.2694
Epoch 13/100, Loss: 8.5072
Epoch 14/100, Loss: 7.4796
Epoch 15/100, Loss: 4.8693
Epoch 16/100, Loss: 3.7354
Epoch 17/100, Loss: 3.1342
Epoch 18/100, Loss: 2.7079
Epoch 19/100, Loss: 2.3419
Epoch 20/100, Loss: 2.0977
Epoch 21/100, Loss: 1.9843
Epoch 22/100, Loss: 1.7783
Epoch 23/100, Loss: 1.7844
Epoch 24/100, Loss: 2.2823
Epoch 25/100, Loss: 2.0968
Epoch 26/100, Loss: 2.0695
Epoch 27/100, Loss: 3.1208
Epoch 28/100, Loss: 5.0474
Epoch 29/100, Loss: 5.7680
Epoch 30/100, Loss: 15.6101
Epoch 31/100, Loss: 7.3835
Epoch 32/100, Loss: 3.4955
Epoch 33/100, Loss: 2.0490
Epoch 34/100, Loss: 1.5536
Epoch 35/100, Loss: 1.2618
Epoch 36/100, Loss: 1.0605
Epoch 37/100, Loss: 0.9126
Ep

In [None]:
# Checkpoint the trained weights and the optimizer state as separate files.
# Only state dicts are written, so loading them back requires first
# re-creating the HybridNoveltyRecommender / optimizer objects.
torch.save(model_nn.state_dict(), 'hybrid_model.pt')
torch.save(optimizer.state_dict(), 'optimizer.pt')

In [None]:
from difflib import SequenceMatcher, get_close_matches
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch

def smart_hybrid_search(query, df, item_emb_tensor, model_nn, alpha=0.7, topk_recommend=10,
                        encoder=None, device=None):
    """Search the catalogue by title, then pad the result with model-ranked items.

    Steps:
      1. Literal, case-insensitive substring match of ``query`` against titles.
      2. If nothing matches, a fuzzy (typo-tolerant) match returning up to two
         closest titles.
      3. The query plus any matched titles are embedded and every item is
         scored by ``model_nn``; that score is blended with a per-title string
         similarity and the best ``topk_recommend`` items are taken.
      4. Matched titles come first, followed by the ranked items.

    Args:
        query: Free-text search string. Surrounding whitespace and case are
            ignored. An empty query skips title matching entirely.
        df: Catalogue DataFrame with a ``title`` column; row ``k`` must
            correspond to row ``k`` of ``item_emb_tensor``.
        item_emb_tensor: Tensor of item content embeddings, one row per item.
        model_nn: Callable ``(item_content, item_ids, user_feat)`` returning
            ``(relevance, novelty)`` tensors with one value per item.
        alpha: Weight of the model score in the blend; the title similarity
            gets ``1 - alpha``.
        topk_recommend: Maximum number of rows returned.
        encoder: Object with ``encode(text, convert_to_tensor=True)``. Defaults
            to the notebook-level ``model`` so existing calls keep working.
        device: Device for the query tensors. Defaults to
            ``item_emb_tensor.device``.

    Returns:
        DataFrame of at most ``topk_recommend`` rows of ``df`` (all columns,
        original index labels), de-duplicated by title, matched titles first.

    Raises:
        ValueError: If ``df`` and ``item_emb_tensor`` have different row counts.
    """
    num_items = len(df)
    if item_emb_tensor.shape[0] != num_items:
        raise ValueError(
            f"df has {num_items} rows but item_emb_tensor has "
            f"{item_emb_tensor.shape[0]}; they must be aligned row by row."
        )
    if encoder is None:
        encoder = model  # notebook-level SentenceTransformer
    if device is None:
        device = item_emb_tensor.device
    limit = max(int(topk_recommend), 0)

    query = query.strip().lower()
    titles = df["title"].fillna("").astype(str)
    lower_titles = titles.str.lower().tolist()

    # String similarity between the query and every title. Used both to rank
    # title matches and as the fuzzy component of the final blend.
    title_scores = np.array(
        [SequenceMatcher(None, query, title).ratio() for title in lower_titles]
    )

    # --- Step 1: Exact or partial title match ---
    # Plain `in` keeps the query literal: characters such as "+", "(" or "."
    # must not be interpreted as regular-expression syntax.
    seed_positions = []
    hits = [pos for pos, title in enumerate(lower_titles) if query in title] if query else []
    if hits:
        print("🎯 Exact/Partial Title Found!")
        # Closest titles first, so an exact match outranks longer partial ones.
        seed_positions = sorted(hits, key=lambda pos: -title_scores[pos])
    else:
        # --- Step 2: Fuzzy match for typos ---
        # Compare lower-case against lower-case (the query was lower-cased
        # above) and remember where each distinct title first occurs.
        first_position = {}
        for pos, title in enumerate(lower_titles):
            first_position.setdefault(title, pos)
        close_titles = (
            get_close_matches(query, list(first_position), n=2, cutoff=0.6) if query else []
        )
        if close_titles:
            print("🔍 Possible typo detected — closest matches:")
            for title in close_titles:
                print(f"   → {titles.iloc[first_position[title]]}")
            seed_positions = [first_position[title] for title in close_titles]
        else:
            print("🧠 No title match — pure semantic mode.")

    # Seeds beyond `limit` could never appear in the output, and an unbounded
    # seed list would also blow up the text that gets embedded below.
    seed_positions = seed_positions[:limit]

    # --- Step 3: Prepare embeddings for semantic recommendation ---
    # Combine the query with the matched titles for richer context.
    seed_titles = [titles.iloc[pos] for pos in seed_positions]
    search_text = " ".join([query, *seed_titles])

    search_emb = encoder.encode(search_text, convert_to_tensor=True).to(device)
    # expand() broadcasts the single query vector to every item without copying.
    query_feat = search_emb.unsqueeze(0).expand(num_items, -1)
    item_ids = torch.arange(num_items, device=device)

    # NOTE(review): the query embedding is passed in the model's user-feature
    # slot, which the training cell fits on random vectors, and the raw
    # (unbounded) model score is blended with a 0-1 title ratio. Confirm that
    # this scoring is intended before relying on the ranking.
    with torch.no_grad():
        relevance, novelty = model_nn(item_emb_tensor, item_ids, query_feat)
        final_score = relevance + 0.5 * novelty
    semantic_scores = final_score.cpu().numpy()

    # --- Step 4: Combine semantic + fuzzy scores ---
    combined_scores = alpha * semantic_scores + (1 - alpha) * title_scores
    top_positions = np.argsort(combined_scores)[::-1][:limit]

    # --- Step 5: Build output ---
    # Matched titles first, then ranked items that are not already seeds. All
    # rows come from `df` itself, so every row carries the same columns.
    seed_set = set(seed_positions)
    ordered_positions = seed_positions + [
        int(pos) for pos in top_positions if int(pos) not in seed_set
    ]
    return df.iloc[ordered_positions].drop_duplicates("title").head(limit)


In [None]:
# Demo: an exact title, a misspelt title, and two free-text queries.
demo_queries = ("Inception", "Incepton", "space drama", "love in Paris")

for demo_query in demo_queries:
    print("\n🔍 Query:", demo_query)
    hits = smart_hybrid_search(demo_query, df, item_emb_tensor, model_nn, alpha=0.7)
    for _, hit in hits.iterrows():
        title, year, genres = hit["title"], hit["release_year"], hit["listed_in"]
        print(f"- {title} ({year}) — {genres}")



🔍 Query: Inception
🎯 Exact/Partial Title Found!
- Inception (2010) — Action & Adventure, Sci-Fi & Fantasy, Thrillers
- Centaurworld (2021) — Kids' TV, TV Comedies
- Marvel's Jessica Jones (2019) — Crime TV Shows, TV Action & Adventure, TV Dramas
- Hjørdis (2015) — International TV Shows, TV Comedies, TV Dramas
- BLAME! (2017) — Action & Adventure, Anime Features, International Movies
- Paid in Full (2002) — Action & Adventure, Dramas
- Pagglait (2021) — Comedies, Dramas, International Movies
- The Hard Way (2019) — Action & Adventure
- Quién te cantará (2018) — Dramas, Independent Movies, International Movies
- Team Kaylie (2020) — Kids' TV, TV Comedies

🔍 Query: Incepton
🔍 Possible typo detected — closest matches:
   → Inception
   → Lincoln
- Inception (2010) — Action & Adventure, Sci-Fi & Fantasy, Thrillers
- Lincoln (2012) — Dramas
- Centaurworld (2021) — Kids' TV, TV Comedies
- Marvel's Jessica Jones (2019) — Crime TV Shows, TV Action & Adventure, TV Dramas
- Hjørdis (2015) — Int