<a href="https://colab.research.google.com/github/sarankumar1325/NLP-BASICS/blob/main/Recommender_System_with_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Download the dataset
!wget https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv

--2025-06-23 10:46:05--  https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5698602 (5.4M) [text/csv]
Saving to: ‘tmdb_5000_movies.csv’


2025-06-23 10:46:05 (12.9 MB/s) - ‘tmdb_5000_movies.csv’ saved [5698602/5698602]



In [None]:
# Step 2: Imports
import pandas as pd
import matplotlib.pyplot as plt
import json

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Step 3: Load the CSV file
# Reads the file that the wget cell (Step 1) saved into the working directory.
df = pd.read_csv('tmdb_5000_movies.csv')


In [None]:
# Step 4: Parse genres and keywords from JSON
def parse_features(s):
    """Turn a JSON-encoded list of ``{"name": ...}`` objects into a token string.

    Whitespace inside each name is removed so that a multi-word name becomes a
    single token (e.g. "Science Fiction" -> "ScienceFiction"); the resulting
    tokens are joined with single spaces.

    Parameters
    ----------
    s : str
        JSON text expected to decode to a list of dicts with a 'name' key.

    Returns
    -------
    str
        Space-separated tokens, or "" when `s` cannot be parsed into the
        expected structure (missing value, invalid JSON, missing 'name', ...).
    """
    try:
        items = json.loads(s)
        return " ".join("".join(item['name'].split()) for item in items)
    except (TypeError, ValueError, KeyError, AttributeError):
        # Best-effort: malformed or missing cells contribute no tokens.
        # Only data-related errors are caught (json.JSONDecodeError is a
        # ValueError), so KeyboardInterrupt and genuine bugs still surface.
        return ""

In [None]:
# Step 5: Combine genres and keywords into one string
def genres_keywords_to_string(row):
    """Build one text document for a movie row.

    Parses the row's 'genres' and 'keywords' cells with parse_features and
    returns the two token strings joined by a single space (genres first).
    """
    return " ".join(parse_features(row[column]) for column in ('genres', 'keywords'))

# Apply the combiner to every row (axis='columns' passes each row to the function).
df['string'] = df.apply(genres_keywords_to_string, axis='columns')

In [None]:
# Step 6: TF-IDF Vectorization
# max_features=2000 caps the vocabulary size; per the scikit-learn docs the
# terms kept are the most frequent ones across the corpus.
tfidf = TfidfVectorizer(max_features=2000)
# X is the document-term matrix for df['string']; recommend() indexes it by
# row number, so row i is assumed to correspond to row i of df.
X = tfidf.fit_transform(df['string'])

In [None]:
# Step 7: Create index mapping from title to row number
# Looking up a title that appears more than once returns a Series of row
# numbers instead of a single value; recommend() handles that case.
movie_to_idx = pd.Series(df.index, index=df['title'])

In [None]:
# Step 8: Recommend movies similar to a given title
def recommend(title, top_n=5):
    """Return the movies whose TF-IDF vectors are most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up in `movie_to_idx`. When the title occurs more
        than once, the first matching row is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        A one-column ('title') frame with up to `top_n` rows, ordered from
        most to least similar. The query movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in the dataset.
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    try:
        idx = movie_to_idx[title]
    except KeyError:
        raise KeyError(f"Movie title not found in dataset: {title!r}") from None
    # Duplicate titles make the lookup return a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    query = X[idx]
    scores = cosine_similarity(query, X).flatten()
    ranked = scores.argsort()[::-1]
    # Remove the query row explicitly rather than assuming it sorts first:
    # with tied scores (identical or all-zero vectors) it may not be at
    # position 0, which would return the query itself and drop a real match.
    ranked = ranked[ranked != idx]
    return df.iloc[ranked[:top_n]][['title']]

In [None]:
# Step 9: Test the recommender
# Spot check: print the recommendations returned for one known title.
print("Recommendations for 'Scream 3':")
print(recommend("Scream 3"))

Recommendations for 'Scream 3':
                                     title
3902  Friday the 13th Part VI: Jason Lives
4628                        Graduation Day
4053      Friday the 13th: A New Beginning
4048                           The Calling
1084                       The Glimmer Man


In [None]:
# Spot check with a second title.
print("\nRecommendations for 'Mortal Kombat':")
print(recommend("Mortal Kombat"))



Recommendations for 'Mortal Kombat':
                                      title
1611            Mortal Kombat: Annihilation
1670                     DOA: Dead or Alive
3856            In the Name of the King III
1001  Street Fighter: The Legend of Chun-Li
2237                      Alone in the Dark


In [None]:
# Spot check with a third title.
print("\nRecommendations for 'Runaway Bride':")
print(recommend("Runaway Bride"))


Recommendations for 'Runaway Bride':
                           title
4115                  House of D
2325  My Big Fat Greek Wedding 2
3313                An Education
4604       It Happened One Night
2689          Our Family Wedding
