### Importing libraries

In [1]:
import nltk
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


### Downloading NLTK resources

In [None]:

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/srinityak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/srinityak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/srinityak/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
df = pd.read_csv('IMDb_Movies.csv')
df = df[df['overview'].notna()].reset_index(drop=True)

### Processing the input text - converting to lowercase, removing numbers and special characters using lemmatizer and getting a string

In [None]:
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize(text):
    text = text.lower()  
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(tokens)

df["overview"] = df["overview"].apply(clean_and_lemmatize)

### Computing TF-IDF matrix using TfidfVectorizer

In [5]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.85,
    min_df=3,
    ngram_range=(1, 2),
    sublinear_tf=True
)
tfidf_matrix = vectorizer.fit_transform(df["overview"])

### Movie Recommendation - 
Computing the cosine similarity between two vectors. And then computing the TF-IDF vector of the user-input text and comparing it with each movie's vector. The top_5 movies with the highest cosine similarity scores are returned.

In [7]:
def get_recommendations(user_query, top_n=5):
    user_query_cleaned = clean_and_lemmatize(user_query)
    query_vec = vectorizer.transform([user_query_cleaned])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    normalized_similarities = similarities / similarities.max()
    top_indices = normalized_similarities.argsort()[::-1][:top_n]
    recommendations = df.iloc[top_indices].copy()
    recommendations["similarity_score"] = normalized_similarities[top_indices]
    return recommendations

user_preference = "I like action movies"
recommendations = get_recommendations(user_preference, top_n=5)
print(recommendations[["title","similarity_score"]])

                 title  similarity_score
394   Last Action Hero          1.000000
471               Nine          0.698250
265         The Island          0.570009
24           King Kong          0.543391
203  The Bourne Legacy          0.521596
