### Importing Packages 

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import ipywidgets as widgets
from IPython.display import display
import ipywidgets as widgets

### Reading the dataset

In [2]:
# Load the movie catalogue; the relative path is resolved against the notebook's working directory.
movies = pd.read_csv("movies_finalized_dataset1.csv")

In [3]:
# Preview the first rows to confirm the expected columns were loaded.
movies.head()

Unnamed: 0,movieId,title,genres,imdbId,imdb_url,avg_rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,https://www.imdb.com/title/tt0114709/,3.897438,children Disney animation children Disney Disn...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,https://www.imdb.com/title/tt0113497/,3.275758,Robin Williams fantasy Robin Williams time tra...
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,https://www.imdb.com/title/tt0113228/,3.139447,comedinha de velhinhos engraÃƒÂ§ada comedinha ...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,https://www.imdb.com/title/tt0114885/,2.845331,characters slurs based on novel or book chick ...
4,5,Father of the Bride Part II (1995),Comedy,113041,https://www.imdb.com/title/tt0113041/,3.059602,Fantasy pregnancy remake family Steve Martin s...


In [4]:
def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space (so punctuation such as brackets, commas and
    hyphens disappears, while spacing and case are kept as-is)."""
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [5]:
# Add a punctuation-free copy of every title; the TF-IDF index is built from this column.
movies["clean_title"] = movies["title"].apply(clean_title)

In [6]:
# Display the frame to check the new clean_title column next to the original title.
movies

Unnamed: 0,movieId,title,genres,imdbId,imdb_url,avg_rating,tag,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,https://www.imdb.com/title/tt0114709/,3.897438,children Disney animation children Disney Disn...,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,https://www.imdb.com/title/tt0113497/,3.275758,Robin Williams fantasy Robin Williams time tra...,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,https://www.imdb.com/title/tt0113228/,3.139447,comedinha de velhinhos engraÃƒÂ§ada comedinha ...,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,https://www.imdb.com/title/tt0114885/,2.845331,characters slurs based on novel or book chick ...,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,113041,https://www.imdb.com/title/tt0113041/,3.059602,Fantasy pregnancy remake family Steve Martin s...,Father of the Bride Part II 1995
...,...,...,...,...,...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama,26812510,https://www.imdb.com/title/tt26812510/,4.000000,,The Monroy Affaire 2022
87581,292737,Shelter in Solitude (2023),Comedy|Drama,14907358,https://www.imdb.com/title/tt14907358/,1.500000,,Shelter in Solitude 2023
87582,292753,Orca (2023),Drama,12388280,https://www.imdb.com/title/tt12388280/,4.000000,,Orca 2023
87583,292755,The Angry Breed (1968),Drama,64027,https://www.imdb.com/title/tt0064027/,1.000000,,The Angry Breed 1968


### Initialize vectorizer with default settings

In [7]:


# TF-IDF vectorizer with scikit-learn's default settings (no custom arguments are passed).
vectorizer = TfidfVectorizer()  
# Learn the vocabulary from the cleaned titles and vectorize them: one row per movie,
# in the same order as `movies`, which is what lets search() map positions back with .iloc.
tfidf = vectorizer.fit_transform(movies["clean_title"]) 

### Content based search and recommendation using cosine similarity

In [27]:


def search(title, top_n=5):
    """Return the catalogue rows whose titles best match ``title``.

    The query is normalised with ``clean_title``, vectorized with the fitted
    ``vectorizer``, and compared against every row of ``tfidf`` by cosine
    similarity. Row positions in ``tfidf`` correspond to row positions in
    ``movies``, so the best-scoring positions are looked up with ``.iloc``.

    Parameters
    ----------
    title : str
        Free-text movie title to look for.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from the most similar
        title to the least similar one.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never request more rows than exist: argpartition raises when kth is out of bounds.
    k = min(top_n, len(similarity))
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest scores end up in the last
    # k slots; their relative order is arbitrary. Sort just those k candidates
    # so that row 0 really is the best match.
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(similarity[top])[::-1]]
    results = movies.iloc[top]

    return results

### Provides a text box for searching a particular movie. It also displays the movies whose titles are most similar

In [28]:

# Text box pre-filled with an example title, plus an output area for the matches.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Redraw the list of matching movies whenever the text box value changes."""
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        # Queries of 5 characters or fewer are ignored; the output just stays cleared.
        if len(title) <= 5:
            return
        display(search(title))


# Only changes to the widget's `value` trait trigger the handler.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [10]:
# Example target for the step-by-step walkthrough in the following cells.
movie_id = 89745

# Catalogue row(s) for the target movie.
movie = movies[movies["movieId"] == movie_id]

In [11]:
# Load the user ratings; the relative path is resolved against the notebook's working directory.
ratings = pd.read_csv("ratings.csv")

In [12]:
# Check the column types before filtering on movieId / rating.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [13]:
# Users who rated the target movie strictly above 4 ("similar users").
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [14]:
# movieId of every rating above 4 given by those similar users (one entry per rating row).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [15]:
# Number of >4 ratings per movie divided by the number of similar users.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies whose share among similar users exceeds 10%.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [16]:
# Every >4 rating, from any user, for the shortlisted movies.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [17]:
# Share of users who liked each shortlisted movie. The denominator counts only users
# present in `all_users` (at least one >4 rating on a shortlisted movie), not every user in `ratings`.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [18]:
# Put both shares side by side, aligned on the movieId index.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [19]:
# Inspect the two shares before scoring.
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
89745,1.000000,0.045973
58559,0.582796,0.171355
59315,0.538966,0.061651
79132,0.531583,0.158802
7153,0.498301,0.187755
...,...,...
5218,0.101840,0.024993
106072,0.101254,0.005899
4022,0.100785,0.039028
118696,0.100551,0.011680


In [20]:
# Score = share among similar users relative to the share in the broader group;
# larger values mean the movie is liked disproportionately by the similar users.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [21]:
# Highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [22]:
# Attach title and metadata to the ten highest-scoring movies (index of the scores = movieId).
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,imdbId,imdb_url,avg_rating,tag,clean_title
17071,1.0,0.045973,21.751904,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,848228,https://www.imdb.com/title/tt0848228/,3.733574,superhero Marvel predictable action Captain Am...,Avengers The 2012
25092,0.243408,0.01417,17.178215,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,2395427,https://www.imdb.com/title/tt2395427/,3.487106,joss whedon Brian Tyler Danny Elfman artificia...,Avengers Age of Ultron 2015
20520,0.101254,0.005899,17.163146,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,1981115,https://www.imdb.com/title/tt1981115/,3.185643,Brian Tyler aliens bad physics bad science Mar...,Thor The Dark World 2013
19687,0.217743,0.013922,15.640495,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,1300854,https://www.imdb.com/title/tt1300854/,3.462625,Marvel Robert Downey Jr. superhero Robert Down...,Iron Man 3 2013
16728,0.223251,0.014374,15.531251,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,458339,https://www.imdb.com/title/tt0458339/,3.472748,superhero cheesy Alan Silvestri superhero The ...,Captain America The First Avenger 2011
16315,0.178132,0.011659,15.278602,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,800369,https://www.imdb.com/title/tt0800369/,3.315509,superhero action chris hemsworth funny superhe...,Thor 2011
21357,0.303293,0.020845,14.549994,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,1843866,https://www.imdb.com/title/tt1843866/,3.685033,Captain America Marvel Cinematic Universe supe...,Captain America The Winter Soldier 2014
25105,0.24915,0.017267,14.428876,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,3498820,https://www.imdb.com/title/tt3498820/,3.695735,Henry Jackman captain america comic book iron ...,Captain America Civil War 2016
14629,0.236611,0.016971,13.941935,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,1228705,https://www.imdb.com/title/tt1228705/,3.42251,action hot ladies Iron Man Robert Downey Jr. r...,Iron Man 2 2010
25095,0.151764,0.010985,13.814966,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,478970,https://www.imdb.com/title/tt0478970/,3.54219,bald nemesis burglary entertaining heist impro...,AntMan 2015


In [23]:
# List the catalogue columns available for the final recommendation table.
print(movies.columns)


Index(['movieId', 'title', 'genres', 'imdbId', 'imdb_url', 'avg_rating', 'tag',
       'clean_title'],
      dtype='object')


In [24]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies liked by the users who liked ``movie_id``.

    Works on the notebook-level ``ratings`` and ``movies`` frames:

    1. "Similar users" are those who rated ``movie_id`` above ``min_rating``.
    2. For each movie those users also rated above ``min_rating``, compute the
       share of similar users who did so and keep movies above ``min_share``.
    3. Compute the same share among all users who rated at least one of the
       kept movies above ``min_rating``.
    4. ``score`` is the ratio of the two shares; higher means the movie is
       liked disproportionately by the similar users.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating counts as "liked" only when strictly greater than this.
    min_share : float, default 0.10
        Minimum share of similar users (exclusive) for a movie to be kept.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title``, ``genres``, ``imdb_url``, sorted by
        descending score. Empty when nobody rated ``movie_id`` highly enough.
    """
    # Every step only looks at "liked" ratings, so filter the ratings table once
    # instead of rebuilding the same boolean mask for each step.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero below.
        return pd.DataFrame(columns=["score", "title", "genres", "imdb_url"])

    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Align both shares on movieId.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres", "imdb_url"]]


In [25]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example title, plus an output area for the recommendations.
movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

# Named differently from the title-search handler (`on_type`) defined in the
# earlier search cell, so this definition does not silently shadow that one.
def on_recommendation_type(data):
    """Redraw the recommendation table whenever the text box value changes.

    The typed title is resolved to a movie via ``search`` (its first returned
    row is used), and that movie's id is passed to ``find_similar_movies``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Queries of 3 characters or fewer are ignored; the output just stays cleared.
        if len(title) > 3:
            results = search(title)
            if results.empty:
                # No catalogue row to base a recommendation on.
                return
            # Read the id from the column rather than from a mixed-dtype row.
            movie_id = results["movieId"].iloc[0]
            display(find_similar_movies(movie_id))

# Only changes to the widget's `value` trait trigger the handler.
movie_name_input.observe(on_recommendation_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

### Evaluating the recommendation model
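This snippet computes the `Root mean square error`, `Mean absolute error`, `Precision score`, `Recall score`, and `F1 score` on small hand-written example values (they are not derived from the recommender's output).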

In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score
import numpy as np

# --- Inputs -----------------------------------------------------------------
# All four lists are hand-written examples; none of them is produced by the
# search or recommendation functions defined earlier in the notebook.

# Example: true and predicted ratings (for rating prediction evaluation)
y_true = [4, 3, 5, 2, 1]  # actual user ratings
y_pred = [3.5, 3, 4, 2, 1.2]  # predicted ratings by model

# Example: binary relevance for top-N evaluation (1 if item relevant, 0 otherwise)
y_true_binary = [1, 0, 1, 1, 0]  # ground truth relevance
y_pred_binary = [1, 0, 1, 0, 0]  # relevance predicted by recommending top items

# --- Metrics ----------------------------------------------------------------
# Rating-prediction errors; RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

# Classification-style metrics on the binary relevance labels.
precision = precision_score(y_true_binary, y_pred_binary)
recall = recall_score(y_true_binary, y_pred_binary)
f1 = f1_score(y_true_binary, y_pred_binary)

# --- Report -----------------------------------------------------------------
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


RMSE: 0.5079370039680118
MAE: 0.33999999999999997
Precision: 1.0
Recall: 0.6666666666666666
F1 Score: 0.8
