### Importing all the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix


Since I comlpeted preprocessing of data I will just Load the dataset 

In [2]:
df = pd.read_csv('movies_finalized_dataset.csv')

In [3]:
df

Unnamed: 0,movieId,title,genres,imdbId,imdb_url,avg_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,https://www.imdb.com/title/tt0114709/,3.897438
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,https://www.imdb.com/title/tt0113497/,3.275758
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,https://www.imdb.com/title/tt0113228/,3.139447
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,https://www.imdb.com/title/tt0114885/,2.845331
4,5,Father of the Bride Part II (1995),Comedy,113041,https://www.imdb.com/title/tt0113041/,3.059602
...,...,...,...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama,26812510,https://www.imdb.com/title/tt26812510/,4.000000
87581,292737,Shelter in Solitude (2023),Comedy|Drama,14907358,https://www.imdb.com/title/tt14907358/,1.500000
87582,292753,Orca (2023),Drama,12388280,https://www.imdb.com/title/tt12388280/,4.000000
87583,292755,The Angry Breed (1968),Drama,64027,https://www.imdb.com/title/tt0064027/,1.000000


In [4]:
df.describe()

Unnamed: 0,movieId,imdbId,avg_rating
count,87585.0,87585.0,87585.0
mean,157651.365519,2792840.0,3.005082
std,79013.402099,4278866.0,0.787255
min,1.0,1.0,0.5
25%,112657.0,94642.0,2.595652
50%,165741.0,492996.0,3.005082
75%,213203.0,3877296.0,3.5
max,292757.0,29081100.0,5.0


### Defining clean title

In [5]:
import re

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title   

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   movieId     87585 non-null  int64  
 1   title       87585 non-null  object 
 2   genres      87585 non-null  object 
 3   imdbId      87585 non-null  int64  
 4   imdb_url    87585 non-null  object 
 5   avg_rating  87585 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 4.0+ MB


In [7]:
df['clean_title'] = df['title'].apply(clean_title)

In [8]:
df

Unnamed: 0,movieId,title,genres,imdbId,imdb_url,avg_rating,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,https://www.imdb.com/title/tt0114709/,3.897438,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,https://www.imdb.com/title/tt0113497/,3.275758,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,https://www.imdb.com/title/tt0113228/,3.139447,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,https://www.imdb.com/title/tt0114885/,2.845331,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,113041,https://www.imdb.com/title/tt0113041/,3.059602,Father of the Bride Part II 1995
...,...,...,...,...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama,26812510,https://www.imdb.com/title/tt26812510/,4.000000,The Monroy Affaire 2022
87581,292737,Shelter in Solitude (2023),Comedy|Drama,14907358,https://www.imdb.com/title/tt14907358/,1.500000,Shelter in Solitude 2023
87582,292753,Orca (2023),Drama,12388280,https://www.imdb.com/title/tt12388280/,4.000000,Orca 2023
87583,292755,The Angry Breed (1968),Drama,64027,https://www.imdb.com/title/tt0064027/,1.000000,The Angry Breed 1968


### Building model using TfidVectorizer
This is used to convert a collection of raw text documents into matrix of numerical values

In [9]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(df["clean_title"])

In [10]:
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = df.iloc[indices].iloc[::-1]
    
    return results

In [11]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [12]:

movie_id = 89598

#def find_similar_movies(movie_id):
movie = df[df["movieId"] == movie_id]

In [13]:

ratings = pd.read_csv("ratings.csv")

In [14]:

ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [15]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [16]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [17]:

similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [18]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [19]:

all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [20]:

rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [21]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
7147,1.0,0.037570
7505,1.0,0.001184
196287,1.0,0.000011
89598,1.0,0.000011
1354,1.0,0.008963
...,...,...
202737,0.5,0.000006
203054,0.5,0.000033
203312,0.5,0.000017
204698,0.5,0.024982


In [22]:

rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [23]:

rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [24]:
rec_percentages.head(10).merge(df, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,imdbId,imdb_url,avg_rating,clean_title
39324,0.5,6e-06,89926.0,156079,April (1961),Comedy|Drama|Romance,247184,https://www.imdb.com/title/tt0247184/,3.083333,April 1961
48913,0.5,6e-06,89926.0,176517,Venus (2016),Documentary,6178628,https://www.imdb.com/title/tt6178628/,2.8,Venus 2016
16908,0.5,6e-06,89926.0,89007,Recollections of the Yellow House (Recordações...,Comedy|Drama,98175,https://www.imdb.com/title/tt0098175/,3.916667,Recollections of the Yellow House Recordaes da...
19140,0.5,6e-06,89926.0,99671,Fragments of an Alms-Film (Fragmentos de um Fi...,Comedy,68606,https://www.imdb.com/title/tt0068606/,2.875,Fragments of an AlmsFilm Fragmentos de um Film...
16816,0.5,6e-06,89926.0,88562,Our Beloved Month of August (Aquele Querido Mê...,Romance,1081929,https://www.imdb.com/title/tt1081929/,3.153846,Our Beloved Month of August Aquele Querido Ms ...
45194,0.5,6e-06,89926.0,168764,Blood Curse (2006),Horror|Mystery,489461,https://www.imdb.com/title/tt0489461/,2.833333,Blood Curse 2006
33200,0.5,6e-06,89926.0,142090,Noite Escura (2004),Crime|Drama,413053,https://www.imdb.com/title/tt0413053/,3.5,Noite Escura 2004
31598,0.5,6e-06,89926.0,138480,We Can Do That (2008),Comedy,1320297,https://www.imdb.com/title/tt1320297/,3.416667,We Can Do That 2008
42908,0.5,6e-06,89926.0,163917,São Jorge (2016),(no genres listed),4895668,https://www.imdb.com/title/tt4895668/,3.285714,So Jorge 2016
70839,0.5,6e-06,89926.0,228179,Winter Song (2015),Comedy|Drama,4880956,https://www.imdb.com/title/tt4880956/,4.5,Winter Song 2015


In [25]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    rec_percentages = rec_percentages.reset_index()
    
    recommendations = rec_percentages.head(10).merge(movies, on="movieId")
    
    # Add IMDb URLs assuming movies DataFrame has imdbId column
    recommendations["imdb_url"] = "https://www.imdb.com/title/tt" + recommendations["imdbId"].astype(str).str.zfill(7)
    
    return recommendations[["score", "title", "genres", "imdb_url"]]


In [27]:
movies = pd.read_csv("movies_finalized_dataset.csv")
ratings = pd.read_csv("ratings.csv")

In [28]:

import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 3:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            recommended_movies = find_similar_movies(movie_id)  # Call your recommendation logic here
            display(recommended_movies)  # Display the recommended movies DataFrame

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()