In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
df=pd.read_csv("/content/Movies.csv")
df.head()

Unnamed: 0,id,title,genres,language,user_score,runtime_hour,runtime_min,release_date,vote_count
0,1,The Shawshank Redemption,"Drama, Crime",en,8.7,2,22,1994-09-23,27070
1,2,The Godfather,"Drama, Crime",en,8.7,2,55,1972-03-14,20563
2,3,The Godfather Part II,"Drama, Crime",en,8.6,3,22,1974-12-20,12403
3,4,Schindler's List,"Drama, History, War",en,8.6,3,15,1993-12-15,15810
4,5,12 Angry Men,Drama,en,8.5,1,37,1957-04-10,8611


In [6]:
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
genres,0
language,0
user_score,0
runtime_hour,0
runtime_min,0
release_date,0
vote_count,0


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
genre_matrix = tfidf.fit_transform(df['genres'])
cosine_sim = cosine_similarity(genre_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim,index=df['title'],columns=df['title'])
cosine_sim_df

title,The Shawshank Redemption,The Godfather,The Godfather Part II,Schindler's List,12 Angry Men,Spirited Away,The Wild Robot,Dilwale Dulhania Le Jayenge,The Dark Knight,The Green Mile,...,The Open House,Lucky Luke and the Daltons,Hellraiser: Revelations,The Star Wars Holiday Special,Battlefield Earth,Alone in the Dark,Disaster Movie,House of the Dead,Dragonball Evolution,Birdemic: Shock and Terror
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Shawshank Redemption,1.000000,1.000000,1.000000,0.153222,0.531158,0.000000,0.000000,0.248219,0.709840,0.732796,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
The Godfather,1.000000,1.000000,1.000000,0.153222,0.531158,0.000000,0.000000,0.248219,0.709840,0.732796,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
The Godfather Part II,1.000000,1.000000,1.000000,0.153222,0.531158,0.000000,0.000000,0.248219,0.709840,0.732796,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
Schindler's List,0.153222,0.153222,0.153222,1.000000,0.288467,0.000000,0.000000,0.134806,0.108763,0.112280,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
12 Angry Men,0.531158,0.531158,0.531158,0.288467,1.000000,0.000000,0.000000,0.467317,0.377037,0.389230,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Alone in the Dark,0.000000,0.000000,0.000000,0.000000,0.000000,0.359047,0.000000,0.000000,0.254952,0.434439,...,0.459957,0.000000,0.587718,0.202280,0.213266,1.000000,0.0,0.657819,0.466721,0.368878
Disaster Movie,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.518991,0.000000,0.000000,...,0.000000,0.323361,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000
House of the Dead,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.533665,0.000000,...,0.833941,0.000000,0.652654,0.000000,0.236829,0.657819,0.0,1.000000,0.368570,0.668808
Dragonball Evolution,0.000000,0.000000,0.000000,0.000000,0.000000,0.256001,0.430881,0.000000,0.342645,0.309755,...,0.207491,0.179259,0.000000,0.419703,0.825635,0.466721,0.0,0.368570,1.000000,0.166405


In [22]:
def recommend_movies(movie_title,df,cosine_sim):
    # Create a mapping of movie titles to their index
    indices = pd.Series(df.index, index = df['title']).drop_duplicates()

    #  # Check if the movie is in the dataset
    if movie_title not in indices:
        return "Movie not found in dataset. Please try another title."

    # return index of the movie title
    idx = indices[movie_title]

    # retrieve the index with similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # return  top 5 movies based on similarity scores
    sim_scores = sorted(sim_scores,key = lambda x : x[1], reverse=True)[1:6]

    recommendations = [df.iloc[i[0]]['title'] for i in sim_scores]
    return recommendations

In [24]:
movie_name = "The Godfather"
recommendations = recommend_movies(movie_name, df, cosine_sim)
print(f"Movies similar to '{movie_name}':\n", recommendations)

Movies similar to 'The Godfather':
 ['The Godfather', 'The Godfather Part II', 'GoodFellas', 'City of God', 'Once Upon a Time in America']
