In [1]:
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie catalogue. The path is relative, so movies.csv has to sit in
# the notebook's working directory.
movies = pd.read_csv("movies.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
def clean_title(title):
    """Reduce a title to ASCII letters, digits, and spaces.

    The text is first decomposed with Unicode NFKD so that an accented letter
    splits into its base letter plus a combining mark. The filter then drops
    the mark and keeps the base letter, instead of deleting the whole
    character (an e-acute becomes "e" rather than disappearing).

    Args:
        title: The raw title string.

    Returns:
        The title with every character outside ``[a-zA-Z0-9 ]`` removed.
        Spaces are kept as-is, so removing a symbol that sat between two
        spaces leaves both spaces in place.
    """
    decomposed = unicodedata.normalize("NFKD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)

In [5]:
# Store a normalised copy of every title next to the original; the TF-IDF
# index is built from this column.
movies["clean_title"] = movies["title"].map(clean_title)

In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
# Build the title search index. ngram_range=(1, 2) requests both single-word
# and two-word features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# Fit on the cleaned titles and keep the resulting matrix; search() compares
# each query vector against it.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [8]:
def search(title, top_n=5):
    """Return the catalogue rows whose titles best match ``title``, best first.

    The query is normalised with ``clean_title``, transformed with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text movie title.
        top_n: Maximum number of rows to return (default 5).

    Returns:
        Rows of ``movies`` ordered by descending similarity. Rows whose
        similarity is zero are left out, so the result is empty when the
        query shares no term with any indexed title.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees WHICH entries are the largest, not their
    # relative order, so the candidates are sorted explicitly afterwards.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]

    # A zero score means no shared term at all - that is not a match.
    ranked = ranked[similarity[ranked] > 0]
    return movies.iloc[ranked]

In [9]:
# Load the user ratings. The path is relative, so ratings.csv has to sit in
# the notebook's working directory.
ratings=pd.read_csv("ratings.csv")

In [10]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [11]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [12]:
# Movie used for the step-by-step walkthrough below (movieId 1 is
# "Toy Story (1995)" in the catalogue listing).
movie_id = 1

In [13]:
# Users who rated the chosen movie 4 or higher.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] >= 4),
    "userId",
].unique()

In [14]:
similar_users

array([     3,      5,      8, ..., 162530, 162533, 162534], dtype=int64)

In [15]:
# Every movie those users rated above 4 (one entry per rating).
# NOTE(review): the cut-off here is strictly > 4, whereas similar_users is
# selected with >= 4 and find_similar_movies uses >= 4 throughout - confirm
# which threshold is intended.
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [16]:
similar_user_recs

255             29
256             32
257             50
261            214
263            293
             ...  
24999248    101962
24999269    109487
24999326    164179
24999329    165549
24999348    177593
Name: movieId, Length: 2321248, dtype: int64

In [17]:
similar_user_recs.value_counts()

movieId
1         18835
318       15884
260       13870
296       13324
356       12169
          ...  
59290         1
44317         1
188811        1
188685        1
88934         1
Name: count, Length: 22464, dtype: int64

In [18]:
# Turn the per-movie counts into the share of similar users who liked each
# movie, then keep only movies liked by more than 10% of them.
# NOTE: this cell overwrites its own input, so running it twice in a row
# gives a different result; re-run the cell that builds similar_user_recs first.
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))
similar_user_recs = similar_user_recs.loc[similar_user_recs.gt(.1)]

In [19]:
similar_user_recs

movieId
1       0.499483
318     0.421226
260     0.367817
296     0.353337
356     0.322708
          ...   
1148    0.103609
1527    0.102867
4995    0.102522
778     0.102495
34      0.100162
Name: count, Length: 90, dtype: float64

In [20]:
# Ratings above 4 given by anyone to one of the candidate movies.
# NOTE(review): strictly > 4, like the similar_user_recs cell, but unlike the
# >= 4 used by similar_users and by find_similar_movies.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [21]:
# Share of those users who liked each candidate movie. userId is int64 (see
# ratings.dtypes), so nunique() counts the same values as len(unique()).
all_users_recs = all_users["movieId"].value_counts().div(all_users["userId"].nunique())

In [22]:
all_users_recs

movieId
318     0.345497
296     0.287399
2571    0.246370
356     0.237518
593     0.228071
          ...   
3114    0.054220
2716    0.053892
34      0.052729
1073    0.049232
1148    0.047922
Name: count, Length: 90, dtype: float64

In [23]:
# Put both shares side by side, aligned on movieId.
rec_percentages = pd.concat(
    [similar_user_recs, all_users_recs], axis="columns"
).set_axis(["similar", "all"], axis="columns")

In [24]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.499483,0.125923
318,0.421226,0.345497
260,0.367817,0.224334
296,0.353337,0.287399
356,0.322708,0.237518
...,...,...
1148,0.103609,0.047922
1527,0.102867,0.066762
4995,0.102522,0.076403
778,0.102495,0.075473


In [25]:
# Ratio of the two shares: values above 1 mean the movie is liked more often
# by the similar users than by the comparison group.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [26]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [27]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.499483,0.125923,3.966586
3114,0.170357,0.054220,3.141967
4886,0.166645,0.071489,2.331060
6377,0.166565,0.072960,2.282977
1073,0.111591,0.049232,2.266621
...,...,...,...
58559,0.180461,0.147871,1.220392
318,0.421226,0.345497,1.219189
4973,0.136148,0.113481,1.199744
2959,0.252380,0.218792,1.153517


In [28]:
# Attach title and genres to the ten best-scoring movies.
pd.merge(rec_percentages.iloc[:10], movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,0.499483,0.125923,3.966586,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.170357,0.05422,3.141967,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
4780,0.166645,0.071489,2.33106,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6258,0.166565,0.07296,2.282977,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1047,0.111591,0.049232,2.266621,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
8246,0.154207,0.069109,2.231373,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
580,0.151449,0.068159,2.221989,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
1120,0.103609,0.047922,2.162033,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,Wallace Gromit The Wrong Trousers 1993
359,0.18473,0.086585,2.133522,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
587,0.12806,0.060551,2.1149,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991


In [29]:
def find_similar_movies(movie_id, *, min_rating=4, min_share=0.1, top_n=10,
                        ratings_df=None, movies_df=None):
    """Recommend movies favoured by the users who liked ``movie_id``.

    Steps:
      1. "Similar users" are those who rated ``movie_id`` at least
         ``min_rating``.
      2. Candidates are the movies that more than ``min_share`` of those
         users also rated at least ``min_rating``.
      3. For each candidate, the share of similar users who liked it is
         divided by the share among all users who liked any candidate.
         A high ratio means the movie is liked disproportionately by the
         similar users.

    Args:
        movie_id: Identifier of the reference movie.
        min_rating: Lowest rating that counts as "liked" (default 4).
        min_share: Candidates must be liked by more than this fraction of
            the similar users (default 0.1).
        top_n: Number of recommendations to return (default 10).
        ratings_df: Frame with ``userId``, ``movieId`` and ``rating``
            columns. Defaults to the notebook-level ``ratings``.
        movies_df: Frame with ``movieId``, ``title`` and ``genres`` columns.
            Defaults to the notebook-level ``movies``.

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, best
        score first. Empty when nobody liked ``movie_id`` or no candidate
        clears ``min_share``.
    """
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movies

    no_recommendations = pd.DataFrame(columns=["score", "title", "genres"])
    liked = ratings_df["rating"] >= min_rating

    similar_users = ratings_df.loc[
        (ratings_df["movieId"] == movie_id) & liked, "userId"
    ].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero.
        return no_recommendations

    similar_user_recs = ratings_df.loc[
        ratings_df["userId"].isin(similar_users) & liked, "movieId"
    ]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]
    if similar_user_recs.empty:
        return no_recommendations

    all_users = ratings_df[ratings_df["movieId"].isin(similar_user_recs.index) & liked]
    all_user_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    # concat aligns the two series on movieId.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(
        movies_df, left_index=True, right_on="movieId"
    )[["score", "title", "genres"]]


In [None]:
# Interactive prompt: read a title, look it up, and print recommendations
# until the user types 'exit' or the input stream ends.
while True:
    try:
        input_movie_title = input("Enter a movie title (or type 'exit' to quit): ")
    except EOFError:
        # No more input (e.g. non-interactive run) - stop instead of crashing.
        break

    # Tolerate stray whitespace around the command or the title.
    input_movie_title = input_movie_title.strip()
    if input_movie_title.lower() == 'exit':
        break  # Exit the loop if the user enters 'exit'

    # A blank or punctuation-only query has nothing to match on; searching
    # with it would produce recommendations for an unrelated movie.
    query = clean_title(input_movie_title).strip()
    if not query:
        print("Please enter a movie title.")
        continue

    # Search for the movie in the dataset
    results = search(input_movie_title)

    if not results.empty:
        # Use the first row returned by search() and say which one it was,
        # so the user can tell what the recommendations are based on.
        matched_movie = results.iloc[0]
        input_movie_id = matched_movie["movieId"]
        print(f"Recommendations based on: {matched_movie['title']}")

        # Get recommendations
        recommendations = find_similar_movies(input_movie_id)
        print(recommendations[["score", "title", "genres"]])
    else:
        print("Movie not found. Here are some suggestions:")
        # Suggest movies whose cleaned title contains the query as plain
        # text (regex=False: the query is not a pattern).
        suggestions = movies[movies["clean_title"].str.contains(query, case=False, regex=False)]
        print(suggestions[["title", "genres"]])

Enter a movie title (or type 'exit' to quit):  toy story


          score                             title  \
3021   9.838737                Toy Story 2 (1999)   
2595   5.366874                     Tarzan (1999)   
2264   5.160239              Bug's Life, A (1998)   
2669   4.820735            Iron Giant, The (1999)   
3912   4.676380  Emperor's New Groove, The (2000)   
2203   4.656684                       Antz (1998)   
14813  4.511727                Toy Story 3 (2010)   
3650   4.451937                Chicken Run (2000)   
3301   4.364812          Muppet Movie, The (1979)   
1998   4.337480                  Peter Pan (1953)   

                                                 genres  
3021        Adventure|Animation|Children|Comedy|Fantasy  
2595                 Adventure|Animation|Children|Drama  
2264                Adventure|Animation|Children|Comedy  
2669          Adventure|Animation|Children|Drama|Sci-Fi  
3912        Adventure|Animation|Children|Comedy|Fantasy  
2203        Adventure|Animation|Children|Comedy|Fantasy  
14813  Adv