In [1]:
from flask import Flask, request, jsonify, render_template
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process
from sklearn.impute import KNNImputer

In [3]:
# Load the datasets
ratings = pd.read_csv('D:/doctor/ratings.csv')
movies = pd.read_csv('D:/doctor/movies.csv')


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1.260759e+09
1,1,1029,3.0,1.260759e+09
2,1,1061,3.0,1.260759e+09
3,1,1129,2.0,1.260759e+09
4,1,1172,4.0,1.260759e+09
...,...,...,...,...
100010,2,337,5.0,
100011,1,585,4.0,
100012,1,3911,4.0,
100013,1,724,5.0,


In [7]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary
9123,164977,The Gay Desperado (1936),Comedy


In [9]:
# Create user-item matrix for user-based collaborative filtering
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')


In [11]:
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,5.0,,5.0,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,4.0,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,4.0,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Use KNNImputer to handle missing values (non-rated movies)
imputer = KNNImputer(n_neighbors=5)  # Choose 5 nearest neighbors to impute
user_item_matrix_filled = pd.DataFrame(imputer.fit_transform(user_item_matrix),
                                       index=user_item_matrix.index,
                                       columns=user_item_matrix.columns)


In [14]:
user_item_matrix_filled

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,5.0,3.8,5.0,2.9,3.5,3.6,3.8,2.4,3.6,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
2,3.8,3.0,3.5,3.4,3.5,3.3,3.2,3.8,2.6,4.0,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
3,3.5,3.5,3.8,2.6,3.6,3.6,3.0,3.8,3.2,4.0,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
4,4.8,5.0,4.0,3.1,4.1,4.0,3.2,3.8,3.6,4.0,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
5,3.4,4.0,4.0,2.7,3.2,3.7,3.0,3.8,3.2,3.8,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,4.1,4.2,3.9,3.0,3.6,4.0,3.7,3.8,2.4,3.5,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
668,4.9,3.2,3.2,2.6,3.4,3.6,3.4,3.8,2.8,4.2,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
669,4.4,3.8,3.6,3.5,3.6,3.8,3.6,3.8,2.8,3.6,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0
670,4.0,4.3,3.6,2.6,3.8,3.7,3.4,3.8,3.4,3.9,...,2.5,0.5,3.0,1.0,1.5,5.0,4.5,5.0,3.0,5.0


In [17]:
# Calculate user similarity matrix using cosine similarity
user_similarity = cosine_similarity(user_item_matrix_filled)


In [19]:
user_similarity

array([[1.        , 0.99674214, 0.99669657, ..., 0.99704017, 0.9971166 ,
        0.99676749],
       [0.99674214, 1.        , 0.99708718, ..., 0.99704426, 0.99657197,
        0.99688034],
       [0.99669657, 0.99708718, 1.        , ..., 0.9974129 , 0.99663028,
        0.99702367],
       ...,
       [0.99704017, 0.99704426, 0.9974129 , ..., 1.        , 0.99698207,
        0.99741177],
       [0.9971166 , 0.99657197, 0.99663028, ..., 0.99698207, 1.        ,
        0.99714141],
       [0.99676749, 0.99688034, 0.99702367, ..., 0.99741177, 0.99714141,
        1.        ]])

In [23]:
# Create a DataFrame to store the user similarity scores
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)


In [25]:
user_similarity_df

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.996742,0.996697,0.995528,0.996688,0.996415,0.996594,0.996780,0.997145,0.997136,...,0.996987,0.997407,0.996543,0.995726,0.996486,0.997086,0.996627,0.997040,0.997117,0.996767
2,0.996742,1.000000,0.997087,0.995596,0.996607,0.996893,0.997460,0.996982,0.997096,0.997214,...,0.997168,0.997381,0.996765,0.995770,0.996917,0.997294,0.996715,0.997044,0.996572,0.996880
3,0.996697,0.997087,1.000000,0.995641,0.996704,0.996825,0.997347,0.997139,0.997109,0.997130,...,0.996836,0.997378,0.996912,0.995518,0.997448,0.997273,0.996970,0.997413,0.996630,0.997024
4,0.995528,0.995596,0.995641,1.000000,0.996581,0.995392,0.995564,0.996782,0.996018,0.996075,...,0.995833,0.996281,0.996715,0.995251,0.994966,0.996312,0.996142,0.996452,0.995939,0.997176
5,0.996688,0.996607,0.996704,0.996581,1.000000,0.996180,0.996380,0.997188,0.996909,0.997234,...,0.997109,0.997404,0.997533,0.996578,0.995828,0.996978,0.996346,0.997139,0.996898,0.997666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.997086,0.997294,0.997273,0.996312,0.996978,0.996671,0.997308,0.997331,0.997262,0.997213,...,0.997239,0.997511,0.996955,0.996201,0.997029,1.000000,0.997164,0.997502,0.997078,0.997106
668,0.996627,0.996715,0.996970,0.996142,0.996346,0.996752,0.997188,0.996982,0.997099,0.997292,...,0.996754,0.997139,0.996682,0.995491,0.996706,0.997164,1.000000,0.997205,0.996888,0.997073
669,0.997040,0.997044,0.997413,0.996452,0.997139,0.997019,0.997509,0.997298,0.997350,0.997440,...,0.997263,0.997490,0.997301,0.996076,0.996947,0.997502,0.997205,1.000000,0.996982,0.997412
670,0.997117,0.996572,0.996630,0.995939,0.996898,0.996180,0.996526,0.997007,0.997330,0.997124,...,0.997132,0.997241,0.996666,0.995858,0.996231,0.997078,0.996888,0.996982,1.000000,0.997141


In [27]:
# Function to get user-based recommendations
def get_user_based_recommendations(user_id, num_recommendations=10):
    # Get similarity scores for the given user
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]

    # Get the top similar user
    top_user = similar_users.index[0]

    # Get movies rated by the similar user but not rated by the target user
    user_rated_movies = user_item_matrix.loc[user_id]
    similar_user_rated_movies = user_item_matrix.loc[top_user]

    # Recommend movies that the similar user has rated highly but the target user hasn't rated
    recommendations = similar_user_rated_movies[(similar_user_rated_movies > 0) & (user_rated_movies.isna())]

    # Sort the recommendations by rating and return the top N recommendations
    recommended_movie_ids = recommendations.sort_values(ascending=False).head(num_recommendations).index
    recommended_movie_titles = movies[movies['movieId'].isin(recommended_movie_ids)]['title'].tolist()

    return recommended_movie_titles


In [33]:
get_user_based_recommendations(1)

['Forrest Gump (1994)',
 'Jurassic World (2015)',
 'Mad Max: Fury Road (2015)',
 "I'll See You in My Dreams (2015)",
 'Tomorrowland (2015)',
 'Spy (2015)',
 'Inside Out (2015)',
 'Ted 2 (2015)',
 'Minions (2015)',
 'Southpaw (2015)']

In [37]:
similar_users = user_similarity_df[1].sort_values(ascending=False)[1:]

In [39]:
similar_users

userId
663    0.997407
44     0.997389
93     0.997311
503    0.997300
437    0.997279
         ...   
213    0.991608
457    0.990605
564    0.989240
547    0.988675
15     0.981518
Name: 1, Length: 670, dtype: float64

In [41]:
top_user = similar_users.index[0]

In [43]:
top_user

663

In [45]:
user_rated_movies = user_item_matrix.loc[1]

In [47]:
user_rated_movies

movieId
1         4.0
2         5.0
3         NaN
4         5.0
5         NaN
         ... 
161944    NaN
162376    NaN
162542    NaN
162672    NaN
163949    NaN
Name: 1, Length: 9066, dtype: float64

In [49]:
similar_user_rated_movies = user_item_matrix.loc[top_user]

In [51]:
similar_user_rated_movies

movieId
1         4.0
2         NaN
3         NaN
4         NaN
5         NaN
         ... 
161944    NaN
162376    NaN
162542    NaN
162672    NaN
163949    NaN
Name: 663, Length: 9066, dtype: float64

In [53]:
 recommendations = similar_user_rated_movies[(similar_user_rated_movies > 0) & (user_rated_movies.isna())]

In [55]:
recommendations

movieId
356       4.0
480       3.5
593       4.0
1270      4.0
2571      4.0
111781    3.5
117529    4.0
120799    3.5
122882    4.0
122900    3.5
127124    4.0
128512    3.0
132046    4.0
133419    3.5
134246    3.0
134368    4.0
134528    3.5
134783    3.0
134853    4.0
135861    4.0
135887    5.0
136598    3.5
137595    3.0
138204    3.5
139642    4.0
Name: 663, dtype: float64

In [59]:
recommended_movie_ids = recommendations.sort_values(ascending=False).head(10).index
recommended_movie_titles = movies[movies['movieId'].isin(recommended_movie_ids)]['title'].tolist()


In [61]:
recommended_movie_titles

['Forrest Gump (1994)',
 'Jurassic World (2015)',
 'Mad Max: Fury Road (2015)',
 "I'll See You in My Dreams (2015)",
 'Tomorrowland (2015)',
 'Spy (2015)',
 'Inside Out (2015)',
 'Ted 2 (2015)',
 'Minions (2015)',
 'Southpaw (2015)']