In [1]:
#Reference: https://towardsdatascience.com/how-to-build-a-movie-recommendation-system-67e321339109
#Dataset: https://grouplens.org/datasets/movielens/


In [2]:
import pandas as pd
data = pd.read_csv('movie_ratings.csv',nrows =1000)
data.head()

Unnamed: 0,userId,movieId,rating
0,1,61,4
1,1,189,3
2,1,33,4
3,1,160,4
4,1,20,4


In [3]:
data.shape

(1000, 3)

In [4]:
data_train = data

In [5]:
data_train.head()

Unnamed: 0,userId,movieId,rating
0,1,61,4
1,1,189,3
2,1,33,4
3,1,160,4
4,1,20,4


In [6]:
import numpy as np
from scipy.sparse import csr_matrix

train_sparse_matrix = csr_matrix((data_train.rating.values, (data_train.userId.values, data_train.movieId.values)))

In [7]:
#user_movie_interaction_train = data_train.pivot(index="userId", columns="movieId", values="rating").fillna(0)

In [8]:
train_averages = dict()
# get the global average of ratings in our train set.
train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global_avg'] = train_global_average
train_averages

{'global_avg': 3.554}

In [9]:
display(train_averages)

{'global_avg': 3.554}

In [10]:
def get_average_ratings(sparse_matrix, of_users):
    
    ax = 1 if of_users else 0 # 1 - User axes,0 - Movie axes
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    is_rated = sparse_matrix!=0
    no_of_ratings = is_rated.sum(axis=ax).A1
    u,m = sparse_matrix.shape
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i] for i in range(u if of_users else m) if no_of_ratings[i] !=0}
    return average_ratings

In [11]:
train_averages['user_avg'] = get_average_ratings(train_sparse_matrix, of_users=True)

In [12]:
train_averages['movie_avg'] = get_average_ratings(train_sparse_matrix, of_users=False)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def gettopsimuserrating(user,movie):
    
        # compute the similar Users of the "user"
        user=int(user)
        movie=int(movie)
        user_sim = cosine_similarity(train_sparse_matrix[user], train_sparse_matrix).ravel()

        #print(user_sim)
        top_sim_users = user_sim.argsort()[::-1][1:] # we are ignoring 'The User' from its similar users.
        #print(top_sim_users)
        # # get the ratings of most similar users for this movie
        top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
        # # we will make it's length "5" by adding movie averages to
        #print(top_ratings)
        top_sim_users_ratings = []
        top_sim_users_ratings.extend(top_ratings[top_ratings != 0][:5])
        top_sim_users_ratings.extend([train_averages['movie_avg'][movie]]*(5 -len(top_sim_users_ratings)))
        top_sim_users_ratings_new = []
        top_sim_users_ratings_new.append(user)
        top_sim_users_ratings_new.append(movie)
        top_sim_users_ratings_new.extend(top_sim_users_ratings)
            
        return(top_sim_users_ratings_new)

In [14]:

def gettopsimmovierating(user,movie):
        # compute the similar movies of the "movie"
        user=int(user)
        movie=int(movie)
        movie_sim = cosine_similarity(train_sparse_matrix[:,movie].T,
        train_sparse_matrix.T).ravel()
        top_sim_movies = movie_sim.argsort()[::-1][1:]
        # we are ignoring 'The User' from its similar users.
        # get the ratings of most similar movie rated by this user
        top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
        # we will make it's length "5" by adding user averages to
        top_sim_movies_ratings = []
        top_sim_movies_ratings.extend(top_ratings[top_ratings != 0][:5])
        top_sim_movies_ratings.extend([train_averages['user_avg'][user]]*(5-len(top_sim_movies_ratings)))
        top_sim_movies_ratings_new = []
        top_sim_movies_ratings_new.append(user)
        top_sim_movies_ratings_new.append(movie)
        top_sim_movies_ratings_new.extend(top_sim_movies_ratings)
        
        
        return(top_sim_movies_ratings_new)

In [15]:
print(data_train.shape[0])

1000


In [16]:
top_sim_user_rating =[]
top_sim_movie_rating = []

array_size = data_train.shape[0]

print(array_size)
for i in range(0,array_size,1):
    #print(i)
    #print(gettopsimuserrating(data_train.iloc[i].userId,data_train.iloc[i].movieId))
    top_sim_user_rating.append(gettopsimuserrating(data_train.iloc[i].userId,data_train.iloc[i].movieId))
    top_sim_movie_rating.append(gettopsimmovierating(data_train.iloc[i].userId,data_train.iloc[i].movieId))
    

1000


In [17]:
top_sim_user_rating_df = pd.DataFrame(top_sim_user_rating,columns =['userId','movieId','su1','su2','su3','su4','su5'])
top_sim_movie_rating_df = pd.DataFrame(top_sim_movie_rating,columns =['userId','movieId','sm1','sm2','sm3','sm4','sm5'])
display(top_sim_user_rating_df)

Unnamed: 0,userId,movieId,su1,su2,su3,su4,su5
0,1,61,4.0,4.0,4.000000,4.000000,4.000000
1,1,189,3.0,5.0,3.666667,3.666667,3.666667
2,1,33,4.0,4.0,4.000000,4.000000,4.000000
3,1,160,4.0,4.0,4.000000,4.000000,4.000000
4,1,20,4.0,4.0,4.000000,4.000000,4.000000
...,...,...,...,...,...,...,...
995,7,216,5.0,5.0,1.000000,3.750000,3.750000
996,7,556,3.0,3.0,3.000000,3.000000,3.000000
997,7,97,3.0,4.0,4.000000,4.000000,4.000000
998,7,135,5.0,4.0,4.000000,4.500000,4.500000


In [18]:
display(type(train_averages['global_avg']))

numpy.float64

In [19]:

data_train_modified = data_train.merge(top_sim_user_rating_df,how = 'left',on =['userId','movieId'])
data_train_modified = data_train_modified.merge(top_sim_movie_rating_df,how = 'left',on =['userId','movieId'])
data_train_modified['gobal_avg'] =train_averages['global_avg']

In [20]:
display(data_train_modified)

Unnamed: 0,userId,movieId,rating,su1,su2,su3,su4,su5,sm1,sm2,sm3,sm4,sm5,gobal_avg
0,1,61,4,4.0,4.0,4.000000,4.000000,4.000000,1,5,3,4,3,3.554
1,1,189,3,3.0,5.0,3.666667,3.666667,3.666667,3,4,4,1,5,3.554
2,1,33,4,4.0,4.0,4.000000,4.000000,4.000000,1,5,3,4,3,3.554
3,1,160,4,4.0,4.0,4.000000,4.000000,4.000000,1,5,3,4,3,3.554
4,1,20,4,4.0,4.0,4.000000,4.000000,4.000000,1,5,3,4,3,3.554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,7,216,4,5.0,5.0,1.000000,3.750000,3.750000,4,4,4,4,3,3.554
996,7,556,3,3.0,3.0,3.000000,3.000000,3.000000,3,4,2,1,3,3.554
997,7,97,5,3.0,4.0,4.000000,4.000000,4.000000,5,5,3,3,5,3.554
998,7,135,5,5.0,4.0,4.000000,4.500000,4.500000,5,5,5,5,4,3.554


In [21]:
df_user_avg_rating = pd.DataFrame.from_dict(train_averages['user_avg'],orient = 'index',columns=['user_avg_rating'])
df_user_avg_rating = df_user_avg_rating.reset_index()
df_user_avg_rating = df_user_avg_rating.rename(columns={"index": "userId"})
df_user_avg_rating

Unnamed: 0,userId,user_avg_rating
0,1,3.610294
1,2,3.709677
2,3,2.796296
3,4,4.333333
4,5,2.874286
5,6,3.635071
6,7,4.044554


In [22]:
df_movie_avg_rating = pd.DataFrame.from_dict(train_averages['movie_avg'],orient = 'index',columns=['movie_avg_rating'])
df_movie_avg_rating = df_movie_avg_rating.reset_index()
df_movie_avg_rating = df_movie_avg_rating.rename(columns={"index": "movieId"})
df_movie_avg_rating

Unnamed: 0,movieId,movie_avg_rating
0,1,4.25
1,2,3.00
2,3,4.00
3,4,3.00
4,5,3.00
...,...,...
599,675,5.00
600,676,3.00
601,680,4.00
602,681,1.00


In [23]:
data_train_modified = data_train_modified.merge(df_user_avg_rating,how = 'left',on =['userId'])
data_train_modified = data_train_modified.merge(df_movie_avg_rating,how = 'left',on =['movieId'])

In [24]:
display(data_train_modified)

Unnamed: 0,userId,movieId,rating,su1,su2,su3,su4,su5,sm1,sm2,sm3,sm4,sm5,gobal_avg,user_avg_rating,movie_avg_rating
0,1,61,4,4.0,4.0,4.000000,4.000000,4.000000,1,5,3,4,3,3.554,3.610294,4.000000
1,1,189,3,3.0,5.0,3.666667,3.666667,3.666667,3,4,4,1,5,3.554,3.610294,3.666667
2,1,33,4,4.0,4.0,4.000000,4.000000,4.000000,1,5,3,4,3,3.554,3.610294,4.000000
3,1,160,4,4.0,4.0,4.000000,4.000000,4.000000,1,5,3,4,3,3.554,3.610294,4.000000
4,1,20,4,4.0,4.0,4.000000,4.000000,4.000000,1,5,3,4,3,3.554,3.610294,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,7,216,4,5.0,5.0,1.000000,3.750000,3.750000,4,4,4,4,3,3.554,4.044554,3.750000
996,7,556,3,3.0,3.0,3.000000,3.000000,3.000000,3,4,2,1,3,3.554,4.044554,3.000000
997,7,97,5,3.0,4.0,4.000000,4.000000,4.000000,5,5,3,3,5,3.554,4.044554,4.000000
998,7,135,5,5.0,4.0,4.000000,4.500000,4.500000,5,5,5,5,4,3.554,4.044554,4.500000


In [25]:
import math
from sklearn.model_selection import train_test_split

print("Total Dataset (Before spliting): ",data_train_modified.shape)

data_train,data_test = train_test_split(data_train_modified,test_size=0.2,random_state=10)

print("data_train (80%)",data_train.shape)
print("data_test (20%)",data_test.shape)

Total Dataset (Before spliting):  (1000, 16)
data_train (80%) (800, 16)
data_test (20%) (200, 16)


In [26]:
import xgboost as xgb

# prepare train data
x_train = data_train_modified.drop(['rating'], axis=1)
y_train = data_train_modified['rating']
# initialize XGBoost model
xgb_model = xgb.XGBRegressor(silent=False, n_jobs=13,random_state=15,n_estimators=100)
# fit the model
xgb_model.fit(x_train, y_train, eval_metric = 'rmse')

Parameters: { "silent" } are not used.





In [27]:
#!pip install xgboost

In [28]:
x_test = data_test.drop(['rating'], axis=1)
y_test = data_test['rating']

In [29]:
#dictionaries for storing train and test results
test_results = dict()
# from the trained model, get the predictions
y_test_pred = xgb_model.predict(x_test)
# get the rmse and mape of train data
rmse = np.sqrt(np.mean([ (y_test.values[i] - y_test_pred[i])**2 for i in
range(len(y_test_pred)) ]))
mape = np.mean(np.abs( (y_test.values- y_test_pred)/y_test.values )) * 100
# store the results in train_results dictionary
test_results = {'rmse': rmse, 'mape' : mape, 'predictions' : y_test_pred}
print(test_results)

{'rmse': 0.0053802919321942555, 'mape': 0.0943184534708659, 'predictions': array([5.0157013 , 3.0065286 , 2.995877  , 2.9986897 , 3.9602911 ,
       4.999809  , 4.999102  , 4.0015526 , 3.9991186 , 3.998352  ,
       1.9992887 , 4.002598  , 5.0000167 , 4.999809  , 4.000992  ,
       0.9996076 , 5.00099   , 4.999809  , 1.9999657 , 1.9993558 ,
       2.9998035 , 3.0018933 , 4.9947653 , 5.0009565 , 3.999212  ,
       4.0008535 , 5.000022  , 5.001525  , 1.9986986 , 0.99956876,
       2.0004451 , 3.0034273 , 3.0034695 , 4.0015955 , 0.9998389 ,
       3.994325  , 1.0001342 , 2.9963665 , 4.9989786 , 3.9980583 ,
       2.9892235 , 4.997202  , 0.99956876, 2.9989367 , 4.9988637 ,
       4.001336  , 3.9980047 , 4.0002723 , 1.0005848 , 2.0018468 ,
       3.998789  , 1.005087  , 3.0018933 , 1.0133622 , 1.0000169 ,
       4.0151258 , 3.011898  , 3.000382  , 4.0003147 , 5.000515  ,
       4.003598  , 5.0022664 , 5.009822  , 2.0031085 , 3.0037062 ,
       4.9772334 , 2.9888096 , 3.00503   , 5.000541  ,

In [30]:
data_train

Unnamed: 0,userId,movieId,rating,su1,su2,su3,su4,su5,sm1,sm2,sm3,sm4,sm5,gobal_avg,user_avg_rating,movie_avg_rating
188,1,257,4,2.0,5.0,4.000000,3.750000,3.750000,5,5,4,4,5,3.554,3.610294,3.750000
194,1,223,5,4.0,5.0,4.666667,4.666667,4.666667,5,5,5,4,4,3.554,3.610294,4.666667
225,1,251,4,5.0,4.5,4.500000,4.500000,4.500000,2,5,5,2,4,3.554,3.610294,4.500000
580,5,210,3,4.0,4.0,3.000000,3.500000,3.500000,4,3,3,5,5,3.554,2.874286,3.500000
428,5,377,1,1.0,1.0,1.000000,1.000000,1.000000,5,3,3,3,2,3.554,2.874286,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,2,237,4,2.0,2.0,2.666667,2.666667,2.666667,5,5,4,4,4,3.554,3.709677,2.666667
527,5,408,5,4.0,4.5,4.500000,4.500000,4.500000,3,3,3,3,3,3.554,2.874286,4.500000
996,7,556,3,3.0,3.0,3.000000,3.000000,3.000000,3,4,2,1,3,3.554,4.044554,3.000000
125,1,190,5,5.0,5.0,5.000000,5.000000,5.000000,3,5,5,5,4,3.554,3.610294,5.000000


In [31]:
def get_moviereccomendation_for_user(user):

        df1 = data_train['movieId'][data_train['userId'] == user ]
        df2 = pd.DataFrame(data_train['movieId'])
        final_df = df2[~df2['movieId'].isin(df1)]
        final_df['userId'] = user
        
        
        swap_list = ["userId","movieId"]  
        
        final_df = final_df.reindex(columns=swap_list)

        top_sim_user_rating =[]
        top_sim_movie_rating = []

        array_size = final_df.shape[0]

        for i in range(0,array_size,1):
            #print(i)
            #print(gettopsimuserrating(data_train.iloc[i].userId,data_train.iloc[i].movieId))
            top_sim_user_rating.append(gettopsimuserrating(final_df.iloc[i].userId,final_df.iloc[i].movieId))
            top_sim_movie_rating.append(gettopsimmovierating(final_df.iloc[i].userId,final_df.iloc[i].movieId))

        top_sim_user_rating_df = pd.DataFrame(top_sim_user_rating,columns =['userId','movieId','su1','su2','su3','su4','su5'])
        top_sim_movie_rating_df = pd.DataFrame(top_sim_movie_rating,columns =['userId','movieId','sm1','sm2','sm3','sm4','sm5'])

        data_train_modified = final_df.merge(top_sim_user_rating_df,how = 'left',on =['userId','movieId'])
        data_train_modified = data_train_modified.merge(top_sim_movie_rating_df,how = 'left',on =['userId','movieId'])
        data_train_modified['gobal_avg'] =train_averages['global_avg']
        data_train_modified = data_train_modified.merge(df_user_avg_rating,how = 'left',on =['userId'])
        data_train_modified = data_train_modified.merge(df_movie_avg_rating,how = 'left',on =['movieId'])
       

        y_pred = xgb_model.predict(data_train_modified)
        
       # display(y_pred.argsort()[::-1])

        top_recom_movies = data_train_modified['movieId'][y_pred.argsort()[::-1]]
        top_recom_movies = top_recom_movies.unique()[:10]
        # print(type(top_recom_movies))
        # print(top_recom_movies.unique()[:10])
        return top_recom_movies

In [88]:

user=int(input("enter userid for movie"))

print("Recommendation Movie ids are:")

display(get_moviereccomendation_for_user(user))


enter userid for movie 1


Recommendation Movie ids are:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['userId'] = user


array([490, 523, 525, 474, 469, 492, 648, 661, 617, 630], dtype=int64)