# Movie-Movie Recommendation

#### Step1: Import libraries and dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Recommend.csv',names =['user_id','movie_id','rating','timestamp'])
df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


#### Step2: Identify total number of users and movies

In [3]:
n_users = df.user_id.unique().shape[0]
n_movies = df.movie_id.unique().shape[0]
print("total number of users ",n_users,"\ntotal number of movies",n_movies)

total number of users  943 
total number of movies 1682


#### Step3: Split the data into training and test sets

In [4]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df,test_size =0.25,random_state=27)

#### Step4: Populate train test matrices with ratings

In [5]:
train_data_matrix = np.zeros((n_users,n_movies))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1,line[2]-1] = line[3]
train_data_matrix

array([[5., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [6]:
test_data_matrix = np.zeros((n_users,n_movies))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1,line[2]-1] = line[3]
test_data_matrix

array([[0., 3., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### Step5: Create cosine similarity matrices for users and movies

In [7]:
from sklearn.metrics import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric ='cosine')
movie_similarity = pairwise_distances(train_data_matrix.T, metric ='cosine')

#### Step6: Perform predictions

In [8]:
mean_user_rating = train_data_matrix.mean(axis = 1)[:,np.newaxis]
ratings_diff = (train_data_matrix - mean_user_rating)
user_pred = mean_user_rating + user_similarity.dot(ratings_diff)/np.array([np.abs(user_similarity).sum(axis=1)]).T
user_pred

array([[ 1.61614236,  0.58423434,  0.4929253 , ...,  0.30596566,
         0.30345992,  0.30568376],
       [ 1.33820275,  0.28414694,  0.15780769, ..., -0.05302038,
        -0.05503785, -0.05163493],
       [ 1.35890114,  0.23799129,  0.11981944, ..., -0.09829165,
        -0.09969444, -0.09630413],
       ...,
       [ 1.21291136,  0.20302763,  0.07697913, ..., -0.12672435,
        -0.12884852, -0.12560546],
       [ 1.38404663,  0.29730907,  0.19498354, ..., -0.01540342,
        -0.01747708, -0.01424371],
       [ 1.42894037,  0.36112002,  0.27918294, ...,  0.09460763,
         0.0921249 ,  0.0945299 ]])

In [9]:
movie_pred = train_data_matrix.dot(movie_similarity) / np.array([np.abs(movie_similarity).sum(axis=1)])
movie_pred

array([[0.37773097, 0.40243382, 0.41662039, ..., 0.45808687, 0.45508626,
        0.4430428 ],
       [0.0952965 , 0.11013722, 0.10808986, ..., 0.11250208, 0.11362284,
        0.11389892],
       [0.06763866, 0.07089605, 0.06992082, ..., 0.0666573 , 0.07019631,
        0.07085466],
       ...,
       [0.02617219, 0.0349138 , 0.03362847, ..., 0.0385069 , 0.03866746,
        0.0383198 ],
       [0.12403975, 0.13265031, 0.13872641, ..., 0.1432882 , 0.14396193,
        0.1442998 ],
       [0.20065287, 0.19429646, 0.21389737, ..., 0.24769822, 0.24568709,
        0.23909728]])

#### Step7: Evaluate the models based on test set with RMSE

In [10]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, original):
    prediction = prediction[original.nonzero()].flatten() 
    original = original[original.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, original))

In [11]:
print('User-based CF RMSE: ' + str(rmse(user_pred, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(movie_pred, test_data_matrix)))

User-based CF RMSE: 3.1257807732776857
Item-based CF RMSE: 3.4502840493926095


RMSE is less for User-based Collaborative Filtering. So it is better approach