# Recommendation System

* User Based recommendation system
* Item based recommendation system

In [19]:
# Import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Read data file

rating = pd.read_csv('rating_2.csv')

In [22]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


#### Splitting into Train and Test

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
train, test = train_test_split(rating, test_size = 0.30, random_state=31)

In [25]:
print(train.shape)
print(test.shape)

(197272, 4)
(84546, 4)


* Pivot rating DF into movie features

In [26]:
df_movie_features = df_movie_features = train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [27]:
df_movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,205383,205557,205583,205945,205967,206272,206293,206499,206523,208002
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Copy Train and Test Dataset
These dataset will be used for prediction and evaluation
- Dummy train will be used later for prediction of the movies which has not been rated by the user. To ignore the movies rated by the user, we will mark it as 0 during prediction. The movies not rated by user is marked as 1 for prediction. 
- Dummy test will be used for evaluation. To evaluate, we will only make prediction on the movies rated by the user. So, this is marked as 1. This is just opposite of dummy_train

In [28]:
dummy_train = train.copy()
dummy_test = test.copy()

In [29]:
dummy_train['rating'] = dummy_train['rating'].apply(lambda x: 0 if x>=1 else 1)
dummy_test['rating'] = dummy_test['rating'].apply(lambda x: 1 if x>=1 else 0)

In [30]:
# Pivot the dataset with movie_id as features
# movies not rated by user is marked 1 for predictions
dummy_train = dummy_train.pivot(
index = 'userId',
columns = 'movieId',
values = 'rating'
).fillna(0)

# movie not rated by users is marked as 0 for evaluation
dummy_test = dummy_test.pivot(
index = 'userId',
columns = 'movieId',
values = 'rating'
).fillna(0)


### User Similarity Matrix

#### Using cosine similarity

In [31]:
from sklearn.metrics.pairwise import pairwise_distances

# user similarity matrix
user_correlation = 1 - pairwise_distances(df_movie_features, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0  #replacing all 0 with nan to use cosine similarity
print(user_correlation)

[[1.         0.02940385 0.03894212 ... 0.02836651 0.03696386 0.03097134]
 [0.02940385 1.         0.11462246 ... 0.12092187 0.12297888 0.02347127]
 [0.03894212 0.11462246 1.         ... 0.17572732 0.12084393 0.04657827]
 ...
 [0.02836651 0.12092187 0.17572732 ... 1.         0.07356633 0.09380764]
 [0.03696386 0.12297888 0.12084393 ... 0.07356633 1.         0.1150264 ]
 [0.03097134 0.02347127 0.04657827 ... 0.09380764 0.1150264  1.        ]]


In [33]:
user_correlation.shape

(1973, 1973)

## Using adjusted Cosine 

### Here, not removing the NaN values and calculating the mean only for the movies rated by the user

In [34]:
movie_features = train.pivot(
    index = 'userId',
    columns = 'movieId',
    values = 'rating'
)

In [35]:
movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,205383,205557,205583,205945,205967,206272,206293,206499,206523,208002
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,3.5,,,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


### Normalising the rating of the movie for each user around 0 mean

In [36]:
# nanmean ignores the nan and take mean
mean = np.nanmean(movie_features, axis = 1)

In [38]:
df_subtracted = (movie_features.T-mean).T

In [40]:
df_subtracted.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,205383,205557,205583,205945,205967,206272,206293,206499,206523,208002
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,-0.133333,,,,,,,,,,...,,,,,,,,,,
3,0.298265,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


- <b>Using adjusted cosine similarity

In [42]:
user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[ 1.          0.00213971  0.01010749 ...  0.00282206  0.0223776
  -0.00403246]
 [ 0.00213971  1.          0.03282722 ...  0.03027474  0.00746344
  -0.00272776]
 [ 0.01010749  0.03282722  1.         ...  0.03981439  0.06622814
   0.06778323]
 ...
 [ 0.00282206  0.03027474  0.03981439 ...  1.          0.01284391
  -0.00914289]
 [ 0.0223776   0.00746344  0.06622814 ...  0.01284391  1.
   0.04125641]
 [-0.00403246 -0.00272776  0.06778323 ... -0.00914289  0.04125641
   1.        ]]


#### Writing user similarity matrix to csv files

In [45]:
user_df = pd.DataFrame(user_correlation)

In [48]:
user_csv = user_df.to_csv('user_df.csv')

### Predictions

Doing predictions only for positively related users as we are interested in the users which are more similar to the current users. So, ignoring the correlation for values less than 0.

In [52]:
user_correlation[user_correlation<0] = 0
user_correlation

array([[1.        , 0.00213971, 0.01010749, ..., 0.00282206, 0.0223776 ,
        0.        ],
       [0.00213971, 1.        , 0.03282722, ..., 0.03027474, 0.00746344,
        0.        ],
       [0.01010749, 0.03282722, 1.        , ..., 0.03981439, 0.06622814,
        0.06778323],
       ...,
       [0.00282206, 0.03027474, 0.03981439, ..., 1.        , 0.01284391,
        0.        ],
       [0.0223776 , 0.00746344, 0.06622814, ..., 0.01284391, 1.        ,
        0.04125641],
       [0.        , 0.        , 0.06778323, ..., 0.        , 0.04125641,
        1.        ]])

#### Prediction
- Rating predicted by the user (for movies rated as well as not rated) is the weighted sum of correlation with the movie rating (as present in the rating dataset). 

In [53]:
user_rating_predicted = np.dot(user_correlation, movie_features.fillna(0))
user_rating_predicted

array([[8.35871697e+00, 2.72060148e+00, 8.33865435e-01, ...,
        1.11195638e-02, 2.03729887e-02, 0.00000000e+00],
       [4.24906518e+01, 1.68306680e+01, 6.67203239e+00, ...,
        1.41530704e-03, 8.56663232e-02, 2.63084098e-02],
       [7.06951153e+01, 2.23687068e+01, 8.72181059e+00, ...,
        1.28032474e-01, 1.07878520e-01, 4.09137034e-01],
       ...,
       [4.08970619e+01, 1.27985116e+01, 6.91865170e+00, ...,
        1.98729960e-01, 5.40591204e-02, 1.43994152e-01],
       [7.50027807e+01, 3.21254364e+01, 1.19463039e+01, ...,
        1.08379375e-01, 7.89957295e-02, 1.10781629e-01],
       [7.47353776e+01, 2.18543415e+01, 9.76395763e+00, ...,
        4.81933017e-02, 9.33574524e-02, 2.20562902e-01]])

In [55]:
user_rating_predicted.shape

(1973, 12593)

#### Since we are interested only in the movies not rated by the user, we will ignore the movies rated by the user by making it zero. 

In [59]:
user_final_rating = np.multiply(user_rating_predicted,dummy_train)
user_final_rating.tail()

movieId,1,2,3,4,5,6,7,8,9,10,...,205383,205557,205583,205945,205967,206272,206293,206499,206523,208002
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Finding the top 5 recommendation for the user 1 

In [66]:
user_final_rating.iloc[9].sort_values(ascending=False)[0:5]

movieId
208002    0.0
5265      0.0
5262      0.0
5258      0.0
5256      0.0
Name: 10, dtype: float64

### Using Item Based similarity

Taking the transpose of the rating matrix to normalize the rating around the mean for different movie ID. In the user based similarity, we had taken mean for each user intead of each movie. 

In [63]:
movie_features = train.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).T

movie_features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.5,4.0,,,,,4.0,,3.5,...,,,,,,,,,,4.5
2,,,,,,,,,5.0,,...,,,,,,,,,,4.0
3,,,,,,,,4.0,,,...,,,,,,2.5,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


Normalising the movie rating for each movie

In [68]:
mean = np.nanmean(movie_features, axis=1)
df_subtracted = (movie_features.T-mean).T

In [69]:
df_subtracted.head()

userId,1,2,3,4,5,6,7,8,9,10,...,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,-0.465479,0.034521,,,,,0.034521,,-0.465479,...,,,,,,,,,,0.534521
2,,,,,,,,,1.607143,,...,,,,,,,,,,0.607143
3,,,,,,,,0.759091,,,...,,,,,,-0.740909,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [70]:
# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

[[1.         0.15035478 0.01043228 ... 0.01891945 0.         0.        ]
 [0.15035478 1.         0.01909193 ... 0.         0.         0.        ]
 [0.01043228 0.01909193 1.         ... 0.         0.         0.        ]
 ...
 [0.01891945 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [71]:
item_correlation[item_correlation<0]=0
item_correlation

array([[1.        , 0.15035478, 0.01043228, ..., 0.01891945, 0.        ,
        0.        ],
       [0.15035478, 1.        , 0.01909193, ..., 0.        , 0.        ,
        0.        ],
       [0.01043228, 0.01909193, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.01891945, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

### Prediction

In [72]:
item_predicted_ratings = np.dot((movie_features.fillna(0).T),item_correlation)
item_predicted_ratings

array([[ 6.03684624,  3.76159469,  1.03496589, ...,  3.56466243,
         0.        ,  0.        ],
       [30.05424681, 20.26576066,  6.98269815, ..., 14.59246732,
         0.        ,  0.        ],
       [91.75727631, 49.28543978, 13.51594484, ..., 34.83784703,
         0.        ,  0.        ],
       ...,
       [ 7.54894327,  4.8306709 ,  1.5054678 , ...,  2.94680187,
         0.        ,  0.        ],
       [22.71367484, 24.048337  ,  9.94327945, ..., 10.21935707,
         0.        ,  0.        ],
       [ 8.06699575,  8.95365918,  1.25107467, ...,  1.4675771 ,
         0.        ,  0.        ]])

In [73]:
item_predicted_ratings.shape

(1973, 12593)

### Filtering the rating only for the movies not rated by the user for recommendation

In [74]:
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,205383,205557,205583,205945,205967,206272,206293,206499,206523,208002
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Top 5 prediction for the user -1

In [76]:
item_final_rating.iloc[1].sort_values(ascending=False)[0:5]

movieId
2987     19.868954
7162     17.775129
2761     17.311838
33166    16.586710
2643     12.577654
Name: 2, dtype: float64

### Evaluation

Evaluation will we same as prediction. The only difference being, you will evaluate for the movie already rated by the user insead of predicting it for the movie not rated by the user.

#### Using user similarity

In [78]:
test_movie_features = test.pivot(
    index = 'userId',
    columns = 'movieId',
    values = 'rating'
)

# Normalize the mean
mean = np.nanmean(test_movie_features, axis=1)
test_df_subtracted = (test_movie_features.T-mean).T

# User Similarity Matrix
test_user_correlation = 1 - pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
test_user_correlation[np.isnan(test_user_correlation)] = 0
print(test_user_correlation)

[[ 1.00000000e+00  0.00000000e+00 -9.95958271e-04 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  1.00000000e+00  1.35368569e-02 ...  3.98808620e-02
  -1.95190259e-02 -1.63367938e-02]
 [-9.95958271e-04  1.35368569e-02  1.00000000e+00 ...  4.64663812e-03
   3.05348098e-03  0.00000000e+00]
 ...
 [ 0.00000000e+00  3.98808620e-02  4.64663812e-03 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00 -1.95190259e-02  3.05348098e-03 ...  0.00000000e+00
   1.00000000e+00 -6.08517327e-02]
 [ 0.00000000e+00 -1.63367938e-02  0.00000000e+00 ...  0.00000000e+00
  -6.08517327e-02  1.00000000e+00]]


In [79]:
# ignoring the negative correlated users
# prediction using dot product
test_user_correlation[test_user_correlation<0]=0
test_user_predicted_ratings = np.dot(test_user_correlation, test_movie_features.fillna(0))
test_user_predicted_ratings

array([[ 7.12854149,  1.91732867,  0.49992045, ...,  0.06291183,
         0.        ,  0.        ],
       [13.99473249,  2.70171929,  1.43279984, ...,  0.08129167,
         0.13030956,  0.11865365],
       [10.5885237 ,  3.19207881,  0.94329146, ...,  0.02926175,
         0.20803279,  0.16702967],
       ...,
       [ 6.20328489,  1.13176188,  0.17183771, ...,  0.03893108,
         0.        ,  0.08615834],
       [11.81007687,  3.83212405,  2.1485903 , ...,  0.0366316 ,
         0.        ,  0.04377962],
       [ 7.97538226,  3.50616658,  1.16646413, ...,  0.        ,
         0.        ,  0.        ]])

### Doing prediction for the movies rated by the user

In [80]:
test_user_final_rating = np.multiply(test_user_predicted_ratings,dummy_test)

In [81]:
test_user_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,204832,205054,205156,205413,205499,206499,206805,206861,207309,208793
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.341004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,20.883618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating the RMSE for only the movies rated by user. For  RMSE, normalising the rating to (1,5) range

In [82]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = test_user_final_rating.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


MinMaxScaler(copy=True, feature_range=(1, 5))
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


In [83]:
test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

In [84]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [85]:
rmse = (sum(sum((test_ - y )**2))/total_non_nan)**0.5
print(rmse)

1.3948697387296352


### Using Item similarity

In [86]:
test_movie_features = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).T

mean = np.nanmean(test_movie_features, axis=1)
test_df_subtracted = (test_movie_features.T-mean).T

test_item_correlation = 1 - pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
test_item_correlation[np.isnan(test_item_correlation)] = 0
test_item_correlation[test_item_correlation<0]=0

In [87]:
test_item_predicted_ratings = (np.dot(test_item_correlation, test_movie_features.fillna(0))).T
test_item_final_rating = np.multiply(test_item_predicted_ratings,dummy_test)
test_item_final_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,204832,205054,205156,205413,205499,206499,206805,206861,207309,208793
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6.148358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5.605992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

In [89]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

X  = test_item_final_rating.copy() 
X = X[X>0]

scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))


test_ = test.pivot(
    index='userId',
    columns='movieId',
    values='rating'
)

# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


MinMaxScaler(copy=True, feature_range=(1, 5))


### Finding RMSE

In [90]:
rmse = (sum(sum((test_ - y )**2))/total_non_nan)**0.5
print(rmse)

2.1949474681155445
