In [1]:
import pandas as pd
import numpy as np

### Importing Dataset

In [80]:
# Read the ratings table from the CSV file in the working directory and preview its first ten rows.
df = pd.read_csv("ratings.csv")
df.head(n=10)

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
5,1,1088,4.0
6,1,1175,3.5
7,1,1217,3.5
8,1,1237,5.0
9,1,1250,4.0


### Data Exploration

In [81]:
# Report how many distinct user ids occur in the ratings frame.
print('Number of Unique Users are {}'.format(df['userId'].unique().size))

Number of Unique Users are 7045


In [82]:
# Report how many distinct movie ids occur in the ratings frame.
print('Number of Unique Movies are {}'.format(df['movieId'].unique().size))

Number of Unique Movies are 22240


In [83]:
# Frequency of each rating value, most common first.
df.rating.value_counts()

4.0    280291
3.0    207514
5.0    152646
3.5    130755
4.5     90051
2.0     69617
2.5     53093
1.0     31975
1.5     17494
0.5     15139
Name: rating, dtype: int64

In [78]:
# Array of the distinct user ids, in order of first appearance.
df.userId.unique()

array([     1,      2,      3, ..., 162539, 162540, 162541], dtype=int64)

In [11]:
# Print the frame summary: index range, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [79]:
# Summary statistics (count, mean, std, quartiles, min/max) of the rating column only.
df['rating'].describe()

count    2.500010e+07
mean     3.533854e+00
std      1.060744e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [13]:
# Summary statistics for every numeric column of the ratings frame.
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


In [15]:
# Per column: does it contain at least one missing value?
df.isna().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [26]:
# Per column: how many values are missing?
df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [28]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

0

In [29]:
# Shape of the sub-frame holding only the fully duplicated rows.
df.loc[df.duplicated()].shape

(0, 4)

In [31]:
# Flag every row whose userId already appeared in an earlier row;
# the first row of each user stays False.
duplicates = df.userId.duplicated()
duplicates

0           False
1            True
2            True
3            True
4            True
            ...  
25000090     True
25000091     True
25000092     True
25000093     True
25000094     True
Name: userId, Length: 25000095, dtype: bool

In [52]:
# Five movies with the highest mean rating. No minimum number of ratings is
# required, so a movie with a single 5.0 rating can top the list.
(
    df.groupby('movieId')['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)

movieId
136782    5.0
186119    5.0
137032    5.0
184643    5.0
137038    5.0
Name: rating, dtype: float64

In [53]:
# Ten movies that received the largest number of ratings.
(
    df.groupby('movieId')['rating']
    .count()
    .sort_values(ascending=False)
    .head(10)
)

movieId
356     81491
318     81482
296     79672
593     74127
2571    72674
260     68717
480     64144
527     60411
110     59184
2959    58773
Name: rating, dtype: int64

### Preprocessing

In [85]:
# Derive a separate working frame with incomplete rows removed; `df` itself is left untouched.
df1 = df.dropna()
df1.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


### Calculating Cosine Similarity between Users

In [86]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation, jaccard

In [87]:
# Dense user x movie rating matrix: one row per userId, one column per movieId.
# User/movie pairs without a rating are filled with 0.
user_movies_df = df1.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
).fillna(0)
user_movies_df

movieId,1,2,3,4,5,6,7,8,9,10,...,207642,207830,207890,208002,208080,208112,208737,208793,208939,209163
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7041,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7042,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
# Cosine similarity between user rows, obtained as 1 minus the pairwise cosine distance.
user_sim = 1 - pairwise_distances(user_movies_df.to_numpy(), metric='cosine')
user_sim

array([[1.        , 0.04086293, 0.06130627, ..., 0.04297792, 0.02148534,
        0.04277107],
       [0.04086293, 1.        , 0.17900861, ..., 0.09032635, 0.19037477,
        0.37009439],
       [0.06130627, 0.17900861, 1.        , ..., 0.06088   , 0.07479442,
        0.12863813],
       ...,
       [0.04297792, 0.09032635, 0.06088   , ..., 1.        , 0.06458488,
        0.11753893],
       [0.02148534, 0.19037477, 0.07479442, ..., 0.06458488, 1.        ,
        0.16432522],
       [0.04277107, 0.37009439, 0.12863813, ..., 0.11753893, 0.16432522,
        1.        ]])

In [89]:
# Wrap the similarity matrix in a DataFrame; rows and columns carry default
# positional labels at this point.
user_sim_df = pd.DataFrame(data=user_sim)
user_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7035,7036,7037,7038,7039,7040,7041,7042,7043,7044
0,1.000000,0.040863,0.061306,0.040815,0.015609,0.000000,0.093852,0.021359,0.022860,0.026149,...,0.066350,0.020300,0.032775,0.080646,0.000000,0.076503,0.018670,0.042978,0.021485,0.042771
1,0.040863,1.000000,0.179009,0.197496,0.158202,0.129720,0.064954,0.176880,0.128820,0.156814,...,0.234043,0.108868,0.107612,0.380926,0.168061,0.221649,0.155730,0.090326,0.190375,0.370094
2,0.061306,0.179009,1.000000,0.357750,0.061448,0.115148,0.031427,0.081003,0.061602,0.132361,...,0.467198,0.073502,0.395024,0.240655,0.123241,0.134694,0.084734,0.060880,0.074794,0.128638
3,0.040815,0.197496,0.357750,1.000000,0.065825,0.072365,0.015566,0.088927,0.066428,0.083821,...,0.322546,0.038947,0.318556,0.187358,0.059653,0.136363,0.049125,0.040988,0.109771,0.141168
4,0.015609,0.158202,0.061448,0.065825,1.000000,0.114936,0.202348,0.307769,0.216435,0.269897,...,0.130110,0.085107,0.023003,0.180271,0.149147,0.194882,0.239160,0.271511,0.047564,0.228955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7040,0.076503,0.221649,0.134694,0.136363,0.194882,0.212102,0.180488,0.145485,0.152821,0.176645,...,0.167885,0.136723,0.102221,0.301522,0.159873,1.000000,0.149934,0.105072,0.091330,0.292095
7041,0.018670,0.155730,0.084734,0.049125,0.239160,0.102047,0.262403,0.241166,0.128034,0.297456,...,0.128121,0.094988,0.040009,0.186798,0.222852,0.149934,1.000000,0.328257,0.020041,0.215298
7042,0.042978,0.090326,0.060880,0.040988,0.271511,0.067861,0.277812,0.345022,0.158205,0.355405,...,0.142028,0.092548,0.018096,0.150385,0.290331,0.105072,0.328257,1.000000,0.064585,0.117539
7043,0.021485,0.190375,0.074794,0.109771,0.047564,0.024465,0.000000,0.040527,0.040556,0.034929,...,0.119149,0.087914,0.059571,0.160005,0.072184,0.091330,0.020041,0.064585,1.000000,0.164325


In [90]:
# Peek at the top-left 5 x 5 corner of the similarity frame.
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.040863,0.061306,0.040815,0.015609
1,0.040863,1.0,0.179009,0.197496,0.158202
2,0.061306,0.179009,1.0,0.35775,0.061448
3,0.040815,0.197496,0.35775,1.0,0.065825
4,0.015609,0.158202,0.061448,0.065825,1.0


In [91]:
# Zero out self-similarity so a user is never ranked as their own nearest neighbour.
np.fill_diagonal(user_sim, 0)
# np.fill_diagonal mutates the ndarray in place. Whether a DataFrame built
# earlier from that array sees the change depends on whether pandas copied the
# data at construction time (it does under Copy-on-Write), so the frame is
# rebuilt from the updated array instead of relying on shared memory.
user_sim_df = pd.DataFrame(user_sim)
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,0,1,2,3,4
0,0.0,0.040863,0.061306,0.040815,0.015609
1,0.040863,0.0,0.179009,0.197496,0.158202
2,0.061306,0.179009,0.0,0.35775,0.061448
3,0.040815,0.197496,0.35775,0.0,0.065825
4,0.015609,0.158202,0.061448,0.065825,0.0


In [92]:
# Relabel both axes of the similarity frame with the user ids taken from the
# rows of the user x movie matrix.
user_sim_df.index = user_movies_df.index.tolist()
user_sim_df.columns = user_movies_df.index.tolist()
user_sim_df
user_sim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,7036,7037,7038,7039,7040,7041,7042,7043,7044,7045
1,0.000000,0.040863,0.061306,0.040815,0.015609,0.000000,0.093852,0.021359,0.022860,0.026149,...,0.066350,0.020300,0.032775,0.080646,0.000000,0.076503,0.018670,0.042978,0.021485,0.042771
2,0.040863,0.000000,0.179009,0.197496,0.158202,0.129720,0.064954,0.176880,0.128820,0.156814,...,0.234043,0.108868,0.107612,0.380926,0.168061,0.221649,0.155730,0.090326,0.190375,0.370094
3,0.061306,0.179009,0.000000,0.357750,0.061448,0.115148,0.031427,0.081003,0.061602,0.132361,...,0.467198,0.073502,0.395024,0.240655,0.123241,0.134694,0.084734,0.060880,0.074794,0.128638
4,0.040815,0.197496,0.357750,0.000000,0.065825,0.072365,0.015566,0.088927,0.066428,0.083821,...,0.322546,0.038947,0.318556,0.187358,0.059653,0.136363,0.049125,0.040988,0.109771,0.141168
5,0.015609,0.158202,0.061448,0.065825,0.000000,0.114936,0.202348,0.307769,0.216435,0.269897,...,0.130110,0.085107,0.023003,0.180271,0.149147,0.194882,0.239160,0.271511,0.047564,0.228955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7041,0.076503,0.221649,0.134694,0.136363,0.194882,0.212102,0.180488,0.145485,0.152821,0.176645,...,0.167885,0.136723,0.102221,0.301522,0.159873,0.000000,0.149934,0.105072,0.091330,0.292095
7042,0.018670,0.155730,0.084734,0.049125,0.239160,0.102047,0.262403,0.241166,0.128034,0.297456,...,0.128121,0.094988,0.040009,0.186798,0.222852,0.149934,0.000000,0.328257,0.020041,0.215298
7043,0.042978,0.090326,0.060880,0.040988,0.271511,0.067861,0.277812,0.345022,0.158205,0.355405,...,0.142028,0.092548,0.018096,0.150385,0.290331,0.105072,0.328257,0.000000,0.064585,0.117539
7044,0.021485,0.190375,0.074794,0.109771,0.047564,0.024465,0.000000,0.040527,0.040556,0.034929,...,0.119149,0.087914,0.059571,0.160005,0.072184,0.091330,0.020041,0.064585,0.000000,0.164325


### Recommendation Function

In [101]:
def give_recommendation(user_id, top_n=100, ratings=None, similarity=None):
    """Recommend movies rated by the users most similar to ``user_id``.

    The ``top_n`` users with the highest similarity to ``user_id`` are looked
    up in the similarity matrix, every movie any of them rated is collected,
    and the movies ``user_id`` has already rated are removed.

    Parameters
    ----------
    user_id : int
        Id of the user to recommend for; must be a column label of the
        similarity matrix.
    top_n : int, default 100
        Number of most-similar users whose rated movies are pooled.
    ratings : pandas.DataFrame, optional
        Frame with ``userId`` and ``movieId`` columns. Defaults to the
        notebook-level ``df``.
    similarity : pandas.DataFrame, optional
        Square user-by-user similarity frame labelled with user ids on both
        axes. Defaults to the notebook-level ``user_sim_df``.

    Returns
    -------
    set
        Movie ids rated by the neighbours but not by ``user_id``. The set is
        unordered; no ranking of the recommendations is implied.

    Raises
    ------
    KeyError
        If ``user_id`` is not a column of the similarity matrix.
    """
    # Fall back to the notebook-level frames so existing one-argument calls keep working.
    if ratings is None:
        ratings = df
    if similarity is None:
        similarity = user_sim_df

    if user_id not in similarity.columns:
        raise KeyError(
            'userId {!r} is not present in the similarity matrix'.format(user_id)
        )

    # Drop the user's own entry explicitly so it can never occupy a neighbour
    # slot, regardless of what value sits on the matrix diagonal.
    neighbours = (
        similarity[user_id]
        .drop(labels=user_id, errors='ignore')
        .nlargest(top_n)
        .index
    )

    # A single membership test over the ratings replaces one full-frame scan
    # (and one list concatenation) per neighbour.
    neighbour_movies = set(
        ratings.loc[ratings['userId'].isin(neighbours), 'movieId']
    )
    seen_movies = set(ratings.loc[ratings['userId'] == user_id, 'movieId'])
    return neighbour_movies - seen_movies

In [103]:
# Ask interactively for a user id. input() returns text, so the value is cast
# to int before it is used to look up recommendations.
user_id = input(
    "Please Enter the UserId to get the recommended Movie List\n"
)
give_recommendation(int(user_id))

Please Enter the UserId to get the recommended Movie List
2


{122882,
 2,
 3,
 5,
 6,
 122886,
 7,
 32770,
 10,
 11,
 9,
 122890,
 12,
 8207,
 16,
 17,
 18,
 19,
 8208,
 21,
 22,
 15,
 57368,
 24,
 26,
 122904,
 25,
 20,
 23,
 31,
 32,
 122912,
 34,
 35,
 36,
 8228,
 38,
 39,
 122920,
 41,
 122922,
 8,
 44,
 122918,
 46,
 47,
 48,
 42,
 50,
 65588,
 54,
 58,
 60,
 8253,
 63,
 65,
 41025,
 8259,
 49220,
 69,
 70,
 66,
 73,
 74,
 65612,
 8268,
 76,
 79,
 85,
 86,
 87,
 88,
 93,
 94,
 95,
 101,
 104,
 105,
 107,
 111,
 112,
 49272,
 139385,
 49274,
 121,
 122,
 49278,
 73854,
 49280,
 135,
 27,
 8330,
 140,
 141,
 28,
 144,
 145,
 146,
 147,
 8338,
 8340,
 29,
 153,
 8341,
 155,
 157,
 158,
 57504,
 161,
 160,
 163,
 162,
 165,
 164,
 168,
 170,
 8361,
 169,
 8362,
 172,
 8366,
 173,
 176,
 8371,
 8372,
 8373,
 57526,
 181,
 8376,
 185,
 186,
 180,
 57528,
 8378,
 188,
 193,
 194,
 8387,
 196,
 49347,
 198,
 106696,
 203,
 204,
 207,
 208,
 209,
 8401,
 212,
 8405,
 215,
 216,
 219,
 222,
 223,
 224,
 225,
 227,
 229,
 230,
 231,
 232,
 234,
 235,
