In [1]:
import pandas as pd
from surprise import Reader,Dataset,SVD
from surprise.accuracy import rmse,mae
from surprise.model_selection import cross_validate

In [2]:
df = pd.read_csv("ratings.csv")

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df.drop('timestamp',axis=1,inplace=True)

In [5]:
df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [6]:
n_users = df['userId'].nunique()
n_movies = df['movieId'].nunique()
print("Number of unique movies: {0}".format(n_movies))
print("Number of unique users: {0}".format(n_users))

Number of unique movies: 9724
Number of unique users: 610


In [7]:
available_ratings = df['rating'].count()
total_ratings = n_users * n_movies
missing_ratings = total_ratings-available_ratings
sparsity = (missing_ratings/total_ratings)*100
print("Sparsity: {0}".format(sparsity))

Sparsity: 98.30003169443864


In [8]:
df['rating'].value_counts().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x2bce98cff28>

In [9]:
filter_movies = df['movieId'].value_counts()>3
filter_movies = filter_movies[filter_movies].index.tolist()

In [10]:
filter_users = df['userId'].value_counts()>3
filter_users = filter_users[filter_users].index.tolist()

In [11]:
print("Original Shape : {0}".format(df.shape))
df = df[(df['movieId'].isin(filter_movies)) & (df['userId'].isin(filter_users))]
print("New Shape : {0}".format(df.shape))

Original Shape : (100836, 3)
New Shape : (92394, 3)


In [12]:
cols = ['userId','movieId','rating']

In [13]:
reader = Reader(rating_scale=(0.5,5))
data = Dataset.load_from_df(df[cols],reader)

In [14]:
trainset = data.build_full_trainset()
antiset = trainset.build_anti_testset()

In [15]:
algo = SVD(n_epochs=25,verbose=True)

In [16]:
cross_validate(algo,data,measures=['RMSE','MAE'],cv=5,verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 0
P

{'test_rmse': array([0.85518794, 0.86526486, 0.86938551, 0.86321849, 0.85070841]),
 'test_mae': array([0.65556885, 0.66452988, 0.66614659, 0.66104086, 0.65536066]),
 'fit_time': (4.695468425750732,
  4.773273468017578,
  4.933805227279663,
  4.957791566848755,
  5.34475302696228),
 'test_time': (0.15658116340637207,
  0.10370969772338867,
  0.18849611282348633,
  0.10569906234741211,
  0.13663601875305176)}

In [17]:
predictions = algo.test(antiset)

In [18]:
predictions[0]

Prediction(uid=1, iid=318, r_ui=3.529119856267723, est=5, details={'was_impossible': False})

In [19]:
from collections import defaultdict
def get_top_n(predictions,n):
    top_n = defaultdict(list)
    for uid,iid,_,est,_ in predictions:
        top_n[uid].append((iid,est))
    for uid,user_ratings in top_n.items():
        user_ratings.sort(key=lambda s:s[1],reverse = True)
        top_n[uid] = user_ratings[:n]
    return top_n

In [20]:
top_n = get_top_n(predictions,n=3)

In [21]:
for uid,user_ratings in top_n.items():
    print(uid,[iid for (iid,ratings) in user_ratings])

1 [318, 904, 912]
2 [1210, 1204, 750]
3 [1262, 2858, 4967]
4 [5014, 1246, 1214]
5 [6874, 1732, 4973]
6 [904, 1283, 1246]
7 [1198, 7099, 3435]
8 [2324, 4226, 904]
9 [260, 3275, 1291]
10 [1292, 1267, 150]
11 [7361, 912, 260]
12 [47, 50, 593]
13 [260, 858, 1223]
14 [4226, 58559, 56782]
15 [1223, 1089, 1262]
16 [2160, 905, 1228]
17 [898, 4973, 1136]
18 [1204, 1235, 1288]
19 [1208, 27156, 1199]
20 [457, 1198, 1262]
21 [67255, 3147, 5008]
22 [750, 2324, 898]
23 [142488, 58559, 899]
24 [56782, 1136, 78499]
25 [356, 593, 923]
26 [1387, 318, 1196]
27 [318, 527, 296]
28 [951, 1193, 1394]
29 [246, 541, 4993]
30 [1089, 1258, 898]
31 [48516, 1104, 1673]
32 [2571, 3266, 904]
33 [2858, 4973, 904]
34 [1214, 27773, 1204]
35 [296, 3451, 318]
36 [1197, 2571, 7361]
37 [1237, 930, 1225]
38 [260, 1196, 1283]
39 [4973, 318, 1283]
40 [2858, 27773, 858]
41 [306, 142488, 1237]
42 [1206, 1997, 750]
43 [260, 441, 923]
44 [356, 1206, 1089]
45 [1204, 56782, 750]
46 [527, 1089, 1136]
47 [3362, 899, 5952]
48 [750, 11