In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [10]:
import os
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from surprise import Dataset, Reader
from surprise import KNNBasic, BaselineOnly
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls
# configure file path
data_path = 'gdrive/MyDrive/movielens1m/'
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'
# read data
df_movies = pd.read_csv(
    data_path + movies_filename,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'})

df_ratings = pd.read_csv(
    (data_path + ratings_filename),
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

# pivot ratings into movie features
df_movie_features = df_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)



df_movies_cnt = df_ratings.groupby('movieId') \
       .agg({'userId':'size', 'rating':'mean'}) \
       .rename(columns={'userId':'count','rating':'average'}) \
       .reset_index()

print(df_ratings)
print(df_movies_cnt)
movie_rating_thres = 10

df_movies_cnt = df_movies_cnt.sort_values(by=['count']) 
popular_movies = df_movies_cnt.head(movie_rating_thres)
print(popular_movies)

movie_list = popular_movies['movieId'].tolist()
print(movie_list)


df_users_cnt = df_ratings.groupby('userId') \
       .agg({'movieId':'size', 'rating':'mean'}) \
       .rename(columns={'movieId':'count','rating':'average'}) \
       .reset_index()

print(df_users_cnt)
user_rating_thres = 30

users_below_threshold = df_users_cnt[(df_users_cnt['count'] < user_rating_thres)]  
print("ans=", len(users_below_threshold))
print(users_below_threshold)

user_list = users_below_threshold['userId'].tolist()
print(user_list)

print(df_movie_features)

for u in user_list:
	for i in movie_list:
		if df_ratings[(df_ratings.userId == u) & (df_ratings.movieId == i)].empty:
			r = popular_movies.loc[popular_movies.movieId == i, 'average'].iloc[0]
			new_row = pd.Series({'userId': u, 'movieId': i, 'rating': r})
			df_ratings = df_ratings.append(new_row, ignore_index=True)

print(df_ratings)
# users_filter = df_ratings.userId.isin(users_below_threshold).values

reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

sim_options = {'name': 'cosine',
               'user_based': True  # compute  similarities between users
               }
algo = KNNBasic(sim_options=sim_options)

cross_validate(algo, data, measures=['MAE','MSE','RMSE'], cv=5, verbose=True)

kf = KFold(n_splits=5)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print('Precision = ', sum(prec for prec in precisions.values()) / len(precisions))
    print('Recall = ', sum(rec for rec in recalls.values()) / len(recalls))

# predicting ratings
# error = 0
# for i in mat_movie_features_test:
#     print(i)
#     distances, indices = model_knn.kneighbors(i,n_neighbors=6)
#     print(indices)
#     val = 0
#     for j in indices:
#         print(mat_movie_features[j])
#         val += mat_movie_features[j]
#     val /= 6
#     error += (val - i[1])*(val - i[1])





         userId  movieId  rating
0             1        2     3.5
1             1       29     3.5
2             1       32     3.5
3             1       47     3.5
4             1       50     3.5
...         ...      ...     ...
1048570    7120      168     5.0
1048571    7120      253     4.0
1048572    7120      260     5.0
1048573    7120      261     4.0
1048574    7120      266     3.5

[1048575 rows x 3 columns]
       movieId  count   average
0            1   2569  3.959323
1            2   1155  3.268398
2            3    685  3.186861
3            4    138  3.000000
4            5    657  3.143836
...        ...    ...       ...
14021   130073      1  2.500000
14022   130219      1  4.500000
14023   130462      1  4.000000
14024   130490      2  2.250000
14025   130642      1  3.000000

[14026 rows x 3 columns]
       movieId  count  average
14025   130642      1      3.0
12141    85312      1      2.5
12140    85307      1      3.5
12139    85305      1      0.5
5259      5

In [3]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 298kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617575 sha256=2b9506cb8a0f22c74d5357faf0f592fca5ae5fe08069113e9baf506ebe46f4df
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1
