In [4]:
# !pip install --upgrade pip --quiet
# !pip install numpy --quiet
# !pip install pandas --quiet
# !pip install matplotlib --quiet
# !pip install scipy --quiet
# !pip install scikit-learn --quiet
# !pip install torch --quiet

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sparse
from sklearn.model_selection import train_test_split
import multiprocessing
from multiprocessing import Pool
from joblib import Parallel, delayed
from ast import literal_eval
import heapq
import timeit
import time
import torch

In [5]:
# !wget -nc https://files.grouplens.org/datasets/movielens/ml-20m.zip
# !unzip -n ml-20m.zip

In [10]:
df = pd.read_csv('../ml-20m/ratings.csv')
df.drop(columns=['timestamp'],inplace=True)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=100)
df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [7]:
df_user = df_train.groupby('userId').agg(
    user_count = ('rating', 'count'),
    user_mean = ('rating', 'mean')
).reset_index()
df_item = df_train.groupby('movieId').agg(
    item_count = ('rating', 'count'),
    item_mean = ('rating', 'mean')
).reset_index()

In [8]:
df_regular_user = df_user[df_user['user_count']>=50].reset_index(drop=True)
df_common_movie = df_item[df_item['item_count']>=100].reset_index(drop=True)
user_dict = dict(zip(df_regular_user.userId, df_regular_user.index))
user_list = list(df_regular_user['userId'])
user_bias = list(df_regular_user['user_mean'])
movie_dict = dict(zip(df_common_movie.movieId, df_common_movie.index))
movie_list = list(df_common_movie['movieId'])
n_user, n_movie = len(user_list), len(movie_list)

In [9]:
df_train = pd.merge(df_train, df_common_movie, how='inner', on='movieId')
df_train = pd.merge(df_train, df_regular_user, how='inner', on='userId')
df_train['rating'] = df_train['rating']-df_train['user_mean']
df_train['user_index'] = df_train['userId'].map(user_dict).astype('int64')
df_train['movie_index'] = df_train['movieId'].map(movie_dict).astype('int64')

In [10]:
data = sparse.csr_matrix((df_train.rating, (df_train.user_index, df_train.movie_index)), shape=(n_user, n_movie))
rating_dict = dict(zip(list(zip(df_train.user_index, df_train.movie_index)), df_train.rating))

In [11]:
n_user, n_movie

(73589, 7983)

In [86]:
class topk_heap:
    def __init__(self, size):
        self.size = size
        self.container = []
    def add(self, x):
        if len(self.container) < self.size:
            heapq.heappush(self.container, x)
        elif x > self.container[0]:
            heapq.heappushpop(self.container, x)
    def get_users(self):
        return [(x[2],x[0]*x[1]) for x in sorted(self.container)]
    def get_movies(self):
        return [x[1] for x in sorted(self.container, reverse=True)]

In [64]:
def get_similar_users(u):
    obj = topk_heap(limit)
    user_row = data.getrow(u).indices
    user_val = data.getrow(u).data
    user_nnz = data.getrow(u).nnz
    user_arr = np.arange(n_user)
    np.random.shuffle(user_arr)
    for v in user_arr[0:2500]:
        if u==v:
            continue
        row = data.getrow(v).indices
        val = data.getrow(v).data
        nnz = data.getrow(v).nnz
        i, j = 0, 0
        a2, b2, ab, c = 0, 0, 0, 0
        while i<user_nnz and j<nnz:
            if user_row[i]==row[j]:
                a, b = user_val[i], val[j]
                a2=a2+(a*a); b2=b2+(b*b); ab=ab+(a*b); c=c+1;
                i=i+1; j=j+1;
            elif user_row[i]<row[j]:
                i=i+1;
            elif user_row[i]>row[j]:
                j=j+1;
        if c>=10 and a2>0 and b2>0:
            x = ab/np.sqrt(a2*b2)
            s = 1 if x>=0 else -1
            obj.add((abs(x), s, v))
    return (u,obj.get_users())

In [38]:
%%time
limit = 25
n = 1
for u in range(n):
    get_similar_users(u)

CPU times: user 643 ms, sys: 12.2 ms, total: 655 ms
Wall time: 642 ms


In [49]:
%%time
limit = 25
n = n_user
if __name__ == '__main__':
    with Pool() as pool:
        similar_user = dict(pool.imap_unordered(get_similar_users, range(n)))

CPU times: user 17.8 s, sys: 4.2 s, total: 22 s
Wall time: 1h 38min 23s


In [65]:
df_similar_user = pd.DataFrame(list(similar_user.items()), columns=['user', 'similar_user'])
df_similar_user = df_similar_user.sort_values('user').reset_index(drop=True)
df_similar_user.to_csv('similar_user.csv', index=False)

In [30]:
df_similar_user = pd.read_csv('similar_user.csv')
df_similar_user['similar_user'] = df_similar_user['similar_user'].apply(literal_eval)
similar_user = df_similar_user.iloc[:,1].values

In [29]:
rating_dict[(0,47)]

-0.23880597014925353

In [47]:
user_bias[0]

3.7388059701492535

In [45]:
dict_user_bias = dict(zip(np.arange(n_user), user_bias))

In [46]:
dict_user_bias[0]

3.7388059701492535

In [None]:
%%time
for _ in range(1000000):
    x = np.random.randint(n_user)
    

In [112]:
def predict(u, m):#predict rating by user u to movie m
    num, den, cnt = 0, 0, 0
    for v, cor in similar_user[u]:
        try:
            num += cor*rating_dict[(v,m)]
            den += abs(cor); cnt += 1;
        except KeyError:
            continue
    if cnt<3:
        return np.nan
    return user_bias[u] + (num/den)

In [81]:
def cf_recommendation(u):
    obj = topk_heap(25)
    for m in range(n_movie):
        try:
            x = rating_dict[(u,m)]
        except KeyError:
            obj.add((predict(u,m), m))
    return obj.get_movies()

In [95]:
def raw_predict(u, m):
    try:
        u = user_dict[u]
    except :
        return np.nan
    try:
        m = movie_dict[m]
    except :
        return np.nan
    return predict(u,m)

In [117]:
raw_predict(1,0)

nan

In [119]:
%%time
df_train['pred_rating'] = df_train.apply(lambda x : raw_predict(x['userId'], x['movieId']), axis=1)

CPU times: user 4min 31s, sys: 1.51 s, total: 4min 32s
Wall time: 4min 32s
