In [None]:
import os
from collections import Counter
from math import log, sqrt
from multiprocessing import Pool

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from sklearn.utils import shuffle
from surprise import CoClustering
from surprise import Dataset
from surprise import KNNBasic
from surprise import NormalPredictor
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from tqdm.notebook import tqdm

In [None]:
# Load the movie catalogue (file columns 0 and 2) and the ratings (file
# columns 0-2), naming the selected columns explicitly.
# NOTE(review): `names` is passed without `header`, so pandas treats the first
# line of each file as data — confirm the CSV files have no header row.
movies = pd.read_csv(
    'data/25m/movies.csv',
    usecols=[0, 2],
    names=['movieId', 'genres'],
)
ratings = pd.read_csv(
    'data/25m/ratings.csv',
    usecols=[0, 1, 2],
    names=['userId', 'movieId', 'rating'],
)

In [None]:
# Every genre tag of every movie; duplicates are kept on purpose because
# `genres_list` is also used for frequency counting.
genres_list = [tag for genre_str in movies['genres'] for tag in genre_str.split('|')]
# Sort the unique tags: a bare `list(set(...))` yields a different order in
# each kernel session (string hashing is randomised), which would silently
# reorder every genre-indexed column and array derived from `genres`.
genres = sorted(set(genres_list))
genres

In [None]:
# Relative frequency of each genre tag over the whole catalogue.
dict_mov = Counter(genres_list)
val_mov = np.array([dict_mov[tag] for tag in dict_mov])
val_mov = val_mov / val_mov.sum()
# Map each tag (in Counter order) to its share.
movie_distr = {tag: share for tag, share in zip(dict_mov, val_mov)}

In [None]:
# Bar chart of the catalogue-wide genre shares held in `movie_distr`.
width = 0.5
x = np.arange(len(genres))

fig, ax = plt.subplots(figsize=(22, 8))
rects1 = ax.bar(x, movie_distr.values(), width=width, label='True')

# Axis cosmetics: one tick per genre, y label, legend.
ax.set_xticks(x)
ax.set_xticklabels(movie_distr.keys(), fontsize=14)
ax.set_ylabel('Вероятность', fontsize=14)
ax.legend(fontsize=14)

fig.tight_layout()
plt.show()

In [None]:
# Column layout of the per-user genre tables: the user id, then one column
# per genre in the order of `genres`.
col_df = ['userId', *genres]

In [None]:
# Distinct user ids that appear in the ratings table.
users = list(set(ratings['userId'].tolist()))

In [None]:
# Declare the rating scale explicitly instead of relying on the library
# default; (0, 5.0) is the scale this notebook uses for the same ratings when
# it builds its second Reader.
reader = Reader(line_format='user item rating', rating_scale=(0, 5.0))

In [None]:
# Wrap the full ratings frame (user, item, rating columns) as a Surprise dataset.
data = Dataset.load_from_df(ratings, reader)

In [None]:
# data = Dataset.load_builtin('ml-20m')

In [None]:
# Baseline: 3-fold cross-validation of a default KNNBasic model on `data`,
# reporting RMSE and MAE, with 4 parallel jobs. The result of
# `cross_validate` is the cell output.
algo = KNNBasic()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True, n_jobs=4)

In [None]:
# 70/30 split of the Surprise dataset. Seeded so that the split is the same
# after a kernel restart.
trainset, testset = train_test_split(data, test_size=.3, random_state=42)

In [None]:
# Shuffle with a fixed seed so that the split, and every table derived from
# it, can be reproduced after a kernel restart.
df = shuffle(ratings, random_state=42)
# The first 3,500,000 shuffled rows form the training frame, the remainder the
# test frame; both get a fresh 0..n-1 index.
trainset = df[:3500000].reset_index(drop=True)
testset = df[3500000:].reset_index(drop=True)

In [None]:
# Reader with an explicit 0-5 rating scale, then a Surprise dataset built from
# the user / item / rating columns of the training frame.
reader = Reader(rating_scale=(0, 5.0))

data = Dataset.load_from_df(trainset.loc[:, ['userId', 'movieId', 'rating']], reader)

In [None]:
# Surprise trainset over every row of `data`; the test rows are passed on as a
# plain 2-D array of the test frame.
train = data.build_full_trainset()
test = testset.to_numpy()

In [None]:
# Fit a NormalPredictor on the training set and produce predictions for the
# test rows.
algo = NormalPredictor()

algo.fit(train)
predictions = algo.test(test)

In [None]:
# Use the cached predictions when the file exists; otherwise fall back to the
# predictions just computed in memory. The cache file is only written further
# down in this notebook, so an unconditional read fails on a first run.
if os.path.exists('data/predictions.csv'):
    predictions = pd.read_csv('data/predictions.csv')
else:
    predictions = pd.DataFrame(predictions)

In [None]:
# Distinct user ids that received at least one prediction.
usersId = list(set(predictions['uid'].to_numpy()))
# `pred` is the predictions table as a DataFrame.
pred = pd.DataFrame(data=predictions)

In [None]:
# Persist the training frame so later sessions can reload it instead of
# re-splitting.
trainset.to_csv('data/trainset.csv', index=None)

In [None]:
# Reload the training frame from its CSV cache.
trainset = pd.read_csv('data/trainset.csv')

In [None]:
# Recommended rows: predictions whose estimate is at least 3.5.
rec = pred[pred['est'] >= 3.5]

In [None]:
# Reference ("true") rows: training ratings of at least 3.5.
# Earlier variant kept for reference: pred.loc[pred['r_ui']>3.5]
dist = trainset[trainset['rating'] >= 3.5]

In [None]:
# Display the recommended rows for inspection.
rec

In [None]:
# Display the reference rows for inspection.
dist

In [None]:
# Item ids of the recommended rows, as an array.
rec['iid'].values

In [None]:
# Display the column layout ('userId' followed by the genres).
col_df

In [None]:
# Zero-filled tables with one row per user and one column per entry of
# `col_df`. The row width is taken from `col_df` instead of a hard-coded 21,
# and every row is its own list rather than n references to a single list.
us = [[0] * len(col_df) for _ in range(len(usersId))]
t_us = [[0] * len(col_df) for _ in range(len(set(dist['userId'])))]
t_dist = pd.DataFrame(data=t_us, columns=col_df)
p_dist = pd.DataFrame(data=us, columns=col_df)

In [None]:
# Fill the id column of both tables: predicted users for `p_dist`, users with
# at least one reference row for `t_dist`.
t_dist['userId'] = list(set(dist['userId'].to_numpy()))
p_dist['userId'] = usersId

In [None]:
# Single-row zero tables with one column per genre; the width follows
# `genres` instead of a hard-coded 20.
gen_df_rec = pd.DataFrame(data=[[0] * len(genres)], columns=genres)
gen_df_true = pd.DataFrame(data=[[0] * len(genres)], columns=genres)

In [None]:
# Genre -> count accumulator for the recommended items, starting at zero.
gen_df_rec = dict.fromkeys(genres, 0)

In [None]:
# Count genre occurrences over all recommended items (rows of `rec`).
# Start from zero so that re-running this cell does not add on top of a
# previous run.
gen_df_rec = {x: 0 for x in genres}
# Build the movieId -> genre-string lookup once instead of scanning `movies`
# for every item. Iterating in reverse keeps the first row of a duplicated id.
genre_by_movie = dict(zip(movies['movieId'].values[::-1], movies['genres'].values[::-1]))
for i in tqdm(rec['iid'].values):
    genre_str = genre_by_movie.get(int(i))
    if genre_str is None:
        # Item id not present in `movies`: nothing to count.
        continue
    for k in genre_str.split('|'):
        gen_df_rec[k] += 1

In [None]:
# Genre -> count accumulator for the reference rows, starting at zero.
gen_df_true = dict.fromkeys(genres, 0)

In [None]:
# Count genre occurrences over all reference rows (rows of `dist`).
# Start from zero so that re-running this cell does not add on top of a
# previous run.
gen_df_true = {x: 0 for x in genres}
# Build the movieId -> genre-string lookup once instead of scanning `movies`
# for every row. Iterating in reverse keeps the first row of a duplicated id.
genre_by_movie = dict(zip(movies['movieId'].values[::-1], movies['genres'].values[::-1]))
for i in tqdm(dist['movieId'].values):
    genre_str = genre_by_movie.get(int(i))
    if genre_str is None:
        # Movie id not present in `movies`: nothing to count.
        continue
    for k in genre_str.split('|'):
        gen_df_true[k] += 1

In [None]:
# Turn the recommended-item genre counts into shares that sum to one.
dist_rec = np.array(list(gen_df_rec.values()))
dist_rec = dist_rec / dist_rec.sum()

In [None]:
# Display the genre counts of the recommended items.
gen_df_rec

In [None]:
# Display the genre counts of the reference rows.
gen_df_true

In [None]:
# Display the genre shares of the recommended items.
dist_rec

In [None]:
# Turn the reference genre counts into shares that sum to one.
dist_true = np.array(list(gen_df_true.values()))
dist_true = dist_true / dist_true.sum()

In [None]:
# Display the genre shares of the reference rows.
dist_true

In [None]:
# Tabulate the predictions and drop the `details` column. `errors='ignore'`
# keeps the cell runnable when `predictions` was reloaded from the CSV cache,
# which no longer has that column; returning a new frame instead of mutating
# in place makes the cell safe to re-run.
predictions = pd.DataFrame(predictions).drop(columns=['details'], errors='ignore')

In [None]:
# Cache the predictions table on disk.
pd.DataFrame(predictions).to_csv('data/predictions.csv', index=None)

In [None]:
# Reload the predictions table from its CSV cache.
predictions = pd.read_csv('data/predictions.csv')

In [None]:
# Display the predictions table.
predictions

In [None]:
# Training rows rated strictly above 3.5.
# NOTE(review): the `dist` filter earlier in the notebook uses `>= 3.5` on the
# same column — confirm which threshold is intended.
rat_train = trainset.loc[trainset['rating'] > 3.5]

In [None]:
# Distinct users that have at least one row in `rat_train`.
trainUsersId = list(set(rat_train['userId'].to_numpy()))

In [None]:
# Distinct movie ids that occur in the training frame.
trainMovieId = list(set(trainset['movieId'].to_numpy()))

In [None]:
# Zero-filled per-user genre table for the training users. The row width
# follows `col_df` instead of a hard-coded 21, and every row is its own list.
us = [[0] * len(col_df) for _ in range(len(trainUsersId))]
train_dist = pd.DataFrame(data=us, columns=col_df)
train_dist['userId'] = trainUsersId

In [None]:
# Fill `train_dist` with per-user genre counts in one pass over `trainset`,
# instead of a boolean scan of `trainset`, `movies` and `train_dist` for every
# user / movie / genre combination.
# Iterating in reverse keeps the first row of a duplicated movieId.
genre_by_movie = dict(zip(movies['movieId'].values[::-1], movies['genres'].values[::-1]))
user_counts = {uid: Counter() for uid in trainUsersId}
for uid, movie_id in tqdm(zip(trainset['userId'].values, trainset['movieId'].values),
                          total=len(trainset)):
    tally = user_counts.get(uid)
    genre_str = genre_by_movie.get(movie_id)
    # Skip users outside `trainUsersId` and movies missing from `movies`.
    if tally is None or genre_str is None:
        continue
    tally.update(genre_str.split('|'))
# Overwrite (rather than increment) the genre columns so that re-running the
# cell does not double the counts.
train_dist[genres] = [[user_counts.get(uid, Counter())[g] for g in genres]
                      for uid in train_dist['userId'].values]

In [None]:
tr_dist = []
def user_genre_dist(i):
    """Count how often each genre occurs among the training movies of user ``i``.

    Reads the module-level ``trainset``, ``movies`` and ``genres``.

    Args:
        i: user id, compared against ``trainset['userId']``.

    Returns:
        list: ``[i, n_1, ..., n_k]`` with one count per entry of ``genres``,
        in the order of ``genres``. Movie ids absent from ``movies`` are
        skipped.
    """
    # NOTE(review): every training row of the user is counted, not only rows
    # above the rating threshold used to select the users — confirm intended.
    movie_ids = trainset.loc[trainset['userId'] == i, 'movieId'].values
    # Build the id -> genre-string lookup once per call instead of scanning
    # `movies` once per rated movie. Reversed iteration keeps the first row of
    # a duplicated id.
    genre_by_movie = dict(zip(movies['movieId'].values[::-1], movies['genres'].values[::-1]))
    tally = Counter()
    for movie_id in movie_ids:
        genre_str = genre_by_movie.get(movie_id)
        if genre_str is not None:
            tally.update(genre_str.split('|'))
    return [i] + [tally[g] for g in genres]
        
    
# Run the per-user counting for every training user on 7 worker processes.
with Pool(processes=7) as p:
    tr_dist = p.map(func=user_genre_dist, iterable=trainUsersId)

In [None]:
# Tabulate the per-user rows ([userId, count per genre]) under the `col_df`
# column layout.
train_dist = pd.DataFrame(data=tr_dist, columns=col_df)

In [None]:
# Display the per-user genre counts of the training data.
train_dist

In [None]:
# Display the predictions table.
predictions

In [None]:
# Predictions with an estimate strictly above 3.5, and the distinct users
# that have at least one such prediction.
rat_pred = predictions[predictions['est'] > 3.5]
predUsersId = list(set(rat_pred['uid'].to_numpy()))

In [None]:
pr_dist = []
def user_genre_dist(i):
    """Count how often each genre occurs among the predicted items of user ``i``.

    Reads the module-level ``predictions``, ``movies`` and ``genres``.

    Args:
        i: user id, compared against ``predictions['uid']``.

    Returns:
        list: ``[i, n_1, ..., n_k]`` with one count per entry of ``genres``,
        in the order of ``genres``. Item ids absent from ``movies`` are
        skipped.
    """
    # NOTE(review): every prediction row of the user is counted, not only rows
    # above the `est` threshold used to select the users — confirm intended.
    movie_ids = predictions.loc[predictions['uid'] == i, 'iid'].values
    # Build the id -> genre-string lookup once per call instead of scanning
    # `movies` once per item. Reversed iteration keeps the first row of a
    # duplicated id.
    genre_by_movie = dict(zip(movies['movieId'].values[::-1], movies['genres'].values[::-1]))
    tally = Counter()
    for movie_id in movie_ids:
        genre_str = genre_by_movie.get(movie_id)
        if genre_str is not None:
            tally.update(genre_str.split('|'))
    return [i] + [tally[g] for g in genres]
        
    
# Run the per-user counting for every user in `predUsersId` on 7 worker
# processes.
with Pool(processes=7) as p:
    pr_dist = p.map(func=user_genre_dist, iterable=predUsersId)

In [None]:
# Tabulate the per-user rows ([userId, count per genre]) under the `col_df`
# column layout.
pred_dist = pd.DataFrame(data=pr_dist, columns=col_df)

In [None]:
# Display the per-user genre counts of the predictions.
pred_dist

In [None]:
# Scratch list for the normalised rows of `pred_dist`.
temp = []

In [None]:
# Normalise every user's genre counts so that the row sums to one.
# `temp` is rebuilt here instead of appended to, so re-running this cell
# cannot leave duplicated rows in it.
temp = [row / sum(row) for row in pred_dist[genres].values]

In [None]:
# Replace the genre columns with the normalised rows, then display the frame.
pred_dist[genres] = temp
pred_dist

In [None]:
# Scratch list for the normalised rows of `train_dist`.
temp = []

In [None]:
# Normalise every user's genre counts so that the row sums to one.
# `temp` is rebuilt here instead of appended to, so re-running this cell
# cannot leave duplicated rows in it.
temp = [row / sum(row) for row in train_dist[genres].values]

In [None]:
# Replace the genre columns with the normalised rows, then display the frame.
train_dist[genres] = temp
train_dist

In [None]:
# Cache both per-user genre tables on disk.
train_dist.to_csv('data/train_dist.csv', index=None)
pred_dist.to_csv('data/pred_dist.csv', index=None)

In [None]:
# Reload both per-user genre tables from their CSV caches.
train_dist = pd.read_csv('data/train_dist.csv')
pred_dist = pd.read_csv('data/pred_dist.csv')

In [None]:
cal = []
lambd = 0.5
alpha = 0.001
def CL_div(i):
    """Score one prediction row against the user's genre calibration.

    For every genre of the row's movie, ``p`` is the user's share in
    ``train_dist`` and ``q`` the user's share in ``pred_dist`` (0 when the
    user has no row), smoothed as ``(1 - alpha) * q + alpha * p``. Genres with
    ``p > 0`` contribute ``p * log(p / q)``; genres with ``p`` equal to zero
    or NaN contribute the smoothed ``q``.

    Args:
        i: indexable row; ``i[0]`` is the user id, ``i[1]`` the movie id and
            ``i[3]`` the score that gets adjusted (presumably the ``est``
            column of ``predictions`` — confirm the column order).

    Returns:
        tuple: ``(abs(i[3] - lambd * CL), CL)``.

    Raises:
        IndexError: if the movie id is not present in ``movies``.
    """
    uid = int(i[0])
    genr = movies.loc[movies['movieId']==int(i[1]), 'genres'].values[0].split('|')
    # Look the user's rows up once rather than once per genre.
    p_row = train_dist.loc[train_dist['userId'] == uid]
    q_row = pred_dist.loc[pred_dist['userId'] == uid]
    CL = 0
    for g in genr:
        p_vals = p_row[g].values
        p = p_vals[0] if p_vals.size else 0
        q_vals = q_row[g].values
        q = q_vals[0] if q_vals.size else 0
        q = (1-alpha)*q+alpha*p
        # `p` is a share in [0, 1]: test it against zero directly. Truncating
        # it with int() treated every share below 1 as zero.
        if np.isnan(p) or p == 0:
            # Accumulate; plain assignment discarded the terms of the genres
            # processed before this one.
            CL += q
            continue
        CL += p*log(p/q)
    return abs(1.0*i[3]-lambd*CL), CL

# Score every prediction row on 7 worker processes; tqdm wraps the input rows.
with Pool(processes=7) as po:
    cal = po.map(func=CL_div, iterable=tqdm(predictions.values))

In [None]:
cal = []
lambd = 0.5
alpha = 0.001
def CL_div(i):
    """Score one prediction row using only the first genre of its movie.

    ``p`` is the user's share of that genre in ``train_dist`` and ``q`` the
    user's share in ``pred_dist`` (0 when the user has no row), smoothed as
    ``(1 - alpha) * q + alpha * p``. ``CL`` is ``p * log(p / q)`` when
    ``p > 0`` and the smoothed ``q`` when ``p`` is zero or NaN.

    Args:
        i: indexable row; ``i[0]`` is the user id, ``i[1]`` the movie id and
            ``i[3]`` the score that gets adjusted (presumably the ``est``
            column of ``predictions`` — confirm the column order).

    Returns:
        tuple: ``(abs(lambd * i[3] + lambd * CL), CL)``.

    Raises:
        IndexError: if the movie id is not present in ``movies``.
    """
    genr = movies.loc[movies['movieId']==int(i[1]), 'genres'].values[0].split('|')
    # NOTE(review): only the first listed genre is used — confirm intended.
    g = genr[0]
    p_vals = train_dist.loc[train_dist['userId']==int(i[0]), g].values
    p = p_vals[0] if p_vals.size else 0
    q_vals = pred_dist.loc[pred_dist['userId']==int(i[0]), g].values
    q = q_vals[0] if q_vals.size else 0
    q = (1-alpha)*q+alpha*p
    # `p` is a share in [0, 1]: test it against zero directly. Truncating it
    # with int() treated every share below 1 as zero.
    if np.isnan(p) or p == 0:
        CL = q
    else:
        CL = p*log(p/q)
    # The previous return statement had an unbalanced parenthesis, which made
    # the whole cell a SyntaxError.
    return abs(lambd*i[3]+lambd*CL), CL

# Score every prediction row on 7 worker processes; tqdm wraps the input rows.
with Pool(processes=7) as po:
    cal = po.map(func=CL_div, iterable=tqdm(predictions.values))

In [None]:
# Split the (adjusted score, divergence) pairs in `cal` into two lists.
CL = [pair[1] for pair in cal]
y_cal = [pair[0] for pair in cal]

In [None]:
# FIXME(review): `lines` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel. It also replaces the `y_cal`
# built from `cal` in the previous cell. Presumably `lines` held values read
# back from a file — confirm, or remove this cell.
y_cal = []
for i in lines:
    y_cal.append(float(i))

In [None]:
# Mean divergence over all scored rows.
np.array(CL).mean()

In [None]:
# Estimated and actual ratings as plain arrays for the error metrics.
y_true = predictions['r_ui'].to_numpy()
y_pred = predictions['est'].to_numpy()

In [None]:
# Mean squared error of the raw estimates and of the adjusted scores.
mse(y_true, y_pred), mse(y_true, y_cal)

In [None]:
# Root mean squared error of the raw estimates and of the adjusted scores.
# The root is taken explicitly because the `squared=False` argument is
# deprecated in recent scikit-learn releases.
sqrt(mse(y_true, y_pred)), sqrt(mse(y_true, y_cal))

In [None]:
# Attach the adjusted scores as a new `cal` column.
predictions['cal'] = y_cal

In [None]:
# Display the predictions table, now including `cal`.
predictions

In [None]:
# Rows whose adjusted score is strictly above 3.5, and the distinct users that
# have at least one such row.
rat_cal = predictions[predictions['cal'] > 3.5]
calUsersId = list(set(rat_cal['uid'].to_numpy()))

In [None]:
cal_dist = []
def user_genre_dist(i):
    """Count how often each genre occurs among the predicted items of user ``i``.

    Reads the module-level ``predictions``, ``movies`` and ``genres``.

    Args:
        i: user id, compared against ``predictions['uid']``.

    Returns:
        list: ``[i, n_1, ..., n_k]`` with one count per entry of ``genres``,
        in the order of ``genres``. Item ids absent from ``movies`` are
        skipped.
    """
    # NOTE(review): every prediction row of the user is counted, regardless of
    # its `cal` value, so for a given user this yields the same counts as the
    # uncalibrated variant — confirm whether only rows of `rat_cal` were meant.
    movie_ids = predictions.loc[predictions['uid'] == i, 'iid'].values
    # Build the id -> genre-string lookup once per call instead of scanning
    # `movies` once per item. Reversed iteration keeps the first row of a
    # duplicated id.
    genre_by_movie = dict(zip(movies['movieId'].values[::-1], movies['genres'].values[::-1]))
    tally = Counter()
    for movie_id in movie_ids:
        genre_str = genre_by_movie.get(movie_id)
        if genre_str is not None:
            tally.update(genre_str.split('|'))
    return [i] + [tally[g] for g in genres]
        
    
# Run the per-user counting for every user in `calUsersId` on 7 worker
# processes.
with Pool(processes=7) as p:
    cal_dist = p.map(func=user_genre_dist, iterable=calUsersId)

In [None]:
# Tabulate the per-user rows ([userId, count per genre]) under the `col_df`
# column layout.
cal_dist = pd.DataFrame(data=cal_dist, columns=col_df)

In [None]:
# Display the per-user genre counts of the calibrated selection.
cal_dist

In [None]:
# Scratch list for the normalised rows of `cal_dist`.
temp = []

In [None]:
# Normalise every user's genre counts so that the row sums to one.
# `temp` is rebuilt here instead of appended to, so re-running this cell
# cannot leave duplicated rows in it.
temp = [row / sum(row) for row in cal_dist[genres].values]

In [None]:
# Replace the genre columns with the normalised rows, then display the frame.
cal_dist[genres] = temp
cal_dist

In [None]:
cal = []
alpha = 0.001
def CL_div(i):
    """Hellinger-style distance between a user's train and calibrated shares.

    Only the genres of the row's movie are compared. ``p`` is the user's share
    in ``train_dist`` and ``q`` the user's share in ``cal_dist``; both are 0
    when the user has no row in the respective table.

    Args:
        i: indexable row; ``i[0]`` is the user id and ``i[1]`` the movie id.

    Returns:
        float: ``sqrt(sum_g (sqrt(p_g) - sqrt(q_g))**2) / sqrt(2)``.

    Raises:
        IndexError: if the movie id is not present in ``movies``.
    """
    uid = int(i[0])
    genr = movies.loc[movies['movieId']==int(i[1]), 'genres'].values[0].split('|')
    # Look the user's rows up once rather than once per genre.
    p_row = train_dist.loc[train_dist['userId'] == uid]
    q_row = cal_dist.loc[cal_dist['userId'] == uid]
    CL = 0
    for g in genr:
        p_vals = p_row[g].values
        p = p_vals[0] if p_vals.size else 0
        q_vals = q_row[g].values
        q = q_vals[0] if q_vals.size else 0
        if np.isnan(p):
            # A NaN share is treated as zero, for which the squared term
            # reduces to q. Accumulate it; plain assignment discarded the
            # terms of the genres processed before this one.
            CL += q
            continue
        CL += (sqrt(p)-sqrt(q))**2
    return 1/sqrt(2)*sqrt(CL)

# Compute the distance for every prediction row on 7 worker processes.
with Pool(processes=7) as po:
    CL = po.map(func=CL_div, iterable=predictions.values)

In [None]:
cal = []
lambd = 0.5
alpha = 0.001
def CL_div(i):
    """Divergence term for the first genre of a row's movie (calibrated table).

    ``p`` is the user's share of that genre in ``train_dist`` and ``q`` the
    user's share in ``cal_dist`` (0 when the user has no row), smoothed as
    ``(1 - alpha) * q + alpha * p``.

    Args:
        i: indexable row; ``i[0]`` is the user id and ``i[1]`` the movie id.

    Returns:
        float: ``p * log(p / q)`` when ``p > 0``, otherwise the smoothed ``q``.

    Raises:
        IndexError: if the movie id is not present in ``movies``.
    """
    genr = movies.loc[movies['movieId']==int(i[1]), 'genres'].values[0].split('|')
    # NOTE(review): only the first listed genre is used — confirm intended.
    g = genr[0]
    p_vals = train_dist.loc[train_dist['userId']==int(i[0]), g].values
    p = p_vals[0] if p_vals.size else 0
    q_vals = cal_dist.loc[cal_dist['userId']==int(i[0]), g].values
    q = q_vals[0] if q_vals.size else 0
    q = (1-alpha)*q+alpha*p
    # `p` is a share in [0, 1]: test it against zero directly. Truncating it
    # with int() treated every share below 1 as zero.
    if np.isnan(p) or p == 0:
        CL = q
    else:
        CL = p*log(p/q)
    return CL

# Compute the term for every prediction row on 7 worker processes; tqdm wraps
# the input rows.
with Pool(processes=7) as po:
    cal = po.map(func=CL_div, iterable=tqdm(predictions.values))

In [None]:
# Mean of the per-row values in `cal`.
np.array(cal).mean()

In [None]:
# Zero-filled per-user genre table for the users in `calUsersId`. The row
# width follows `col_df` instead of a hard-coded 21, and every row is its own
# list.
us = [[0] * len(col_df) for _ in range(len(calUsersId))]
c_dist = pd.DataFrame(data=us, columns=col_df)

In [None]:
# Fill the id column with the users in `calUsersId`.
c_dist['userId'] = calUsersId

In [None]:
# Single-row zero table with one column per genre; the width follows `genres`
# instead of a hard-coded 20.
gen_df_cal = pd.DataFrame(data=[[0] * len(genres)], columns=genres)

In [None]:
# Genre -> count accumulator for the calibrated selection, starting at zero.
gen_df_cal = dict.fromkeys(genres, 0)

In [None]:
# Count genre occurrences over all rows of `rat_cal`.
# Start from zero so that re-running this cell does not add on top of a
# previous run.
gen_df_cal = {x: 0 for x in genres}
# Build the movieId -> genre-string lookup once instead of scanning `movies`
# for every item. Iterating in reverse keeps the first row of a duplicated id.
genre_by_movie = dict(zip(movies['movieId'].values[::-1], movies['genres'].values[::-1]))
for i in tqdm(rat_cal['iid'].values):
    genre_str = genre_by_movie.get(int(i))
    if genre_str is None:
        # Item id not present in `movies`: nothing to count.
        continue
    for k in genre_str.split('|'):
        gen_df_cal[k] += 1

In [None]:
# Turn the calibrated genre counts into shares that sum to one.
dist_cal = np.array(list(gen_df_cal.values()))
dist_cal = dist_cal / dist_cal.sum()

In [None]:
# Display the predictions table.
predictions

In [None]:
# Grouped bar chart comparing the genre shares of the reference rows, the raw
# recommendations and the calibrated selection.
width = 0.3
x = np.arange(len(genres))

fig, ax = plt.subplots(figsize=(22, 12))
# Three bars per genre: left = reference, centre = predicted, right = calibrated.
rects1 = ax.bar(x - 3*width/3, dist_true, width=width, label='True')
rects2 = ax.bar(x, dist_rec, width=width, label='Predicted')
rects3 = ax.bar(x + 3*width/3, dist_cal, width=width, label='Calibrated')

# Axis cosmetics: one rotated tick label per genre, y label, legend.
ax.set_xticks(x)
ax.set_xticklabels(genres, fontsize=20, rotation=90)
ax.tick_params(axis="y", labelsize=20)
ax.set_ylabel('Вероятность', fontsize=20)
ax.legend(fontsize=20)

fig.tight_layout()
plt.show()

In [None]:
# Confusion counts with "positive" meaning a value of at least 4:
# tp = actual and estimate both positive, fp = only the estimate positive,
# fn = only the actual rating positive.
r_ui = predictions['r_ui'].values
est = predictions['est'].values
tp = int(np.sum((r_ui >= 4) & (est >= 4)))
fp = int(np.sum((r_ui < 4) & (est >= 4)))
fn = int(np.sum((r_ui >= 4) & (est < 4)))

In [None]:
# Precision and recall from the confusion counts. A metric whose denominator
# is zero is undefined and reported as NaN instead of raising
# ZeroDivisionError.
# NOTE(review): `rec` is also the name of the recommended-rows frame built
# earlier in the notebook; after this cell it holds a float.
prec = tp / (tp + fp) if (tp + fp) else float('nan')
rec = tp / (tp + fn) if (tp + fn) else float('nan')
prec, rec