In [1]:
from os.path import (
    abspath,
    dirname,
    exists,
    join,
)
from pandas import (
    read_csv,
    merge,
    concat,
    DataFrame,
    Series,
)
from random import randint
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
# Locations of the MovieLens 1M train/test splits, resolved relative to the
# directory the notebook is run from.
PWD = abspath('')
datadir = join(PWD, '../Datasets/ml-1m/')
files = {
    'train': join(datadir, 'train.csv'),
    'test': join(datadir, 'test.csv'),
}

# Tunables used further down the notebook.
THRESHOLD = 2   # passed as `min_periods` when correlating users
TAKE_TOP = 20   # cap on the number of neighbours used per prediction

In [3]:
# Read every CSV listed in `files` into a DataFrame stored under the same key.
dfs = {}
for split, csv_path in files.items():
    dfs[split] = read_csv(csv_path, engine='python', encoding='latin1')

In [4]:
# Peek at the first rows of the training ratings.
dfs['train'].head(5)

Unnamed: 0,title,rating,userId
0,Airplane! (1980),4.0,1
1,Aladdin (1992),4.0,1
2,Antz (1998),4.0,1
3,Apollo 13 (1995),5.0,1
4,Awakenings (1990),5.0,1


In [5]:
# One empty {title: rating} dict per distinct user seen in the training set.
users = list(set(dfs['train']['userId'].to_numpy()))
ratings = {user: {} for user in users}

In [6]:
# Fill the nested lookup ratings[userId][title] -> rating from the training
# rows; the column order is (title, rating, userId).
rows = dfs['train'].to_numpy()
for title, score, user in rows:
    ratings[user][title] = score

In [7]:
# Peek at the first rows of the held-out ratings.
dfs['test'].head(5)

Unnamed: 0,title,rating,userId
0,Snow White and the Seven Dwarfs (1937),4.0,1
1,"Sound of Music, The (1965)",5.0,1
2,Star Wars: Episode IV - A New Hope (1977),4.0,1
3,Tarzan (1999),3.0,1
4,Titanic (1997),4.0,1


In [8]:
# (rows, columns) of the train and test splits, in that order.
tuple(dfs[split].shape for split in ('train', 'test'))

((755865, 3), (99865, 3))

In [9]:
# Sanity check: how many test rows refer to a title that also occurs in train?
# If the row count equals the full test shape, every test title is known.
title_seen_in_train = dfs['test']['title'].isin(dfs['train']['title'])
dfs['test'].loc[title_seen_in_train].shape

(99865, 3)

In [10]:
# Title x user matrix of training ratings; NaN where a user did not rate a title.
train_ratings = dfs['train']
dfs['users'] = train_ratings.pivot_table(
    values='rating',
    index=['title'],
    columns=['userId'],
)

In [11]:
# Peek at the title x user rating matrix.
dfs['users'].head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",,,,,,,,,,4.0,...,,,,,,,,,,
10 Things I Hate About You (1999),,,,,,,,,,,...,,,,,,2.0,,,,
101 Dalmatians (1961),,,,,,,,,,,...,,4.0,,,,4.0,,,,
101 Dalmatians (1996),,,,,,,,,,,...,,,,,1.0,,,,,
12 Angry Men (1957),,,,,,,,,,3.0,...,,,,,,,4.0,,,5.0


In [12]:
# Pairwise Pearson correlation between users (columns of the rating matrix).
# Pairs with fewer than THRESHOLD co-rated titles are left as NaN.
corr_user = dfs['users'].corr(method='pearson', min_periods=THRESHOLD)

In [13]:
# Map correlations from [-1, 1] onto [0, 1] so they can serve as weights.
# NOTE: this cell is not idempotent - running it a second time without
# recomputing `corr_user` rescales the already-rescaled values again.
corr_user = (corr_user + 1.0) / 2.0

In [14]:
# Peek at the rescaled user-user similarity matrix.
corr_user.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.7,0.238884,0.75,0.270584,0.590784,,0.166667,0.65667,0.424752,...,0.419936,0.908248,,,0.531009,0.426223,0.302358,,0.5508,0.527408
2,0.7,1.0,0.616775,0.295876,0.30215,0.550436,0.59759,0.524901,0.602029,0.450896,...,0.117647,0.550315,0.646647,,0.740294,0.667588,0.646267,,0.919263,0.515827
3,0.238884,0.616775,1.0,1.0,0.271782,0.328501,0.9375,0.237387,0.565653,0.510779,...,0.222222,0.403175,0.099108,,0.673226,0.610754,0.5,,0.543355,0.284105
4,0.75,0.295876,1.0,1.0,,,1.0,,0.75,0.311018,...,,0.5,0.336337,,0.75,0.790146,0.0,,,0.589803
5,0.270584,0.30215,0.271782,,1.0,0.210824,0.656348,0.741886,0.573793,0.464953,...,0.63261,0.853553,0.564957,0.75,0.758365,0.610301,0.445316,0.0,1.0,0.677139


In [15]:
# User ids in the column order of the similarity matrix.
users = corr_user.columns.to_list()

In [16]:
# Nested lookup similarities[u][v] -> similarity, keeping only the pairs for
# which a correlation could be computed (NaN entries are dropped).
similarities = {}
for u in users:
    known = corr_user[u].dropna()
    similarities[u] = dict(zip(known.index, known.values))

In [17]:
# The title x user rating matrix was already built above as dfs['users'] with
# exactly the same pivot_table arguments; reuse it instead of recomputing the
# expensive pivot. Both keys now refer to the same DataFrame, which is only
# read (never mutated) by the prediction cell.
dfs['pivot'] = dfs['users']

In [18]:
%time
rows = dfs['test'].to_numpy()
predictions = []
for row in rows[:]:
    rating = 0
    u = row[2]
    m = row[0]
    r = row[1]
    tot = 0
    raters = dfs['pivot'].loc[m].dropna().index
    sim_tot = 0
    #print(raters)
    for v in raters:
        if v not in similarities[u]:
            continue
        if similarities[u][v] >= .7:
            rating += ratings[v][m]*similarities[u][v]
            sim_tot += similarities[u][v]
            tot += 1
            if tot > TAKE_TOP:
                break
    if tot > 0:
        rating /= sim_tot
        #rating *= 5
        rating += .5
        rating = int(rating)
        #rating /= 5
    else:
        rating = 3
    predictions.append(rating)

Wall time: 0 ns


In [19]:
# Store the predicted ratings next to the true ratings in the test frame.
dfs['test'] = dfs['test'].assign(prediction=predictions)

In [20]:
# Compare true and predicted ratings for the first test rows.
dfs['test'].head(5)

Unnamed: 0,title,rating,userId,prediction
0,Snow White and the Seven Dwarfs (1937),4.0,1,4
1,"Sound of Music, The (1965)",5.0,1,4
2,Star Wars: Episode IV - A New Hope (1977),4.0,1,5
3,Tarzan (1999),3.0,1,3
4,Titanic (1997),4.0,1,3


In [21]:
# Mean absolute percentage error of the predictions on the test set.
mean_absolute_percentage_error(
    y_true=dfs['test']['rating'],
    y_pred=dfs['test']['prediction'],
)

0.30836445868589263

In [22]:
# Summary statistics of the predicted ratings.
dfs['test'].loc[:, 'prediction'].describe()

count    99865.000000
mean         3.610875
std          0.804491
min          1.000000
25%          3.000000
50%          4.000000
75%          4.000000
max          5.000000
Name: prediction, dtype: float64