In [1]:
from os.path import (
    abspath,
    dirname,
    exists,
    join,
)
from pandas import (
    read_csv,
    merge,
    concat,
    DataFrame,
    Series,
)
from random import randint
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_percentage_error,
    average_precision_score,
    f1_score,
)

In [2]:
PWD = (abspath(''))
datadir = join(PWD, '../Datasets/ml-1m/')
files = {}
files['train'] = join(datadir, 'train.csv')
files['test'] = join(datadir, 'test.csv')
THRESHOLD = 5
NEIGHBOR = 20
SIMILARITY = .7

In [3]:
dfs = {}
for file in files:
    dfs[file] = read_csv(
        files[file],
        engine='python',
        encoding='latin1',
    )
    dfs[file]['rating'] = dfs[file]['rating'].astype(int)

In [4]:
dfs['train'].head()

Unnamed: 0,title,rating,userId
0,Airplane! (1980),4,1
1,Aladdin (1992),4,1
2,Antz (1998),4,1
3,Apollo 13 (1995),5,1
4,Awakenings (1990),5,1


In [5]:
ratings = {}
users = dfs['train']['userId'].to_numpy()
users = set(users)
users = list(users)
for user in users:
    ratings[user] = {}

In [6]:
rows = dfs['train'].to_numpy()
for row in rows:
    ratings[row[2]][row[0]] = row[1]

In [7]:
dfs['test'].head()

Unnamed: 0,title,rating,userId
0,Snow White and the Seven Dwarfs (1937),4,1
1,"Sound of Music, The (1965)",5,1
2,Star Wars: Episode IV - A New Hope (1977),4,1
3,Tarzan (1999),3,1
4,Titanic (1997),4,1


In [8]:
dfs['train'].shape, dfs['test'].shape

((755865, 3), (99865, 3))

In [9]:
dfs['test'][dfs['test']['title'].isin(dfs['train']['title'])].shape

(99865, 3)

In [10]:
dfs['pivot'] = dfs['train'].pivot_table(
    index=['userId'],
    columns=['title'],
    values='rating',
)

In [11]:
dfs['pivot'].head()

title,"'burbs, The (1989)",10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),"13th Warrior, The (1999)",2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),2010 (1984),...,Year of Living Dangerously (1982),Yellow Submarine (1968),Yojimbo (1961),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),Young Sherlock Holmes (1985),Zero Effect (1998),eXistenZ (1999)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,4.0,,,,,,,
3,,,,,,,,,,,...,,,,,,5.0,4.0,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [12]:
%%time
corr = dfs['pivot'].corr(min_periods=THRESHOLD)

Wall time: 8.21 s


In [13]:
corr.head()

title,"'burbs, The (1989)",10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),"13th Warrior, The (1999)",2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),2010 (1984),...,Year of Living Dangerously (1982),Yellow Submarine (1968),Yojimbo (1961),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),Young Sherlock Holmes (1985),Zero Effect (1998),eXistenZ (1999)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.189689,0.088541,0.101064,0.04759,0.096866,0.295446,0.03752,0.165055,0.27459,...,0.455682,0.189847,-0.182108,0.298285,0.041385,0.3762887,0.456148,0.278425,0.039503,0.135172
10 Things I Hate About You (1999),0.189689,1.0,0.163892,0.372055,0.10173,0.266755,0.136631,0.167384,-0.068404,0.113104,...,-0.080678,0.052943,0.327377,0.194905,0.116762,0.3214718,0.200233,0.185754,0.199307,0.022365
101 Dalmatians (1961),0.088541,0.163892,1.0,0.479264,0.271421,0.11428,0.283266,0.457309,0.069166,0.190974,...,0.133182,-0.00458,0.305133,0.180285,0.249222,0.2860112,0.267038,0.013905,0.202831,-0.032436
101 Dalmatians (1996),0.101064,0.372055,0.479264,1.0,-0.129262,0.226558,0.083046,0.414381,0.080796,0.230015,...,-0.055453,0.010617,-0.013746,0.321915,0.137751,0.2299732,0.174124,0.059504,0.071884,0.096695
12 Angry Men (1957),0.04759,0.10173,0.271421,-0.129262,1.0,-0.22341,0.260387,0.341044,0.102686,0.061569,...,0.235069,0.074733,0.322539,0.131406,0.249671,-2.086715e-16,0.029403,0.281127,0.214276,0.318489


In [14]:
corr += 1.0
corr /= 2.0

In [15]:
corr.head()

title,"'burbs, The (1989)",10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),"13th Warrior, The (1999)",2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),2010 (1984),...,Year of Living Dangerously (1982),Yellow Submarine (1968),Yojimbo (1961),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),Young Sherlock Holmes (1985),Zero Effect (1998),eXistenZ (1999)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.594845,0.544271,0.550532,0.523795,0.548433,0.647723,0.51876,0.582528,0.637295,...,0.727841,0.594924,0.408946,0.649142,0.520692,0.688144,0.728074,0.639212,0.519752,0.567586
10 Things I Hate About You (1999),0.594845,1.0,0.581946,0.686027,0.550865,0.633378,0.568315,0.583692,0.465798,0.556552,...,0.459661,0.526471,0.663689,0.597452,0.558381,0.660736,0.600116,0.592877,0.599654,0.511183
101 Dalmatians (1961),0.544271,0.581946,1.0,0.739632,0.635711,0.55714,0.641633,0.728655,0.534583,0.595487,...,0.566591,0.49771,0.652566,0.590143,0.624611,0.643006,0.633519,0.506952,0.601416,0.483782
101 Dalmatians (1996),0.550532,0.686027,0.739632,1.0,0.435369,0.613279,0.541523,0.70719,0.540398,0.615008,...,0.472273,0.505308,0.493127,0.660958,0.568876,0.614987,0.587062,0.529752,0.535942,0.548348
12 Angry Men (1957),0.523795,0.550865,0.635711,0.435369,1.0,0.388295,0.630194,0.670522,0.551343,0.530784,...,0.617534,0.537366,0.66127,0.565703,0.624835,0.5,0.514702,0.640564,0.607138,0.659245


In [16]:
rows = dfs['test'].to_numpy()

In [17]:
def get_harmonic(W, X):
    den = 0
    for (w, x) in zip(W, X):
        den += (w/x)
    num = sum(W)
    res = num/den
    
    return res

In [18]:
def get_true_positive(Y, k):
    return [1 if y >= k else 0 for y in Y]

In [19]:
def get_f1(y_true, y_pred, k=3):
    y_true = get_true_positive(y_true, k)
    y_pred = get_true_positive(y_pred, k)
    
    return f1_score(y_true, y_pred)

In [20]:
%%time
predictions = []
for row in rows[:]:
    u = row[2]
    m = row[0]
    r = row[1]
    movies_sim = corr[m].dropna()
    movies_sim = movies_sim[movies_sim.values >= SIMILARITY]
    tot = 0
    rating = 0
    W = []
    X = []
    #print(u, m, r)
    #print(movies_sim)
    for n, s in zip(movies_sim.index, movies_sim.values):
        rate = dfs['pivot'][n].loc[u]
        if rate != rate:
            continue
        tot += 1
        X.append(rate)
        W.append(s)
        if tot >= NEIGHBOR:
            break
    if tot > 0:
        rating = get_harmonic(W, X)
    else:
        rating = dfs['pivot'].loc[u].dropna()
        #print(rating)
        rating = sum(rating.values)/rating.shape[0]
    predictions.append(rating)

Wall time: 47.9 s


In [21]:
dfs['test']['prediction'] = predictions

In [22]:
mean_absolute_percentage_error(dfs['test']['rating'], dfs['test']['prediction'])

0.29596744418989407

In [23]:
get_f1(dfs['test']['rating'], dfs['test']['prediction'])

0.863791857383763