## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
from tqdm import tqdm
from itertools import chain, combinations
from Association import Association
import numpy as np
import random
tqdm.pandas() #for progres_apply etc.

In [2]:
seed_index = 0

## 1. Load data from previous step

In [3]:
import pickle
rules = pickle.load(open(f"pickle_dumps/article_rules_{seed_index}.p", "rb"))
user_rules = pickle.load(open(f"pickle_dumps/user_rules_{seed_index}.p", "rb"))

In [4]:
df_test = pickle.load(open(f"pickle_dumps/test_df_{seed_index}.p", "rb"))
df_test = df_test.set_index("user_id").to_dict()["item_id"]

In [5]:
df_train = pickle.load(open(f"pickle_dumps/train_df_{seed_index}.p", "rb"))
df_train = df_train.set_index("user_id").to_dict()["item_id"]

In [6]:
class Recommender():
    def __init__(self, assos):
        pass
    
    def recommend(self, user_items, k, user_id=None, split=None):
        pass        

In [7]:
class Assos_Recommender(Recommender):
    def __init__(self, assos, sorter=lambda x: x.s * x.c):
        # use popular items to fill unused space
        # TODO (df_train is used for this)
        
        self.sorter = sorter
        self.assos = sorted(assos, key=self.sorter, reverse=True)[:100000]
        
#     def get_items(self, user_id):
#         return self.df_test[user_id]
    
    def recommend(self, user_items, k, user_id=None, split=None):
        recommendation = []
        user_items = set(user_items)
        for a in self.assos:
#             print(a.left, user_items)
            stop = False
            for l in a.left:
                # break if any element is not in the user session
                if l not in user_items:
                    stop = True
                    break
                    
                # all elements are in the user session --> add RHS of rule to the recommendation
            if not stop:
                r = a.right
                for item in r:
                    if item not in user_items:
                        if item not in recommendation:
                            recommendation.append(item)
                        if len(recommendation) >= k and k != -1:
                            break
                        
            if len(recommendation) >= k and k != -1:
                break
                
        return recommendation

In [48]:
class User_Recommender(Recommender):
    def __init__(self, assos, value=lambda x: x.s * x.c):
        self.assos = assos
        self.value = value
    
    def recommend(self, user_items, k, user_id):
        if len(user_items) <= 0 or user_id not in self.assos:
            return []
        
        user_items = set(user_items)
        rdict = dict()

        for a in self.assos[user_id]:
            item_list = list()
            for l in a.left:
                tmp = []
                tmp.extend(df_test[l])
                tmp.extend(df_train[l])
                item_list.append(set(tmp))

            if item_list:
                intersection = set.intersection(*item_list)
                tmp = intersection - user_items
                for item in tmp:
                    if item in rdict:
                        rdict[item] += self.value(a)
                    else:
                        rdict[item] = self.value(a)
            
        rlist = sorted(rdict.keys(), key=lambda x: rdict[x], reverse=True)

        recommendation = []
        for i in rlist:
            if i not in recommendation:
                recommendation.append(i)
            if len(recommendation) >= k and k != -1:
                break
        return recommendation

In [9]:
class Combo_Recommender(Recommender):
    def __init__(self, ar, ur):
        self.ar = ar
        self.ur = ur
        
    def recommend(self, user_items, k, user_id):
        recommendation = self.ur.recommend(user_items, np.ceil(k/3), user_id)
        tmp = list(user_items) + recommendation
        if len(recommendation) < k:
            recommendation += (self.ar.recommend(tmp, k-len(recommendation), user_id))
        return recommendation

In [10]:
def hitrate(rc, k, nr_users = float("1"), qualitative = False, df = df_test):
    hits = 0
    ndcg = []
    counts = 0
    avg_len = 0
    n_recall = 0
    preds = []
    
    
    
    user_ids = list(df.keys())
    random.seed(20)
    for user in tqdm(random.sample(user_ids, int(len(user_ids) * nr_users), )):
#     for user in tqdm(user_ids[: int(len(user_ids) * nr_users)]):
        user_target = df[user]
        user_items = df_train[user]
        
        r = rc.recommend(user_items, k, user)

        avg_len += len(r)

        correct = 0
        hit = False
        for item in user_target:
            if item in r:
                if not hit:
                    hits += 1
                    hit = True
                correct += 1
        n_recall += correct / max(1, (min(k, len(user_target))))

        dcg = 0
        idcg = sum(1/np.log2(i+1) for i in range(1,max(1, min(len(user_target), k))+1))
        for i, rec in enumerate(r):
            good = 1 if rec in user_target else 0
            dcg += (2**good - 1)/(np.log2(i+2))
        ndcg.append(dcg/idcg)
            
        if qualitative:      
            preds.append((user_items, user_target, r))
        counts += 1
    
    print("avg len: {:f}".format(avg_len / counts))
    if k == -1:
        print("HR\t{:.5f}".format((hits / counts)))
        print("EHR\t{:.5f}".format((hits / counts) * (avg_len /  counts)))
    else:
        print("HR@{}\t{:.5f}".format(k, (hits / counts)))
        print("normalised recall@{}\t{:.5f}".format(k, (n_recall / counts)))
        print("EHR@{}\t{:.5f}".format(k, (hits / counts) * (avg_len /  counts)))
    print("nDCG@{}\t{:.5f}".format(k, (sum(ndcg) / len(ndcg))))
    
    if qualitative:
        return {"HR": hits /counts,
               "EHR": (hits / counts) * (avg_len /  counts),
               "normalised recall": (n_recall / counts),
               "nDCG": sum(ndcg) / len(ndcg),
               "predictions": preds}
    else:
        return {"HR": hits /counts,
               "EHR": (hits / counts) * (avg_len /  counts),
               "normalised recall": (n_recall / counts),
               "nDCG": sum(ndcg) / len(ndcg)}

# Results
rule.s --> 18.317 <br>
rule.s * rule.c --> 20.545

In [109]:
results = []
partition_size = 0.1
k = 20

In [110]:
ar = Assos_Recommender(rules, lambda x: x.s * x.c)
results.append(("lambda x: x.s * x.c", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [00:22<00:00, 243.62it/s]

avg len: 19.201329
HR@20	0.53202
normalised recall@20	0.22718
EHR@20	10.21543
nDCG@20	0.16753





In [111]:
ar = Assos_Recommender(rules, lambda x: x.s + x.c)
results.append(("lambda x: x.s + x.c", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [01:43<00:00, 52.13it/s]

avg len: 15.762687
HR@20	0.52224
normalised recall@20	0.22120
EHR@20	8.23185
nDCG@20	0.18971





In [112]:
ar = Assos_Recommender(rules, lambda x: x.c)
results.append(("lambda x: x.c", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [01:43<00:00, 52.11it/s]

avg len: 15.584610
HR@20	0.51965
normalised recall@20	0.21912
EHR@20	8.09859
nDCG@20	0.18832





In [113]:
ar = Assos_Recommender(rules, lambda x: 2 * x.s + x.c)
results.append(("lambda x: 2 * x.s + x.c", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [01:43<00:00, 52.35it/s]

avg len: 15.924709
HR@20	0.52242
normalised recall@20	0.22203
EHR@20	8.31940
nDCG@20	0.18968





In [114]:
ar = Assos_Recommender(rules, lambda x: np.sqrt(np.power(x.s, 2) + np.power(x.c, 2)))
results.append(("lambda x: np.sqrt(np.power(x.s, 2) + np.power(x.c, 2))", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [01:44<00:00, 51.90it/s]

avg len: 15.595497
HR@20	0.52021
normalised recall@20	0.21958
EHR@20	8.11288
nDCG@20	0.18871





In [115]:
ar = Assos_Recommender(rules, lambda x: np.sqrt(2 * np.power(x.s, 2) + np.power(x.c, 2)))
results.append(("lambda x: np.sqrt(2 * np.power(x.s, 2) + np.power(x.c, 2))", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [01:44<00:00, 51.70it/s]

avg len: 15.603617
HR@20	0.52021
normalised recall@20	0.21958
EHR@20	8.11711
nDCG@20	0.18871





In [116]:
ar = Assos_Recommender(rules, lambda x: x.lift * x.s * x.c)
results.append(("lambda x: x.lift * x.s * x.c", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [01:30<00:00, 59.81it/s]

avg len: 18.689426
HR@20	0.53183
normalised recall@20	0.23560
EHR@20	9.93964
nDCG@20	0.18658





In [117]:
ar = Assos_Recommender(rules, lambda x: x.z * x.s)
results.append(("lambda x: x.z * x.s", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [00:51<00:00, 105.67it/s]

avg len: 19.155564
HR@20	0.51578
normalised recall@20	0.22719
EHR@20	9.88001
nDCG@20	0.17583





In [118]:
ar = Assos_Recommender(rules, lambda x: x.z * x.s *x.c)
results.append(("lambda x: x.z * x.s * x.c", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [01:30<00:00, 59.75it/s]

avg len: 18.689426
HR@20	0.53183
normalised recall@20	0.23560
EHR@20	9.93964
nDCG@20	0.18658





In [119]:
ar = Assos_Recommender(rules, lambda x: x.z * x.s * x.s *x.c)
results.append(("lambda x: x.z * x.s * x.s * x.c", hitrate(ar, k, partition_size)))

100%|██████████| 5419/5419 [00:46<00:00, 115.66it/s]

avg len: 19.161284
HR@20	0.53811
normalised recall@20	0.23961
EHR@20	10.31081
nDCG@20	0.18122





In [120]:
last = 1
for i in sorted(results, key=lambda x: x[1]["normalised recall"], reverse=False):
    gain = i[1]["normalised recall"]/last - 1
    print(last, i[1]["normalised recall"])
    print(i[0])
    print("{:.2f}%".format(gain*100))
    print(i[1])
    print("-"*5, '\n')
    
    last = i[1]["normalised recall"]

1 0.21911639495784002
lambda x: x.c
-78.09%
{'HR': 0.5196530725226056, 'EHR': 8.098590318094042, 'normalised recall': 0.21911639495784002, 'nDCG': 0.18831996732359677}
----- 

0.21911639495784002 0.21957773468841763
lambda x: np.sqrt(np.power(x.s, 2) + np.power(x.c, 2))
0.21%
{'HR': 0.5202066801992987, 'EHR': 8.112881889094506, 'normalised recall': 0.21957773468841763, 'nDCG': 0.18871293410279336}
----- 

0.21957773468841763 0.21957773468841763
lambda x: np.sqrt(2 * np.power(x.s, 2) + np.power(x.c, 2))
0.00%
{'HR': 0.5202066801992987, 'EHR': 8.117105748465013, 'normalised recall': 0.21957773468841763, 'nDCG': 0.18870843557582107}
----- 

0.21957773468841763 0.22120075398007014
lambda x: x.s + x.c
0.74%
{'HR': 0.5222365750138402, 'EHR': 8.231851589690388, 'normalised recall': 0.22120075398007014, 'nDCG': 0.18971087070197695}
----- 

0.22120075398007014 0.22202755357946333
lambda x: 2 * x.s + x.c
0.37%
{'HR': 0.5224211109060712, 'EHR': 8.319404352602016, 'normalised recall': 0.2220275535

In [121]:
for i in results:
    print(f"{i[1]['HR']}".replace(".", ","))

print("-"*5)
for i in results:
    print(f"{i[1]['normalised recall']}".replace(".", ","))

print("-"*5)
for i in results:
    print(f"{i[1]['nDCG']}".replace(".", ","))

0,5320169773020853
0,5222365750138402
0,5196530725226056
0,5224211109060712
0,5202066801992987
0,5202066801992987
0,5318324414098542
0,5157778187857538
0,5318324414098542
0,5381066617457095
-----
0,2271792571779517
0,22120075398007014
0,21911639495784002
0,22202755357946333
0,21957773468841763
0,21957773468841763
0,23560416531352313
0,2271903094697794
0,23560416531352313
0,2396081919142624
-----
0,16752891720657953
0,18971087070197695
0,18831996732359677
0,18968150070765238
0,18871293410279336
0,18870843557582107
0,18657894759132407
0,17583235294901658
0,18657894759132407
0,1812220256639104


# Results
sum(rule.s * rule.c) --> 23.020

In [122]:
ur = User_Recommender(user_rules)
hr = hitrate(ur, k, partition_size)

100%|██████████| 5419/5419 [01:21<00:00, 66.75it/s]

avg len: 19.966784
HR@20	0.23325
normalised recall@20	0.06476
EHR@20	4.65732
nDCG@20	0.03922





no filter: 36139; 08554 <br>
25: idem<br>

In [123]:
score_function = lambda x: 2 * x.s + x.c

ar = Assos_Recommender(rules, score_function)
ur = User_Recommender(user_rules)
cr = Combo_Recommender(ar, ur)
hr = hitrate(cr, k, partition_size)

100%|██████████| 5419/5419 [02:06<00:00, 42.98it/s]

avg len: 19.961063
HR@20	0.43753
normalised recall@20	0.17079
EHR@20	8.73366
nDCG@20	0.08909





In [124]:
df_test2 = pickle.load(open(f"pickle_dumps/test_df_{seed_index}.p", "rb"))
df_test2 = df_test2.set_index("user_id").to_dict()["item_id"]

In [125]:
df_test2 = { key: df_test2[key] for key in [69, 420, 42069] }

In [126]:
results2 = hitrate(ar, k, 1, True, df_test2)

100%|██████████| 3/3 [00:00<00:00, 78.95it/s]

avg len: 17.333333
HR@20	0.66667
normalised recall@20	0.15000
EHR@20	11.55556
nDCG@20	0.17391





In [127]:
for prediction in results2["predictions"]:
    items = prediction[0]
    target = prediction[1]
    recommended = prediction[2]
#     print(items)
    print(target)
    print(recommended)
    print("-"*5)

[329]
[4, 7, 18, 9, 33, 14, 65, 3, 10, 5, 37, 67]
-----
[2717 3346 2181 2268  139  353 1263 3556 1788 3058  834  181  385  583
 4590  681  775 2425 2139 3138   89  183 1966 2675 3568 1657 3342 4260
 3405  687  219  508 1617  403  578 4057 2352 3500 4989  997 1618 2800
 2193 2231 1551  195 3994  980 1233 5839 6169 6354   60  924    1 4490
 4222 2365  406   24 1719 4552 1580  732  177  853   98 1802  693  305
 3046 1904 1465  226  452  445 2570  927  107 1164  878  379  281 2203
  322 3848 3066  928  918  193  331  413  234  893  200]
[1, 149, 98, 910, 406, 246, 922, 939, 608, 115, 187, 117, 195, 86, 14, 305, 924, 118, 415, 92]
-----
[1191 1696  829  516 1335 1497  421  730  507   59  889 1553  824 1748
 1939 3929 1635 2248  121 2714  366 1831 3729 3642 1718 3612  482  185
    7 1029 3437  731 1333  142  708  883  464 1397 1947 1268 1522 2399
 1981  787 3129 3412  837  261   22]
[28, 1, 121, 416, 279, 64, 243, 284, 661, 7, 269, 377, 282, 283, 421, 455, 440, 556, 232, 406]
-----
