## 0. Import packages

In [1]:
import pandas as pd
import gzip
import math
from tqdm import tqdm
from itertools import chain, combinations
from Association import Association
from math import sqrt
tqdm.pandas() #for progres_apply etc.

## 1. Load data from previous step

In [2]:
import pickle
train_df = pickle.load(open("pickle_dumps/train_df.p", "rb"))
article_train_df = pickle.load(open("pickle_dumps/article_train_df.p", "rb"))
article_test_df = pickle.load(open("pickle_dumps/article_test_df.p", "rb"))

In [3]:
train_df

Unnamed: 0,user_id,item_id,recommend
642076,642076,"[4706, 5624]","[True, True]"
703220,703220,"[5400, 7362]","[True, True]"
549051,549051,"[4105, 11396]","[True, True]"
106660,106660,"[577, 691, 3290, 5662, 6179, 8028, 9277, 10476...","[True, True, True, True, True, True, True, Tru..."
196155,196155,"[1215, 4579, 7964]","[True, True, True]"
...,...,...,...
1062408,1062408,"[9362, 9647]","[True, True]"
384467,384467,"[1622, 7052, 13997]","[True, True, True]"
261064,261064,"[1450, 5041, 10264]","[True, True, True]"
52587,52587,"[380, 7063]","[True, True]"


In [4]:
data = train_df.set_index("user_id").to_dict()["item_id"]
article_data = article_train_df.set_index("item_id").to_dict()["user_id"]

In [5]:
def eclat(prefix, minsup, items):
    frequents = []
    while items:
        i,itids = items.pop()
        isupp = len(itids)
        if isupp >= minsup:
            frequents.append((frozenset(prefix + [i]), isupp))
            suffix = [] 
            for j, ojtids in items:
                jtids = set(itids) & set(ojtids)
                if len(jtids) >= minsup:
                    suffix.append((j,jtids))
            frequents.extend(eclat(prefix+[i], minsup, sorted(suffix, key=lambda item: len(item[1]), reverse=True)))
    return frequents

In [6]:
# freq = sorted(frequent_itemsets, key=lambda x: 1000*len(x[0]) + x[1], reverse=True)

In [7]:
def subsets(itemset, include_empty_set=False):
    """ List all strict subsets of an itemset without the empty set or with the empty set if include_empty_set=True
        subsets({1,2,3}) --> [{1}, {2}, {3}, {1, 2}, {1, 3}, {2, 3}]
    """
    s = list(itemset)
    if include_empty_set:
        return map(set, chain.from_iterable(combinations(s, r) for r in range(0, len(s) + 1)))
    return map(set, chain.from_iterable(combinations(s, r) for r in range(1, len(s))))

In [8]:
def deriveRules(itemsets, minconf):
    """ Returns all rules with conf >= minconf that can be derived from the itemsets.
        Return: list of association rules in the format: [(antecedent, consequent, supp, conf), ...]
    """
    search_items = dict(itemsets)
    rules = []
    for i, (item_set, supp) in enumerate(itemsets):
        if len(item_set) > 1:
            for subset in subsets(item_set):  # for each subset generate a rule
                antecedent = frozenset(subset)
                consequent = item_set - subset
                if len(consequent) != 0:                        
                    conf = supp / search_items[antecedent]
                    if conf >= minconf:
                        rules.append(Association(antecedent, consequent, conf, supp))

    return rules

In [9]:
def genRules(min_conf, minsup, data):
    print("---Mining frequent itemsets---")
    frequent_itemsets = eclat([], minsup, sorted(data.items(), key=lambda item: len(item[1]), reverse=True))
    print("---Generating rules---")
    return deriveRules(frequent_itemsets, min_conf)

In [10]:
%%time

print(len(data))
article_assos_rules = genRules(0.0, 20, data)

438742
---Mining frequent itemsets---
---Generating rules---
Wall time: 58.9 s


In [11]:
len(article_assos_rules)

9226

In [12]:
%%time

# user_assos_rules = genRules(0.0, 20, article_data)

rules_split = []
print(len(article_data))
for i in range(4):
    print(f"Mining rules for split {i}")
    tmp = article_data
    for _, article in article_test_df.iterrows():
        try:
            tmp[article["item_id"]] = article["test_split"][i][0]
        except:
            tmp[article["item_id"]] = article["user_id"]
    r = genRules(0.0,20,tmp)
    rules_split.append(r)    


12860
Mining rules for split 0
---Mining frequent itemsets---
---Generating rules---
Mining rules for split 1
---Mining frequent itemsets---
---Generating rules---
Mining rules for split 2
---Mining frequent itemsets---
---Generating rules---
Mining rules for split 3
---Mining frequent itemsets---
---Generating rules---
Wall time: 34min 26s


In [13]:
for i in range(4):
    print(len(rules_split[i]))

129204
129380
129810
129692


In [14]:
# user_assos_rules = genRules(0.0, 20, article_data)

"""
train: [
a: 1,2,3
b: 2,3,4
c: 1,5,7
]

test: [
d: 2,4,8
e: 1,3,7
]

rules voor train + 1e block van test -> maak recomm met als test test-1e block
rules voor train + 2e block van test -> maak recomm met als test test-2e block
...

"""

'\ntrain: [\na: 1,2,3\nb: 2,3,4\nc: 1,5,7\n]\n\ntest: [\nd: 2,4,8\ne: 1,3,7\n]\n\nrules voor train + 1e block van test -> maak recomm met als test test-1e block\nrules voor train + 2e block van test -> maak recomm met als test test-2e block\n...\n\n'

# normalize support / confidence to better use it later

In [15]:
minc = 1
maxc = 0

mins = 9999999999999
maxs = -9999999999999
for a in article_assos_rules:
    if a.c < minc:
        minc = a.c
    if a.c > maxc:
        maxc = a.c
    
    if a.s < mins:
        mins = a.s
    if a.s > maxs:
        maxs = a.s
            
for a in article_assos_rules:
    a.c = (a.c - minc) / (maxc - minc)
    a.s = (a.s - mins) / (maxs - mins)
    # temp score that's just lhs + weighted average of c, s
    a.score = len(a.left) + sqrt(pow(a.c, 2) + pow(a.s, 2))

article_assos_rules = sorted(article_assos_rules, key = lambda x: x.score, reverse=True)
for i in article_assos_rules:
    print(i)

Conf: 0.96	Supp: 26.00	 {1025, 2954, 132} => {7}
Conf: 0.95	Supp: 20.00	 {562, 20396, 23214} => {7}
Conf: 0.93	Supp: 27.00	 {1025, 3, 132} => {7}
Conf: 0.92	Supp: 34.00	 {1025, 171, 132} => {7}
Conf: 0.92	Supp: 23.00	 {2219, 14364, 7} => {1025}
Conf: 0.92	Supp: 23.00	 {1025, 3, 2219} => {7}
Conf: 0.92	Supp: 23.00	 {3, 2219, 7} => {1025}
Conf: 0.92	Supp: 23.00	 {1025, 3, 3327} => {7}
Conf: 0.92	Supp: 22.00	 {129, 2219, 1025} => {7}
Conf: 0.91	Supp: 21.00	 {1025, 23047, 7} => {132}
Conf: 0.91	Supp: 20.00	 {1025, 562, 3} => {7}
Conf: 0.91	Supp: 20.00	 {1025, 132, 7525} => {7}
Conf: 0.91	Supp: 20.00	 {132, 8758, 7} => {1025}
Conf: 0.88	Supp: 23.00	 {1025, 17084, 132} => {7}
Conf: 0.88	Supp: 36.00	 {1025, 132, 3327} => {7}
Conf: 0.88	Supp: 21.00	 {1025, 132, 23047} => {7}
Conf: 0.88	Supp: 21.00	 {3, 132, 3327} => {7}
Conf: 0.88	Supp: 21.00	 {1025, 3602, 132} => {7}
Conf: 0.87	Supp: 20.00	 {20396, 23214, 7} => {562}
Conf: 0.87	Supp: 20.00	 {132, 7525, 7} => {1025}
Conf: 0.87	Supp: 20.00	 {22

Conf: 0.39	Supp: 23.00	 {14268} => {3327}
Conf: 0.34	Supp: 71.00	 {9833} => {171}
Conf: 0.39	Supp: 31.00	 {32111} => {8758}
Conf: 0.39	Supp: 31.00	 {75017} => {7}
Conf: 0.39	Supp: 21.00	 {25941} => {25044}
Conf: 0.39	Supp: 21.00	 {21475} => {562}
Conf: 0.39	Supp: 21.00	 {19814} => {7}
Conf: 0.39	Supp: 21.00	 {32368} => {7}
Conf: 0.39	Supp: 21.00	 {7689} => {171}
Conf: 0.39	Supp: 21.00	 {55373} => {1334}
Conf: 0.39	Supp: 21.00	 {3905} => {16}
Conf: 0.39	Supp: 26.00	 {63979} => {20911}
Conf: 0.39	Supp: 26.00	 {63979} => {129}
Conf: 0.39	Supp: 26.00	 {67697} => {562}
Conf: 0.39	Supp: 29.00	 {54351} => {1025}
Conf: 0.39	Supp: 29.00	 {3788} => {7}
Conf: 0.38	Supp: 36.00	 {23108} => {7}
Conf: 0.37	Supp: 51.00	 {3602} => {7}
Conf: 0.39	Supp: 24.00	 {10766} => {171}
Conf: 0.38	Supp: 33.00	 {14364} => {132}
Conf: 0.38	Supp: 33.00	 {14364} => {1025, 7}
Conf: 0.38	Supp: 44.00	 {8568} => {171}
Conf: 0.38	Supp: 44.00	 {23214} => {3327}
Conf: 0.39	Supp: 27.00	 {94381} => {171}
Conf: 0.39	Supp: 22.00

Conf: 0.32	Supp: 24.00	 {54351} => {562}
Conf: 0.32	Supp: 24.00	 {3788} => {132}
Conf: 0.32	Supp: 35.00	 {20248} => {7}
Conf: 0.32	Supp: 35.00	 {7525} => {171}
Conf: 0.32	Supp: 32.00	 {37157} => {171}
Conf: 0.32	Supp: 23.00	 {35049} => {562}
Conf: 0.32	Supp: 23.00	 {35049} => {1025}
Conf: 0.32	Supp: 23.00	 {4584} => {7}
Conf: 0.28	Supp: 60.00	 {10615} => {7}
Conf: 0.31	Supp: 43.00	 {8758} => {23214}
Conf: 0.31	Supp: 43.00	 {8758} => {3327}
Conf: 0.32	Supp: 31.00	 {49786} => {129}
Conf: 0.32	Supp: 31.00	 {49786} => {2954}
Conf: 0.32	Supp: 22.00	 {10583} => {25044}
Conf: 0.32	Supp: 22.00	 {20667} => {1025}
Conf: 0.32	Supp: 27.00	 {18271} => {1025}
Conf: 0.32	Supp: 21.00	 {5364} => {5609}
Conf: 0.32	Supp: 21.00	 {5364} => {1334}
Conf: 0.32	Supp: 21.00	 {76443} => {33863}
Conf: 0.32	Supp: 21.00	 {76443} => {562}
Conf: 0.32	Supp: 21.00	 {13971} => {2954}
Conf: 0.32	Supp: 21.00	 {2191} => {171}
Conf: 0.29	Supp: 53.00	 {22512} => {7}
Conf: 0.32	Supp: 30.00	 {21207} => {7}
Conf: 0.32	Supp: 30.

Conf: 0.23	Supp: 22.00	 {21218} => {18088}
Conf: 0.23	Supp: 22.00	 {21218} => {11788}
Conf: 0.17	Supp: 61.00	 {220} => {7}
Conf: 0.23	Supp: 28.00	 {428} => {1334}
Conf: 0.22	Supp: 38.00	 {13300} => {1334}
Conf: 0.23	Supp: 26.00	 {19666} => {23214}
Conf: 0.18	Supp: 59.00	 {586} => {1695}
Conf: 0.18	Supp: 59.00	 {586} => {7}
Conf: 0.23	Supp: 29.00	 {61} => {1334}
Conf: 0.23	Supp: 21.00	 {68365} => {34780}
Conf: 0.23	Supp: 21.00	 {68365} => {35845}
Conf: 0.23	Supp: 21.00	 {68365} => {3602}
Conf: 0.23	Supp: 21.00	 {31390} => {1334}
Conf: 0.23	Supp: 21.00	 {8305} => {1025}
Conf: 0.23	Supp: 21.00	 {8305} => {171}
Conf: 0.06	Supp: 77.00	 {7} => {5010}
Conf: 0.06	Supp: 77.00	 {7} => {1025, 132}
Conf: 0.23	Supp: 23.00	 {10384} => {1025, 7}
Conf: 0.23	Supp: 23.00	 {6497} => {9436}
Conf: 0.23	Supp: 23.00	 {6497} => {1353}
Conf: 0.23	Supp: 23.00	 {2966} => {1025}
Conf: 0.23	Supp: 23.00	 {2081} => {171}
Conf: 0.23	Supp: 25.00	 {3} => {20911}
Conf: 0.23	Supp: 25.00	 {3} => {8758}
Conf: 0.23	Supp: 25

Conf: 0.12	Supp: 21.00	 {11788} => {1356}
Conf: 0.12	Supp: 21.00	 {11788} => {1025, 562}
Conf: 0.11	Supp: 32.00	 {3897} => {3429}
Conf: 0.11	Supp: 32.00	 {3897} => {2089}
Conf: 0.08	Supp: 41.00	 {2954} => {1025, 7}
Conf: 0.12	Supp: 20.00	 {5887} => {5662}
Conf: 0.12	Supp: 20.00	 {5887} => {48}
Conf: 0.11	Supp: 30.00	 {129} => {2219, 7}
Conf: 0.12	Supp: 20.00	 {13300} => {197172}
Conf: 0.12	Supp: 20.00	 {13300} => {76443}
Conf: 0.12	Supp: 20.00	 {13300} => {63979}
Conf: 0.12	Supp: 20.00	 {13300} => {35845}
Conf: 0.12	Supp: 20.00	 {13300} => {25484}
Conf: 0.12	Supp: 20.00	 {13300} => {10384}
Conf: 0.12	Supp: 20.00	 {13300} => {2156}
Conf: 0.12	Supp: 20.00	 {13300} => {20248}
Conf: 0.12	Supp: 20.00	 {13300} => {57443}
Conf: 0.12	Supp: 20.00	 {13300} => {4705, 3327}
Conf: 0.12	Supp: 20.00	 {13300} => {4705, 7}
Conf: 0.12	Supp: 20.00	 {13300} => {5418}
Conf: 0.12	Supp: 20.00	 {13300} => {10615, 7}
Conf: 0.12	Supp: 20.00	 {13300} => {1025, 132}
Conf: 0.12	Supp: 21.00	 {22512} => {9073}
Conf:

Conf: 0.09	Supp: 20.00	 {10615} => {5418}
Conf: 0.09	Supp: 20.00	 {10615} => {13300, 7}
Conf: 0.09	Supp: 25.00	 {129} => {3228}
Conf: 0.09	Supp: 25.00	 {129} => {6305}
Conf: 0.09	Supp: 25.00	 {129} => {3153}
Conf: 0.09	Supp: 25.00	 {129} => {26749}
Conf: 0.09	Supp: 25.00	 {129} => {5111}
Conf: 0.09	Supp: 25.00	 {129} => {1356}
Conf: 0.09	Supp: 25.00	 {129} => {13300, 7}
Conf: 0.09	Supp: 25.00	 {129} => {447}
Conf: 0.07	Supp: 36.00	 {16} => {8689}
Conf: 0.07	Supp: 36.00	 {16} => {9833}
Conf: 0.09	Supp: 25.00	 {48} => {22767}
Conf: 0.09	Supp: 25.00	 {48} => {18088}
Conf: 0.09	Supp: 25.00	 {48} => {22512}
Conf: 0.08	Supp: 30.00	 {220} => {110852}
Conf: 0.08	Supp: 30.00	 {220} => {910}
Conf: 0.07	Supp: 35.00	 {2954} => {10615}
Conf: 0.09	Supp: 26.00	 {70} => {3897}
Conf: 0.09	Supp: 23.00	 {5239} => {1356}
Conf: 0.09	Supp: 23.00	 {5239} => {2089}
Conf: 0.09	Supp: 27.00	 {5010} => {1356}
Conf: 0.07	Supp: 35.00	 {79} => {1334, 7}
Conf: 0.09	Supp: 24.00	 {5609} => {5359}
Conf: 0.09	Supp: 24.00

Conf: 0.04	Supp: 33.00	 {1334} => {15212}
Conf: 0.04	Supp: 33.00	 {1334} => {105}
Conf: 0.04	Supp: 33.00	 {1334} => {7525}
Conf: 0.04	Supp: 33.00	 {1334} => {22767}
Conf: 0.04	Supp: 33.00	 {1334} => {1931}
Conf: 0.04	Supp: 33.00	 {1334} => {13822}
Conf: 0.04	Supp: 33.00	 {1334} => {8758}
Conf: 0.04	Supp: 33.00	 {1334} => {4426}
Conf: 0.04	Supp: 32.00	 {171} => {35319}
Conf: 0.04	Supp: 32.00	 {171} => {9821}
Conf: 0.04	Supp: 32.00	 {171} => {31080}
Conf: 0.04	Supp: 32.00	 {171} => {16570}
Conf: 0.04	Supp: 32.00	 {171} => {75418}
Conf: 0.04	Supp: 32.00	 {171} => {37157}
Conf: 0.04	Supp: 32.00	 {171} => {16319}
Conf: 0.04	Supp: 32.00	 {171} => {14072}
Conf: 0.04	Supp: 32.00	 {171} => {31035}
Conf: 0.04	Supp: 32.00	 {171} => {3602}
Conf: 0.04	Supp: 32.00	 {171} => {3548}
Conf: 0.04	Supp: 32.00	 {171} => {6681}
Conf: 0.04	Supp: 32.00	 {171} => {13300, 7}
Conf: 0.04	Supp: 32.00	 {171} => {562, 7}
Conf: 0.07	Supp: 21.00	 {132} => {1025, 23047, 7}
Conf: 0.07	Supp: 21.00	 {132} => {87318}
Conf:

# ??. Store the association rules as pickle file for further use

In [17]:
import pickle
pickle.dump(article_assos_rules, open("pickle_dumps/article_rules.p", "wb"))
pickle.dump(rules_split, open("pickle_dumps/user_rules.p", "wb"))