In [1]:
from collections import OrderedDict
from collections import Counter
import numpy as np
import pandas as pd
import heapq

In [19]:
# Item popularity statistics gathered from train.txt.
# Each line is split as "<visits>;<purchases>", both parts being comma-separated
# item ids; the purchases part is empty when the session has no purchase.
visit_popularity = Counter()     # item id -> occurrences in the visits field
purchase_popularity = Counter()  # item id -> occurrences in the purchases field

with open('train.txt', 'r') as f:
    # Iterate over the file object directly so the whole file is never
    # materialised in memory (unlike f.readlines()).
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line has no ';' and would break the
            # two-way unpacking below.
            continue
        visits, purchases = line.split(';')
        # ''.split(',') yields [''], so an empty field has to be skipped
        # explicitly or the empty string would be counted as an item.
        if visits:
            visit_popularity.update(visits.split(','))
        if purchases:
            purchase_popularity.update(purchases.split(','))



In [21]:
def recommend_by_purchase(items, max_count):
    """Return up to ``max_count`` distinct items, most purchased first.

    Duplicates in ``items`` are dropped (first occurrence kept) and the
    remaining candidates are ranked by their count in the module-level
    ``purchase_popularity`` mapping; items missing from it count as 0.
    """
    def purchase_count(item):
        return purchase_popularity.get(item, 0)

    candidates = OrderedDict.fromkeys(items)
    return heapq.nlargest(max_count, candidates, key=purchase_count)

def recommend_by_visit(items, max_count):
    """Return up to ``max_count`` distinct items, most visited first.

    Duplicates in ``items`` are dropped (first occurrence kept) and the
    remaining candidates are ranked by their count in the module-level
    ``visit_popularity`` mapping; items missing from it count as 0.
    """
    def visit_count(item):
        return visit_popularity.get(item, 0)

    candidates = OrderedDict.fromkeys(items)
    return heapq.nlargest(max_count, candidates, key=visit_count)

In [28]:
def calculate_metrics(recommend, max_count, sessions_file):
    """Compute average recall@k and precision@k for k = 1..max_count.

    Parameters
    ----------
    recommend : callable
        Called as ``recommend(visits, max_count)`` with the list of visited
        item ids of one session; must return an ordered iterable of
        recommended item ids. Anything past the first ``max_count`` entries
        is ignored.
    max_count : int
        Largest cut-off k to evaluate.
    sessions_file : str
        Path to a text file with one session per line in the form
        ``<visits>;<purchases>`` (comma-separated ids). Blank lines and
        sessions without purchases are skipped.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``k`` with columns ``avg_recall@k`` and
        ``avg_precision@k``, rounded to two decimals. When the file contains
        no session with purchases, both columns are NaN.

    Notes
    -----
    Precision@k always divides by k, even when the recommender returned fewer
    than k items. Hits are counted per recommended position, so a recommender
    that returns the same item twice is counted twice.
    """
    ranks = np.arange(max_count) + 1  # k values; loop-invariant
    recall_sum = np.zeros(max_count)
    precision_sum = np.zeros(max_count)
    sessions_count = 0

    with open(sessions_file, 'r') as f:
        # Stream the file instead of loading it whole with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no ';' and would break the unpacking.
                continue
            visits, purchases = line.split(';')
            if purchases == '':
                continue

            visits = visits.split(',')
            purchases = set(purchases.split(','))

            # Truncate so an over-eager recommender cannot overflow ``hits``.
            rec = list(recommend(visits, max_count))[:max_count]

            # hits[i] is 1 when the (i+1)-th recommendation was purchased;
            # positions beyond the returned recommendations stay 0.
            hits = np.zeros(max_count)
            hits[:len(rec)] = [item in purchases for item in rec]
            cumulative_hits = np.cumsum(hits)

            sessions_count += 1
            recall_sum += cumulative_hits / len(purchases)
            precision_sum += cumulative_hits / ranks

    if sessions_count:
        avg_recall = recall_sum / sessions_count
        avg_precision = precision_sum / sessions_count
    else:
        # No evaluable session: the averages are undefined. Report NaN
        # explicitly rather than relying on a 0/0 division warning.
        avg_recall = np.full(max_count, np.nan)
        avg_precision = np.full(max_count, np.nan)

    return pd.DataFrame({
        'k': ranks,
        'avg_recall@k': [round(x, 2) for x in avg_recall],
        'avg_precision@k': [round(x, 2) for x in avg_precision],
    }).set_index('k')

In [29]:
# Purchase-popularity ranking scored on train.txt for k = 1..5. This is an
# in-sample figure: the popularity counters were built from this same file.
calculate_metrics(
    recommend=recommend_by_purchase,
    max_count=5,
    sessions_file='train.txt',
)

Unnamed: 0_level_0,avg_recall@k,avg_precision@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.69,0.8
2,0.84,0.53
3,0.89,0.39
4,0.91,0.31
5,0.93,0.25


In [31]:
# Purchase-popularity ranking scored on test.txt for k = 1..5; the popularity
# counters were built from train.txt only.
calculate_metrics(
    recommend=recommend_by_purchase,
    max_count=5,
    sessions_file='test.txt',
)

Unnamed: 0_level_0,avg_recall@k,avg_precision@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.46,0.53
2,0.64,0.38
3,0.73,0.3
4,0.79,0.25
5,0.82,0.21


In [32]:
# Visit-popularity ranking scored on train.txt for k = 1..5. This is an
# in-sample figure: the popularity counters were built from this same file.
calculate_metrics(
    recommend=recommend_by_visit,
    max_count=5,
    sessions_file='train.txt',
)

Unnamed: 0_level_0,avg_recall@k,avg_precision@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.44,0.51
2,0.63,0.38
3,0.73,0.3
4,0.79,0.25
5,0.82,0.21


In [33]:
# Visit-popularity ranking scored on test.txt for k = 1..5; the popularity
# counters were built from train.txt only.
calculate_metrics(
    recommend=recommend_by_visit,
    max_count=5,
    sessions_file='test.txt',
)

Unnamed: 0_level_0,avg_recall@k,avg_precision@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.42,0.48
2,0.6,0.36
3,0.7,0.29
4,0.76,0.24
5,0.8,0.2
