# E-commerce ranking task

*by Ilya Zakharkin (2017)*

In [285]:
from collections import Counter
from collections import OrderedDict

import numpy as np
import pandas

import heapq

### Counting the popularity of each item (id)

In [286]:
# Popularity tables built from the training sessions: item id (kept as the
# string read from the file) -> number of times it was viewed / purchased.
view_popularity = Counter()
purchase_popularity = Counter()

# Each non-blank line is parsed as "<views>;<purchases>", where both parts are
# comma-separated item ids and the purchases part may be empty.
with open('./coursera_sessions_train.txt', 'r') as f:
    # Iterate the file object directly so lines are streamed instead of being
    # loaded into a list all at once.
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline) is not a session; without
            # this guard the unpacking below would raise ValueError.
            continue
        views, purchases = line.split(';')
        view_popularity.update(views.split(','))
        if purchases:
            purchase_popularity.update(purchases.split(','))

### Models

In [287]:
def recommend_by_purchase(items, max_count):
    """Return up to ``max_count`` distinct items, most purchased first.

    Repeated entries in ``items`` are collapsed to their first occurrence
    before ranking. The ranking key is the item's count in
    ``purchase_popularity``; items absent from it are treated as count 0.
    """
    def purchase_count(item):
        return purchase_popularity.get(item, 0)

    distinct_items = OrderedDict.fromkeys(items)
    return heapq.nlargest(max_count, distinct_items, key=purchase_count)

def recommend_by_view(items, max_count):
    """Return up to ``max_count`` distinct items, most viewed first.

    Repeated entries in ``items`` are collapsed to their first occurrence
    before ranking. The ranking key is the item's count in
    ``view_popularity``; items absent from it are treated as count 0.
    """
    def view_count(item):
        return view_popularity.get(item, 0)

    distinct_items = OrderedDict.fromkeys(items)
    return heapq.nlargest(max_count, distinct_items, key=view_count)

### Applying the models (sortings)

Let's calculate **AverageRecall@1, AveragePrecision@1, AverageRecall@5, AveragePrecision@5** on both the **training** and the **test** sets, ranking by **view counts** and by **purchase counts**:

In [288]:
# def process_sample_with_model(X, model):
#     '''Applies model "model" to sample X and returns results of calculating default recommender metrics.
    
#     '''
#     if model == 'views':
#         counter = views_counter
#     elif model == 'purchases':
#         counter = purchases_counter
#     else:
#         return 'Wrong model parameter. Available are: "views", "purchases"'
    
#     metrics = [recall_k, precision_k]
#     k_values = [1, 5]
    
#     metrics_dict = defaultdict(lambda : defaultdict(list))

#     for id_view_list, id_purch_list in list(zip(X[0], X[1])):
#         view_ids = list(set(filter(lambda x: x != 'nan', str(id_view_list).split(','))))
#         purch_ids = list(filter(lambda x: x != 'nan', str(id_purch_list).split(',')))

#         pred_ids = sorted(view_ids, key=lambda x: counter[x], reverse=True)

#         if len(purch_ids) != 0:
#             for metric in metrics:
#                 for k in k_values:
#                         metrics_dict[metric][k].append(metric(k, pred_ids, purch_ids, len(view_ids)))
    
#     for metric_key in metrics_dict:
#         for k in k_values:
#             metrics_dict[metric_key][k] = round(np.mean(metrics_dict[metric_key][k]), 2)
    
#     return metrics_dict

In [289]:
def calculate_metrics(recommend, max_count, sessions_file):
    """Compute average recall@k and precision@k for k = 1..max_count.

    Only sessions that contain at least one purchase contribute to the
    averages. Blank lines in the file are ignored.

    Parameters
    ----------
    recommend : callable
        Called as ``recommend(views, max_count)`` with the session's viewed
        item ids (list of str, in file order); must return an iterable of
        recommended item ids, best first. Anything past the first
        ``max_count`` items is ignored.
    max_count : int
        Largest k to evaluate.
    sessions_file : str
        Path to a text file with one session per line, formatted as
        ``<views>;<purchases>`` where both parts are comma-separated item ids
        and the purchases part may be empty.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``k`` (1..max_count) with columns ``avg_recall@k`` and
        ``avg_precision@k``, each rounded to 2 decimals. All values are NaN
        when the file has no session with purchases.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly one ``;``.
    """
    recall_sum = np.zeros(max_count)
    precision_sum = np.zeros(max_count)
    # Denominators for precision@k; loop-invariant, so built once.
    ks = np.arange(max_count) + 1
    sessions_count = 0

    with open(sessions_file, 'r') as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no session.
                continue
            views, purchases = line.split(';')
            if not purchases:
                continue

            purchased = set(purchases.split(','))
            rec = recommend(views.split(','), max_count)

            # hits[i] is 1.0 when the (i+1)-th recommendation was purchased.
            # The vector is zero-padded when fewer than max_count items are
            # recommended, so precision@k is still divided by k in that case.
            rec_hits = [item in purchased for item in rec][:max_count]
            hits = np.zeros(max_count)
            hits[:len(rec_hits)] = rec_hits

            cumulative_hits = np.cumsum(hits)
            recall_sum += cumulative_hits / len(purchased)
            precision_sum += cumulative_hits / ks
            sessions_count += 1

    if sessions_count:
        avg_recall = recall_sum / sessions_count
        avg_precision = precision_sum / sessions_count
    else:
        # No session to average over: report NaN explicitly rather than
        # through a 0/0 division that emits a RuntimeWarning.
        avg_recall = np.full(max_count, np.nan)
        avg_precision = np.full(max_count, np.nan)

    return pandas.DataFrame({
            'k': ks,
            'avg_recall@k': [round(x, 2) for x in avg_recall],
            'avg_precision@k': [round(x, 2) for x in avg_precision]
    }).set_index('k')


#### Ranking by view popularity (training and test sets)

In [290]:
# Evaluate the view-popularity ranking (recommend_by_view) for k = 1..5 on the
# training and on the test sessions. The popularity counters themselves are
# built from the training file only.
views_popularity_train = calculate_metrics(recommend_by_view, 5, './coursera_sessions_train.txt')
views_popularity_test = calculate_metrics(recommend_by_view, 5, './coursera_sessions_test.txt')

#### Ranking by purchase popularity (training and test sets)

In [291]:
# Evaluate the purchase-popularity ranking (recommend_by_purchase) for
# k = 1..5 on the training and on the test sessions. The popularity counters
# themselves are built from the training file only.
purchases_popularity_train = calculate_metrics(recommend_by_purchase, 5, './coursera_sessions_train.txt')
purchases_popularity_test = calculate_metrics(recommend_by_purchase, 5, './coursera_sessions_test.txt')

### Results

In [295]:
def write_and_print_metrics(filename, df):
    """Write four metric values from ``df`` to ``filename`` as one line.

    The values are taken positionally from the ``avg_recall@k`` and
    ``avg_precision@k`` columns: recall and precision of the first row,
    then recall and precision of the fifth row. Each is rounded to 2
    decimals and the four are joined by single spaces, with no trailing
    newline. The file is overwritten. Despite the function's name, nothing
    is printed.
    """
    with open(filename, 'w') as out:
        recall = df['avg_recall@k'].values
        precision = df['avg_precision@k'].values
        # Row positions 0 and 4; for each row recall comes before precision.
        picked = [
            round(column[row], 2)
            for row in (0, 4)
            for column in (recall, precision)
        ]
        out.write('{0} {1} {2} {3}'.format(*picked))

In [296]:
# Save recall/precision at k=1 and k=5 of the view-popularity ranking on the training sessions.
write_and_print_metrics('./views_popularity_train.txt', views_popularity_train)

In [297]:
# Save recall/precision at k=1 and k=5 of the view-popularity ranking on the test sessions.
write_and_print_metrics('./views_popularity_test.txt', views_popularity_test)

In [298]:
# Save recall/precision at k=1 and k=5 of the purchase-popularity ranking on the training sessions.
write_and_print_metrics('./purchases_popularity_train.txt', purchases_popularity_train)

In [299]:
# Save recall/precision at k=1 and k=5 of the purchase-popularity ranking on the test sessions.
write_and_print_metrics('./purchases_popularity_test.txt', purchases_popularity_test)