In [1]:
from math import ceil
import pandas as pd
import numpy as np
import glob
import re

In [2]:
# Per-dataset timing CSVs for the simdex runs (Netflix, KDD, R2).
simdex_netflix_df, simdex_kdd_df, simdex_r2_df = map(pd.read_csv, (
    'timing-results/netflix-simdex-timing.csv',
    'timing-results/kdd-simdex-timing.csv',
    'timing-results/r2-simdex-timing.csv',
))

# Per-dataset timing CSVs for the blocked_mm runs (same dataset order).
blocked_mm_netflix_df, blocked_mm_kdd_df, blocked_mm_r2_df = map(pd.read_csv, (
    'timing-results/netflix-blocked_mm-timing.csv',
    'timing-results/kdd-blocked_mm-timing.csv',
    'timing-results/r2-blocked_mm-timing.csv',
))

# One combined frame per method; rows keep their per-file index values.
simdex_df = pd.concat([simdex_netflix_df, simdex_kdd_df, simdex_r2_df])
blocked_mm_df = pd.concat([blocked_mm_netflix_df, blocked_mm_kdd_df, blocked_mm_r2_df])


In [3]:
def _min_comp_time(timing_df, model, K):
    """Return the smallest ``comp_time`` among rows of ``timing_df`` for ``model`` and ``K``.

    Rows are selected with a boolean mask instead of a formatted query string,
    so a model name containing quote characters cannot break the lookup.
    The result is NaN when no row matches.
    """
    matches = timing_df[(timing_df['model'] == model) & (timing_df['K'] == K)]
    return matches['comp_time'].min()


# Return stats DataFrame, also saved as a CSV in decision-rule-K-<K>.csv
def generate_decision_rule_table(stats_dir, K=1, block_size=4096,
                                 mm_timings=None, simdex_timings=None):
    """Build the per-model decision-rule table for a given ``K``.

    Every file in ``stats_dir`` matching ``*_user_stats_K-<K>_*csv`` yields one
    row with: the model name (file name minus the ``_user_stats_K-<K>_<digits>.csv``
    suffix), the mean of ``num_items_visited`` after rounding each value up to
    a multiple of ``block_size``, the dataset's user and item counts, and the
    minimum ``comp_time`` found for that model/K in each timing frame.

    Parameters
    ----------
    stats_dir : str
        Directory holding the per-user stats CSVs; a trailing '/' is optional.
    K : int
        The K value to select, both in file names and in the timing frames.
    block_size : int
        Granularity used when rounding ``num_items_visited`` up.
    mm_timings, simdex_timings : DataFrame or None
        Frames with ``model``, ``K`` and ``comp_time`` columns. When None, the
        notebook-level ``blocked_mm_df`` / ``simdex_df`` frames are used.

    Returns
    -------
    DataFrame
        The table as read back from ``decision-rule-K-<K>.csv``, which is
        written to the current working directory.

    Raises
    ------
    ValueError
        If a stats file path names none of the known datasets.
    """
    # (substring looked for in the stats file path, num_users, num_items),
    # tested in this order.
    # NOTE(review): KDD Cup 2011 is commonly cited with 1,000,990 users and
    # 624,961 items -- confirm the KDD constants below.
    dataset_dims = (
        ('Netflix', 480189, 17770),
        ('R2', 1823179, 136736),
        ('KDD', 100990, 626961),
    )

    if mm_timings is None:
        mm_timings = blocked_mm_df
    if simdex_timings is None:
        simdex_timings = simdex_df

    # An empty stats_dir means "current directory" and must stay empty.
    if stats_dir and not stats_dir.endswith('/'):
        stats_dir += '/'

    csv_fname = 'decision-rule-K-%d.csv' % K
    suffix_re = re.compile(r'_user_stats_K-%d_\d+\.csv' % K)
    pattern = '%s*_user_stats_K-%d_*csv' % (stats_dir, K)

    # Collect every row before touching the output file, so a failure halfway
    # through cannot leave a truncated CSV behind.
    rows = []
    # Sorted so the row order does not depend on directory listing order.
    for stats_path in sorted(glob.iglob(pattern)):
        for marker, num_users, num_items in dataset_dims:
            if marker in stats_path:
                break
        else:
            # Fail loudly instead of reusing the previous file's dimensions.
            raise ValueError(
                'cannot infer dataset (Netflix, R2 or KDD) from %r' % stats_path)

        stats = pd.read_csv(stats_path)
        visited = stats['num_items_visited'].values
        rounded_up = np.ceil(visited / float(block_size)) * block_size
        avg_num_items_visited = np.mean(rounded_up)

        model = suffix_re.sub('', stats_path[len(stats_dir):])
        mm_time = _min_comp_time(mm_timings, model, K)
        simdex_time = _min_comp_time(simdex_timings, model, K)
        # %d truncates the averaged count to an integer in the CSV.
        rows.append('%s,%d,%d,%d,%f,%f' % (
            model, avg_num_items_visited, num_users, num_items,
            mm_time, simdex_time))

    with open(csv_fname, 'w') as csv_out:
        csv_out.write('model,avg_num_items_visited,num_users,num_items,mm_time,simdex_time\n')
        for row in rows:
            csv_out.write(row + '\n')
    return pd.read_csv(csv_fname)

In [4]:
# Build the K = 10 decision-rule table from the stats directory and show it.
data = generate_decision_rule_table(stats_dir='decision-rule-with-K/', K=10)
data

Unnamed: 0,model,avg_num_items_visited,num_users,num_items,mm_time,simdex_time
0,lemp-paper-KDD-50,154009,100990,626961,3959.36,10160.3
1,lemp-paper-Netflix-50,8379,480189,17770,40.8598,27.1678
2,lemp-paper-Netflix-noav-100,15616,480189,17770,56.8851,85.5322
3,lemp-paper-Netflix-noav-10,5358,480189,17770,28.2434,8.82394
4,lemp-paper-Netflix-noav-50,11502,480189,17770,40.8502,45.3343
5,nomad-KDD-10-reg-1,77221,100990,626961,1790.98,556.695
6,nomad-KDD-100-reg-1,408039,100990,626961,4646.53,9999999.0
7,nomad-KDD-25-reg-0.001,254726,100990,626961,2023.16,6910.16
8,nomad-KDD-50-reg-1,388984,100990,626961,3416.58,24487.1
9,nomad-Netflix-10-reg-0.05,5623,480189,17770,25.0189,9.19001


In [5]:
def decision_rule(data, BLOCK_SIZE=4096):
    """Print one summary tuple per model, ordered by visited-item ratio.

    For each row of ``data`` the tuple ``(ratio, simdex_faster, model, delta)``
    is printed, in ascending order of ``ratio``, where

    * ``ratio`` is ``(avg_num_items_visited - BLOCK_SIZE) / (num_items - BLOCK_SIZE)``,
    * ``simdex_faster`` is True when ``mm_time`` exceeds ``simdex_time``,
    * ``delta`` is ``mm_time - simdex_time`` (positive exactly when
      ``simdex_faster`` is True).

    Parameters
    ----------
    data : DataFrame
        Must provide the columns ``model``, ``avg_num_items_visited``,
        ``num_items``, ``mm_time`` and ``simdex_time``.
    BLOCK_SIZE : int
        Value subtracted from both the visited-item count and the item count
        before taking their ratio.

    Returns
    -------
    None
        The tuples are only printed.
    """
    # A larger mm_time means the simdex run was the quicker of the two.
    simdex_faster = data["mm_time"] > data["simdex_time"]
    visited = data["avg_num_items_visited"]
    num_items = data['num_items']
    ratio = (visited - BLOCK_SIZE) / (num_items - BLOCK_SIZE)
    delta = data["mm_time"] - data["simdex_time"]

    # sorted() accepts any iterable, so this does not rely on zip() returning
    # a list.
    labeled = sorted(zip(ratio, simdex_faster, data["model"], delta),
                     key=lambda row: row[0])
    for row in labeled:
        # Single parenthesised argument: prints the tuple's repr whether
        # print is a statement or a function.
        print(row)

In [6]:
# Print, for each row of `data`, a (ratio, mm_time > simdex_time, model,
# mm_time - simdex_time) tuple, sorted by ratio (default BLOCK_SIZE).
decision_rule(data)

(0.0, True, 'nomad-R2-100-reg-0.001', 1564.4826)
(0.043222255729794934, True, 'nomad-R2-50-reg-0.001', 915.93200000000013)
(0.044059107358262968, True, 'nomad-R2-10-reg-0.001', 506.19399999999996)
(0.074464716525934865, True, 'nomad-R2-25-reg-0.001', 486.74000000000001)
(0.092291940909755746, True, 'lemp-paper-Netflix-noav-10', 19.419460000000001)
(0.11167178587099605, True, 'nomad-Netflix-10-reg-0.05', 15.828889999999998)
(0.11740104195933308, True, 'nomad-KDD-10-reg-1', 1234.2849999999999)
(0.24068297303589059, False, 'lemp-paper-KDD-50', -6200.9399999999987)
(0.31322217346789527, True, 'lemp-paper-Netflix-50', 13.692)
(0.40238253875237812, False, 'nomad-KDD-25-reg-0.001', -4887.0)
(0.51543074447857251, True, 'nomad-Netflix-25-reg-0.05', 2.3318000000000012)
(0.54161181804885183, False, 'lemp-paper-Netflix-noav-50', -4.484099999999998)
(0.61793165453188093, False, 'nomad-KDD-50-reg-1', -21070.519999999997)
(0.64852415852552314, False, 'nomad-KDD-100-reg-1', -9995352.4700000007)
(0.697