In [1]:
from math import ceil
import pandas as pd
import numpy as np
import glob
import re

In [2]:
# Timing results: one CSV per (dataset, method) under timing-results/.
# The per-dataset frames stay available under their own names.
simdex_netflix_df, simdex_kdd_df, simdex_r2_df = map(pd.read_csv, (
    'timing-results/netflix-simdex-timing.csv',
    'timing-results/kdd-simdex-timing.csv',
    'timing-results/r2-simdex-timing.csv',
))
blocked_mm_netflix_df, blocked_mm_kdd_df, blocked_mm_r2_df = map(pd.read_csv, (
    'timing-results/netflix-blocked_mm-timing.csv',
    'timing-results/kdd-blocked_mm-timing.csv',
    'timing-results/r2-blocked_mm-timing.csv',
))

# One frame per method, datasets stacked in Netflix, KDD, R2 order.
# Row labels are kept as read (no ignore_index), so they may repeat.
simdex_df = pd.concat([simdex_netflix_df, simdex_kdd_df, simdex_r2_df])
blocked_mm_df = pd.concat([blocked_mm_netflix_df, blocked_mm_kdd_df, blocked_mm_r2_df])


In [3]:
# Return stats DataFrame, also saved as a CSV in decision-rule-K-<K>.csv
def generate_decision_rule_table(stats_dir, K=1):
    """Summarise per-user stats files into one decision-rule row per model.

    Every file matching ``<stats_dir>/*_user_stats_K-<K>_*csv`` is read; its
    ``num_items_visited`` values are each rounded up to a multiple of 4096 and
    then averaged. The row written for that file also carries the dataset's
    user/item counts and the minimum ``comp_time`` found for that model and K
    in the module-level ``blocked_mm_df`` and ``simdex_df`` frames.

    Parameters
    ----------
    stats_dir : str
        Directory containing the stats CSVs; a trailing '/' is optional.
    K : int
        Value of K used both in the file pattern and in the timing lookups.

    Returns
    -------
    pandas.DataFrame
        The contents of ``decision-rule-K-<K>.csv`` (written to the current
        working directory) read back with ``pd.read_csv``.

    Raises
    ------
    ValueError
        If a stats file path contains none of 'Netflix', 'R2' or 'KDD'.
        Previously such a file either failed with an unbound-variable error
        or silently reused the sizes of the file processed before it.
    """
    # (substring searched for in the stats file path, num_users, num_items);
    # checked in this order, first hit wins.
    dataset_sizes = (
        ('Netflix', 480189, 17770),
        ('R2', 1823179, 136736),
        ('KDD', 100990, 626961),
    )
    block_size = 4096

    if stats_dir[-1] != '/': stats_dir += '/'
    csv_fname = 'decision-rule-K-%d.csv' % K
    with open(csv_fname, 'w') as csv_out:
        print('model,avg_num_items_visited,num_users,num_items,mm_time,simdex_time', file=csv_out)
        # stats_dir already ends with '/', so no extra separator is needed.
        for stats_path in glob.iglob('%s*_user_stats_K-%d_*csv' % (stats_dir, K)):
            for tag, num_users, num_items in dataset_sizes:
                if tag in stats_path:
                    break
            else:
                raise ValueError(
                    'cannot infer dataset (Netflix/R2/KDD) from stats file %r' % stats_path)

            df = pd.read_csv(stats_path)
            # Round each per-user count up to a whole number of blocks before averaging.
            avg_num_items_visited = np.mean(
                [ceil(v / float(block_size)) * block_size for v in df['num_items_visited']])

            # Strip the directory prefix and the "_user_stats_K-<K>_<digits>.csv"
            # suffix to recover the model name used in the timing frames.
            model = stats_path[len(stats_dir):]
            model = re.sub(r'_user_stats_K-%d_\d+\.csv' % K, '', model)

            selector = 'model == "%s" and K == %d' % (model, K)
            # .min() of an empty selection is NaN, which is written as "nan".
            mm_time = blocked_mm_df.query(selector)['comp_time'].min()
            simdex_time = simdex_df.query(selector)['comp_time'].min()
            # %d truncates the average to an integer in the CSV.
            print('%s,%d,%d,%d,%f,%f' % (model, avg_num_items_visited, num_users, num_items,
                                         mm_time, simdex_time), file=csv_out)
    return pd.read_csv(csv_fname)

In [4]:
# Build the K=10 table from the stats files in decision-rule-with-K/
# (this also writes decision-rule-K-10.csv) and display it.
data = generate_decision_rule_table('decision-rule-with-K/', 10)
data

Unnamed: 0,model,avg_num_items_visited,num_users,num_items,mm_time,simdex_time
0,lemp-paper-Netflix-50,8379,480189,17770,40.8598,27.1678
1,nomad-KDD-100-reg-1,408039,100990,626961,4646.53,9999999.0
2,nomad-Netflix-10-reg-0.05,5623,480189,17770,25.0189,9.19001
3,lemp-paper-Netflix-noav-10,5358,480189,17770,28.2434,8.82394
4,nomad-Netflix-50-reg-0.05,13832,480189,17770,40.8713,44.5643
5,nomad-R2-100-reg-0.001,4096,1823179,136736,1623.97,59.4874
6,nomad-Netflix-100-reg-0.05,13627,480189,17770,56.6688,76.8572
7,nomad-KDD-50-reg-1,388984,100990,626961,3416.58,24487.1
8,nomad-Netflix-25-reg-0.05,11144,480189,17770,27.0116,24.6798
9,lemp-paper-Netflix-noav-50,11502,480189,17770,40.8502,45.3343


In [5]:
def decision_rule(data, runtime_col='simdex_time', BLOCK_SIZE=4096):
    """Print one tuple per row of ``data``, ordered by ascending ratio.

    Each printed tuple is ``(ratio, flag, model, delta)`` where

    * ``ratio`` = (avg_num_items_visited - BLOCK_SIZE) / (num_items - BLOCK_SIZE)
    * ``flag``  = True when ``mm_time`` is larger than ``data[runtime_col]``
    * ``delta`` = ``mm_time`` - ``data[runtime_col]``

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``mm_time``, ``avg_num_items_visited``,
        ``num_items``, ``model`` and the column named by ``runtime_col``.
    runtime_col : str
        Column compared against ``mm_time``.
    BLOCK_SIZE : int
        Offset subtracted from both the numerator and the denominator of the
        ratio; with 0 the ratio is avg_num_items_visited / num_items.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    mm_time = data["mm_time"]
    other_time = data[runtime_col]

    visited_ratio = (data["avg_num_items_visited"] - BLOCK_SIZE) / (data['num_items'] - BLOCK_SIZE)
    mm_is_slower = mm_time > other_time
    time_gap = mm_time - other_time

    # sorted() is stable, so rows with equal ratios keep their input order.
    ordered = sorted(
        zip(visited_ratio, mm_is_slower, data["model"], time_gap),
        key=lambda entry: entry[0],
    )
    for entry in ordered:
        print(entry)

In [6]:
# Print (ratio, mm_time > simdex_time, model, mm_time - simdex_time) for each
# row of `data`, sorted by ratio, using the default BLOCK_SIZE of 4096.
decision_rule(data)

(0.0, True, 'nomad-R2-100-reg-0.001', 1564.4826)
(0.043222255729794934, True, 'nomad-R2-50-reg-0.001', 915.93200000000013)
(0.044059107358262968, True, 'nomad-R2-10-reg-0.001', 528.4194)
(0.074464716525934865, True, 'nomad-R2-25-reg-0.001', 692.99660000000006)
(0.092291940909755746, True, 'lemp-paper-Netflix-noav-10', 19.419460000000001)
(0.11167178587099605, True, 'nomad-Netflix-10-reg-0.05', 15.828889999999998)
(0.11740104195933308, True, 'nomad-KDD-10-reg-1', 1272.0349999999999)
(0.24068297303589059, False, 'lemp-paper-KDD-50', -6200.9399999999987)
(0.31322217346789527, True, 'lemp-paper-Netflix-50', 13.692)
(0.40238253875237812, False, 'nomad-KDD-25-reg-0.001', -4764.1800000000003)
(0.51543074447857251, True, 'nomad-Netflix-25-reg-0.05', 2.3318000000000012)
(0.54161181804885183, False, 'lemp-paper-Netflix-noav-50', -4.484099999999998)
(0.61793165453188093, False, 'nomad-KDD-50-reg-1', -21070.519999999997)
(0.64852415852552314, False, 'nomad-KDD-100-reg-1', -9995352.4700000007)
(0.6

In [7]:
# LEMP timing results, one CSV per dataset, then stacked in Netflix, KDD, R2 order.
lemp_netflix_df, lemp_kdd_df, lemp_r2_df = map(pd.read_csv, (
    'timing-results/netflix-lemp-timing.csv',
    'timing-results/kdd-lemp-timing.csv',
    'timing-results/r2-lemp-timing.csv',
))
lemp_df = pd.concat([lemp_netflix_df, lemp_kdd_df, lemp_r2_df])

# Separate stats file consumed by generate_lemp_decision_rule_table.
lemp_decision_rule_df = pd.read_csv('lemp-decision-rule-stats.csv')

In [8]:
# Return stats DataFrame sorted by (model, K), also saved as lemp-decision-rule.csv
def generate_lemp_decision_rule_table(lemp_decision_rule_df):
    """Combine LEMP decision-rule stats with measured timings, one row per input row.

    For each row of ``lemp_decision_rule_df`` the output row holds the model,
    K, ``num_comparisons / num_users`` from that row, a dataset-wide user
    count chosen from the model name, the row's ``num_items``, and the minimum
    ``comp_time`` for that (model, K) in the module-level ``blocked_mm_df``
    and ``lemp_df`` frames.

    Parameters
    ----------
    lemp_decision_rule_df : pandas.DataFrame
        Needs the columns ``model``, ``K``, ``num_comparisons``,
        ``num_users`` and ``num_items``.

    Returns
    -------
    pandas.DataFrame
        ``lemp-decision-rule.csv`` (written to the current working directory)
        read back and sorted by ``model`` then ``K``.
    """
    out_path = 'lemp-decision-rule.csv'
    row_format = '%s,%d,%d,%d,%d,%f,%f'

    with open(out_path, 'w') as sink:
        print('model,K,avg_num_items_visited,num_users,num_items,mm_time,lemp_time', file=sink)
        for _, stats in lemp_decision_rule_df.iterrows():
            model, K = stats['model'], stats['K']
            avg_num_items_visited = stats['num_comparisons'] / stats['num_users']

            # Dataset-wide user count; any model name without 'Netflix' or
            # 'R2' falls through to the last value.
            if 'Netflix' in model:
                num_users = 480189
            elif 'R2' in model:
                num_users = 1823179
            else:
                num_users = 100990
            num_items = stats['num_items']

            selector = 'model == "%s" and K == %d' % (model, K)
            mm_time = blocked_mm_df.query(selector)['comp_time'].min()
            lemp_time = lemp_df.query(selector)['comp_time'].min()

            # The %d fields truncate towards zero, including the average.
            print(row_format % (model, K, avg_num_items_visited, num_users, num_items,
                                mm_time, lemp_time), file=sink)

    table = pd.read_csv(out_path)
    return table.sort_values(by=['model', 'K'])

In [9]:
# Full LEMP-vs-MM table, then one slice per K value of interest.
lemp_vs_mm_all = generate_lemp_decision_rule_table(lemp_decision_rule_df)
lemp_vs_mm_K_1, lemp_vs_mm_K_5, lemp_vs_mm_K_10, lemp_vs_mm_K_50 = map(
    lemp_vs_mm_all.query,
    ('K == 1', 'K == 5', 'K == 10', 'K == 50'),
)

In [17]:
# For every (model, K) in the decision-rule stats, print comparisons-per-user
# as computed from lemp_df next to the same ratio from lemp_decision_rule_df.
print('Model, K, w / w_hat')
for _, row in lemp_decision_rule_df.iterrows():
    model = row['model']
    K = row['K']
    # Ratio taken from the decision-rule stats row itself.
    w_hat = row['num_comparisons'] / row['num_users']
    # Ratio taken from the matching timing rows; only the first match is used.
    actual_row = lemp_df.query('model == "%s" and K == %d' % (model, K))
    actual_w = (actual_row['num_comparisons'] / actual_row['num_users']).values[0]
    print(model, K, actual_w, w_hat)

Model, K, w / w_hat
lemp-paper-KDD-50 1 575627.044311 2634.42
lemp-paper-KDD-50 10 624939.939922 10988.812
lemp-paper-KDD-50 5 624924.470949 4159.306
lemp-paper-KDD-50 50 624945.500443 18298.082
lemp-paper-Netflix-50 1 2411.16605128 3461.175
lemp-paper-Netflix-50 10 3585.07227779 5694.995833333333
lemp-paper-Netflix-50 5 3159.59735438 5029.179166666667
lemp-paper-Netflix-50 50 6019.27348606 8402.989583333334
lemp-paper-Netflix-noav-10 1 1757.52071372 1814.3270833333333
lemp-paper-Netflix-noav-10 10 2759.82596019 2753.0916666666667
lemp-paper-Netflix-noav-10 5 2335.91705766 2350.2395833333335
lemp-paper-Netflix-noav-10 50 4387.39504029 4585.814583333334
lemp-paper-Netflix-noav-100 1 8302.66625016 10001.010416666666
lemp-paper-Netflix-noav-100 10 10066.7733726 12031.960416666667
lemp-paper-Netflix-noav-100 5 9321.96957656 11473.489583333334
lemp-paper-Netflix-noav-100 50 11941.3990137 13405.375
lemp-paper-Netflix-noav-50 1 4466.08065991 6497.977083333333
lemp-paper-Netflix-noav-50 10 688

In [10]:
# K == 1 rows, mm_time vs lemp_time. With BLOCK_SIZE=0 the ratio is simply
# avg_num_items_visited / num_items.
decision_rule(lemp_vs_mm_K_1, runtime_col='lemp_time', BLOCK_SIZE=0)

(0.0042146629949708862, False, 'lemp-paper-KDD-50', -15008.990000000002)
(0.018151532655637712, True, 'nomad-KDD-10-reg-1', 1682.903)
(0.024865551610420492, False, 'nomad-KDD-50-reg-1', -381.09000000000015)
(0.025260355721975193, True, 'nomad-R2-10-reg-0.001', 526.12789999999995)
(0.025457816522349637, True, 'nomad-R2-100-reg-0', 1247.3710000000001)
(0.034796981043763164, True, 'nomad-R2-50-reg-0.000001', 771.64200000000005)
(0.041035279663000232, True, 'nomad-R2-25-reg-0.001', 515.81599999999992)
(0.071679672811583439, True, 'nomad-KDD-25-reg-0.001', 468.50999999999999)
(0.078600104646529945, False, 'nomad-KDD-100-reg-1', -2541.3400000000001)
(0.10208216094541361, True, 'lemp-paper-Netflix-noav-10', 13.022839999999999)
(0.1947664603263928, False, 'lemp-paper-Netflix-50', -3.306200000000004)
(0.36561620709060216, False, 'lemp-paper-Netflix-noav-50', -41.0)
(0.40320765334833991, False, 'nomad-Netflix-100-reg-0.05', -104.7833)
(0.44113674732695557, False, 'nomad-Netflix-50-reg-0.05', -52

In [11]:
# K == 5 rows, mm_time vs lemp_time. With BLOCK_SIZE=0 the ratio is simply
# avg_num_items_visited / num_items.
decision_rule(lemp_vs_mm_K_5, runtime_col='lemp_time', BLOCK_SIZE=0)

(0.0066548152604722537, False, 'lemp-paper-KDD-50', -19834.079999999998)
(0.029750760589749591, True, 'nomad-R2-100-reg-0', 1173.9399999999998)
(0.030745377954598644, True, 'nomad-R2-10-reg-0.001', 537.40020000000004)
(0.03372530445899824, True, 'nomad-KDD-10-reg-1', 1667.3299999999999)
(0.048831324596302364, True, 'nomad-R2-50-reg-0.000001', 744.50099999999998)
(0.052188158202667916, True, 'nomad-R2-25-reg-0.001', 504.22700000000009)
(0.063535164594270679, False, 'nomad-KDD-50-reg-1', -2474.27)
(0.092736986787975567, False, 'nomad-KDD-25-reg-0.001', -216.27999999999997)
(0.13224535734383794, True, 'lemp-paper-Netflix-noav-10', 13.575200000000001)
(0.28300506471581316, False, 'lemp-paper-Netflix-50', -22.191199999999995)
(0.321812081073859, False, 'nomad-KDD-100-reg-1', -4060.2000000000007)
(0.43849184018007881, False, 'lemp-paper-Netflix-noav-50', -58.918000000000006)
(0.46651660101294318, False, 'nomad-Netflix-100-reg-0.05', -135.2886)
(0.49814293753517164, False, 'nomad-Netflix-50-r

In [12]:
# K == 10 rows, mm_time vs lemp_time. With BLOCK_SIZE=0 the ratio is simply
# avg_num_items_visited / num_items.
decision_rule(lemp_vs_mm_K_10, runtime_col='lemp_time', BLOCK_SIZE=0)

(0.017581897110379687, False, 'lemp-paper-KDD-50', -20239.34)
(0.033729230049145797, True, 'nomad-R2-100-reg-0', 1135.2749999999999)
(0.040583332399941753, True, 'nomad-KDD-10-reg-1', 1470.7930000000001)
(0.045905979405569855, True, 'nomad-R2-10-reg-0.001', 515.87819999999999)
(0.056013047039550665, True, 'nomad-R2-50-reg-0.000001', 726.79200000000014)
(0.057687807161245025, True, 'nomad-R2-25-reg-0.001', 487.43800000000005)
(0.09340262832400742, False, 'nomad-KDD-50-reg-1', -2633.6999999999998)
(0.12002348946574266, False, 'nomad-KDD-25-reg-0.001', -878.85000000000014)
(0.15492402926280247, True, 'lemp-paper-Netflix-noav-10', 15.594600000000002)
(0.32042768711311198, False, 'lemp-paper-Netflix-50', -31.3461)
(0.35773592272157784, False, 'nomad-KDD-100-reg-1', -6399.170000000001)
(0.4800225098480585, False, 'lemp-paper-Netflix-noav-50', -68.647800000000004)
(0.5051209904333146, False, 'nomad-Netflix-100-reg-0.05', -158.39920000000001)
(0.55813168261114232, False, 'nomad-Netflix-50-reg-

In [13]:
# K == 50 rows, mm_time vs lemp_time. With BLOCK_SIZE=0 the ratio is simply
# avg_num_items_visited / num_items.
decision_rule(lemp_vs_mm_K_50, runtime_col='lemp_time', BLOCK_SIZE=0)

(0.029278626986323948, False, 'lemp-paper-KDD-50', -16502.009999999998)
(0.04153258834542476, True, 'nomad-R2-100-reg-0', 947.61199999999997)
(0.054799028785396678, True, 'nomad-R2-10-reg-0.001', 507.45700000000005)
(0.067078165223496369, True, 'nomad-R2-50-reg-0.000001', 589.06499999999994)
(0.075270296866524478, True, 'nomad-KDD-10-reg-1', 1367.4250000000002)
(0.084535162649192599, True, 'nomad-R2-25-reg-0.001', 412.77700000000004)
(0.13485961523999099, False, 'nomad-KDD-50-reg-1', -5052.5200000000004)
(0.19965085821355252, False, 'nomad-KDD-25-reg-0.001', -1901.3899999999999)
(0.23772043375506632, False, 'nomad-KDD-100-reg-1', -10336.790000000001)
(0.25801913337084975, True, 'lemp-paper-Netflix-noav-10', 12.027299999999997)
(0.47281935846933032, False, 'lemp-paper-Netflix-50', -63.919200000000004)
(0.59206527855936975, False, 'lemp-paper-Netflix-noav-50', -88.710599999999999)
(0.63190770962296006, False, 'nomad-Netflix-100-reg-0.05', -205.40449999999998)
(0.67017445132245357, False,