In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import re
from models import *
from plot_utils import *
from math import ceil
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Baseline matplotlib settings, applied in a single update call.
plt.rcParams.update({
    'figure.figsize': (10, 6.5),
    'font.size': 29,
    'font.weight': 500,
    'axes.titlesize': 30,
    'axes.labelsize': 29,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25,
    'legend.fontsize': 29,
    'lines.markersize': 11,
    'lines.linewidth': 4,
})

# seaborn: 'ticks' axes style and the 10-colour 'Paired' palette as default cycle
sns.set_style('ticks')
sns.set_palette('Paired', 10)

In [None]:
# --- side inputs -----------------------------------------------------------
rmse_df = pd.read_csv('rmse.csv')
sampling_df = pd.read_csv('../decision-rule/simdex-decision-rule.csv')

# --- SimDex timings, one CSV per dataset -----------------------------------
simdex_netflix_df = pd.read_csv('timing-results/netflix-simdex-timing.csv')
simdex_kdd_df = pd.read_csv('timing-results/kdd-simdex-timing.csv')
simdex_r2_df = pd.read_csv('timing-results/r2-simdex-timing.csv')
simdex_lastfm_df = pd.read_csv('timing-results/lastfm-simdex-timing.csv')
simdex_glove_df = pd.read_csv('timing-results/glove-simdex-timing.csv')

# --- Blocked MM timings, one CSV per dataset -------------------------------
blocked_mm_netflix_df = pd.read_csv('timing-results/netflix-blocked_mm-timing.csv')
blocked_mm_kdd_df = pd.read_csv('timing-results/kdd-blocked_mm-timing.csv')
blocked_mm_r2_df = pd.read_csv('timing-results/r2-blocked_mm-timing.csv')
blocked_mm_lastfm_df = pd.read_csv('timing-results/lastfm-blocked_mm-timing.csv')
blocked_mm_glove_df = pd.read_csv('timing-results/glove-blocked_mm-timing.csv')

# --- LEMP timings (single CSV) ---------------------------------------------
lemp_df = pd.read_csv('timing-results/lemp-gold-standard-timing.csv')

# --- stacked frames: datasets in the order netflix, kdd, r2, lastfm, glove --
simdex_df = pd.concat([
    simdex_netflix_df,
    simdex_kdd_df,
    simdex_r2_df,
    simdex_lastfm_df,
    simdex_glove_df,
])
blocked_mm_df = pd.concat([
    blocked_mm_netflix_df,
    blocked_mm_kdd_df,
    blocked_mm_r2_df,
    blocked_mm_lastfm_df,
    blocked_mm_glove_df,
])
both_df = pd.concat([simdex_df, blocked_mm_df])

# --- FEXIPRO timings, split by the value of the 'alg' column ---------------
fexipro_df_all = pd.read_csv('timing-results/fexipro-orig-timing.csv')
fexipro_df = fexipro_df_all.query('alg == "SIR"')
fexipro_si_df = fexipro_df_all.query('alg == "SI"')

# --- runtime estimates, one CSV per algorithm ------------------------------
fexipro_estimates = pd.read_csv('runtime-estimates/fexipro.csv')
lemp_estimates = pd.read_csv('runtime-estimates/lemp.csv')
simdex_estimates = pd.read_csv('runtime-estimates/simdex.csv')
# NOTE: the name below is misspelled ('esitmates'); it is kept because a later
# cell refers to it under exactly this spelling.
blocked_mm_esitmates = pd.read_csv('runtime-estimates/blocked_mm.csv')

In [None]:
# Show the concatenated SimDex timing frame as this cell's output.
simdex_df

In [None]:
# Minimum comp_time of model "nomad-R2-50-reg-0.000001" at K == 1, printed for
# the SimDex, LEMP and Blocked MM frames in that order.
_r2_query = 'model == "nomad-R2-50-reg-0.000001" and K == 1'
for _timing_df in (simdex_df, lemp_df, blocked_mm_df):
    print(_timing_df.query(_r2_query)['comp_time'].min())


In [None]:
# Columns dropped from the per-(model, K) summary.
_dropped_cols = ['num_threads', 'num_latent_factors',
                 'num_bins', 'sample_percentage',
                 'num_iters', 'parse_time']

# Sort ascending by comp_time, then take the first entry of every column
# within each (model, K) group.
temp = (
    simdex_df
    .sort_values(by='comp_time')
    .groupby(['model', 'K'], as_index=False)
    .first()
    .drop(_dropped_cols, axis=1)
)

# Largest cluster_time / comp_time ratio across the groups, as a percentage.
max((temp['cluster_time'] / temp['comp_time'])*100)

In [None]:
# Minimum comp_time of model "nomad-Netflix-25-reg-0.05" at K == 1, printed for
# the Blocked MM, LEMP and FEXIPRO-SI frames in that order.
_netflix_query = 'model == "nomad-Netflix-25-reg-0.05" and K == 1'
for _timing_df in (blocked_mm_df, lemp_df, fexipro_si_df):
    print(_timing_df.query(_netflix_query)['comp_time'].min())

In [None]:
# Source palettes from seaborn.
palette = sns.color_palette('Paired', 4)
blue_palette = sns.color_palette('Blues', 10)
green_palette = sns.color_palette('Greens', 10)

# Five-colour palette mixing 'Paired' entries with mid-range blues/greens.
f_u_palette = [
    palette[1],
    blue_palette[-5],
    palette[3],
    green_palette[-5],
    palette[2],
]
# Same colours with the second entry left out.
appetizer_palette = [f_u_palette[0]] + f_u_palette[2:]
# Alias (same list object) of f_u_palette.
rmse_reg_palette = f_u_palette

# Swatches of the source palettes.
for _palette_name, _n_colors in (('Paired', 8), ('Blues', 10), ('Greens', 10)):
    sns.palplot(sns.color_palette(_palette_name, _n_colors))
sns.palplot(sns.diverging_palette(240, 128, n=6))

# Swatches of the derived palettes, each preceded by a printed caption.
for _caption, _colors in (
    ('FU Plots', f_u_palette),
    ('Appetizer plots', appetizer_palette),
    ('RMSE plots', rmse_reg_palette),
):
    print(_caption)
    sns.palplot(_colors)

In [None]:
# Parallel-experiment timings, restricted to rows with num_threads < 32.
lemp_parallel_df = (
    pd.read_csv('timing-results/parallel-experiments/netflix-lemp.csv')
    .query('num_threads < 32')
)
blocked_mm_parallel_df = (
    pd.read_csv('timing-results/parallel-experiments/netflix-blocked_mm.csv')
    .query('num_threads < 32')
)
simdex_parallel_df = (
    pd.read_csv('timing-results/parallel-experiments/netflix-simdex.csv')
    .query('num_threads < 32')
)

# Font sizes are reduced by a common offset for this figure.
shrink_factor = 4
plt.rcParams.update({
    'figure.figsize': (10, 4.1),
    'font.size': 25 - shrink_factor,
    'font.weight': 500,
    'axes.titlesize': 26 - shrink_factor,
    'axes.labelsize': 25 - shrink_factor,
    'xtick.labelsize': 22 - shrink_factor,
    'ytick.labelsize': 22 - shrink_factor,
    'legend.fontsize': 24 - shrink_factor,
    'lines.markersize': 11,
    'lines.linewidth': 4,
})

with sns.color_palette(f_u_palette):
    fig, ax = plt.subplots()

    # One line per algorithm: runtime against thread count.
    for _series_df, _marker, _label in (
        (blocked_mm_parallel_df, 'o', 'Blocked MM'),
        (simdex_parallel_df, '^', 'Maximus'),
        (lemp_parallel_df, 's', 'LEMP'),
    ):
        ax.plot(_series_df['num_threads'], _series_df['comp_time'],
                marker=_marker, label=_label)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Num Threads')
    ax.set_ylabel('Runtime (s), log scale')
    ax.set_title(r'Netflix-DSGD, $f=100$')

    # Tick positions and labels are taken from the SimDex thread counts.
    _thread_counts = simdex_parallel_df['num_threads']
    ax.set_xticks(_thread_counts)
    ax.set_xticklabels(_thread_counts)

    ax.legend(bbox_to_anchor=(0, 0, 1, 1.1))
    ax.grid()
    sns.despine()
    save_figure('parallel-experiment')

In [None]:
# Call f_u_plot_single once per entry of GLOVE_MODELS, passing all timing
# frames plus the sampling frame, with num_clusters fixed at 8.
# GLOVE_MODELS and f_u_plot_single are not defined in this notebook; presumably
# they come from the star imports at the top — TODO confirm.
for model in GLOVE_MODELS:
    f_u_plot_single(simdex_df, lemp_df, blocked_mm_df, fexipro_df,
                    fexipro_si_df, sampling_df, model=model, num_clusters=8)

In [None]:
# Call f_u_plot_single once per entry of BPR_GOLD_STANDARD_MODELS, passing all
# timing frames plus the sampling frame, with num_clusters fixed at 8.
# BPR_GOLD_STANDARD_MODELS is not defined in this notebook; presumably it comes
# from the star imports at the top — TODO confirm.
for model in BPR_GOLD_STANDARD_MODELS:
    f_u_plot_single(simdex_df, lemp_df, blocked_mm_df, fexipro_df,
                    fexipro_si_df, sampling_df, model=model, num_clusters=8)

In [None]:
# Draw the f_u_plots figure for GOLD_STANDARD_MODELS with f_u_palette
# as the active seaborn palette (scoped to this `with` block).
# The trailing numeric comments look like previously tried values for figsize
# height and y_title — TODO confirm.
with sns.color_palette(f_u_palette):
    f_u_plots(simdex_df, lemp_df, blocked_mm_df, fexipro_df, fexipro_si_df, sampling_df,
              GOLD_STANDARD_MODELS, figsize=(28, 17), # 15
              bbox_to_anchor=(0, 0, 1, 0.99), nrows=5, y_title=1.09) # 1.06

In [None]:
# Smaller fonts for this figure.
plt.rcParams.update({
    'font.size': 18,
    'font.weight': 500,
    'axes.titlesize': 20,
    'axes.labelsize': 17,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 18,
})

with sns.color_palette(appetizer_palette):
    benchmark_against_blocked_mm_multi(
        lemp_df, blocked_mm_df, fexipro_df,
        models=['nomad-Netflix-50-reg-0.05', 'nomad-R2-50-reg-0.000001'],
        num_clusters=8,
        bbox_to_anchor=(0, 0, 1, 1.09),
        y_title=-.5,
        figsize=(10, 3),
    )

# Back to the baseline figure settings.
plt.rcParams.update({
    'figure.figsize': (10, 6.5),
    'font.size': 29,
    'font.weight': 500,
    'axes.titlesize': 30,
    'axes.labelsize': 29,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25,
    'legend.fontsize': 29,
    'lines.markersize': 11,
    'lines.linewidth': 4,
})


In [None]:
# One comparison figure per (model, title) pair, drawn with appetizer_palette.
with sns.color_palette(appetizer_palette):
    for _model_name, _plot_title in (
        ('nomad-R2-50-reg-0.001', r'Yahoo Music R2, $f=50$'),
        ('nomad-Netflix-50-reg-0.05', r'Netflix Prize, $f=50$'),
    ):
        blocked_mm_lemp_fexipro_plot(
            blocked_mm_df, lemp_df, fexipro_df, fexipro_si_df,
            _model_name,
            y_title=1.01,
            figsize=(7, 5.5),
            title=_plot_title,
        )

In [None]:
# Custom settings for the line plots.
plt.rcParams.update({
    'font.size': 18,
    'font.weight': 500,
    'axes.titlesize': 19,
    'figure.titlesize': 18,
    'axes.labelsize': 19,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'legend.fontsize': 18,
    'lines.markersize': 10,
    'lines.linewidth': 4,
})

# Models to plot; the commented entries are alternatives that are switched off.
_estimate_models = [
    'lemp-paper-KDD-50',
    # 'lemp-paper-Netflix-noav-50',
    # 'nomad-KDD-50-reg-1',
    # 'nomad-Netflix-50-reg-0.05',
    # 'nomad-R2-50-reg-0.000001',
]
runtime_estimates_plot(
    _estimate_models,
    lemp_estimates, lemp_df,
    fexipro_estimates, fexipro_df_all,
    simdex_estimates, simdex_df,
    blocked_mm_esitmates, blocked_mm_df,
    markerstyle='X',
    figsize=(8, 3),
)

# Return to the larger default sizes.
plt.rcParams.update({
    'font.size': 29,
    'font.weight': 500,
    'axes.titlesize': 32,
    'figure.titlesize': 32,
    'axes.labelsize': 29,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25,
    'legend.fontsize': 29,
    'lines.markersize': 10,
    'lines.linewidth': 4,
})

In [None]:
# Custom settings for the line plots.
plt.rcParams.update({
    'font.size': 18,
    'font.weight': 500,
    'axes.titlesize': 23,
    'figure.titlesize': 23,
    'axes.labelsize': 21,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'legend.fontsize': 13,
    'lines.markersize': 12,
    'lines.linewidth': 3.5,
})

# Each entry: (model prefix, regularisation values, annotation offset, title).
appetizer_models = [
    ('nomad-Netflix-25', NETFLIX_REGS, (10, 125), r'Netflix-NOMAD, $f=25$'),
    ('nomad-R2-25', R2_REGS, (-25, 80), r'R2-NOMAD, $f=25$'),
]

all_models = [
    ('nomad-Netflix-50', NETFLIX_REGS, (-5, 135), r'Netflix-NOMAD, $f=50$'),
    ('nomad-Netflix-100', NETFLIX_REGS, (-5, 135), r'Netflix-NOMAD, $f=100$'),
    ('nomad-R2-50', R2_REGS[:-1], (15, 150), r'R2-NOMAD, $f=50$'),
    ('nomad-R2-100', R2_REGS[:-1], (60, 120), r'R2-NOMAD, $f=100$'),
    ('nomad-KDD-10', KDD_REGS, (-24, 135), r'KDD-NOMAD, $f=10$'),
]

# First pass (add_simdex=True): all_models with SimDex data, no file name.
# Second pass (add_simdex=False): appetizer_models without SimDex data,
# annotated, and with an 'appetizer-<prefix>' file name.
for add_simdex in (True, False):
    if add_simdex:
        models = all_models
        _palette = rmse_reg_palette
    else:
        models = appetizer_models
        _palette = appetizer_palette

    for model_prefix, regs, xy_text, title_text in models:
        _simdex_df = simdex_df if add_simdex else None
        fname = None if add_simdex else 'appetizer-' + model_prefix
        with sns.color_palette(_palette):
            rmse_and_reg_plots(
                blocked_mm_df, lemp_df, rmse_df, model_prefix, regs,
                simdex_df=_simdex_df,
                fexipro_df=fexipro_df,
                fexipro_si_df=fexipro_si_df,
                fname=fname,
                figsize=(5.5, 7),
                bbox_to_anchor=(0, 0, 1, 1.05),
                title=add_simdex,
                title_text=title_text,
                y_title=-0.5,
                annotate=not add_simdex,
                xy_text=xy_text,
                linestyle='--',
                markerstyle='X',
                include_legend=False,
            )
            rmse_and_reg_legend(add_simdex, linestyle='--', markerstyle='X')

# Return to the larger default sizes and the default palette.
plt.rcParams.update({
    'font.size': 29,
    'font.weight': 500,
    'axes.titlesize': 32,
    'figure.titlesize': 32,
    'axes.labelsize': 29,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25,
    'legend.fontsize': 29,
    'lines.markersize': 10,
    'lines.linewidth': 4,
})
sns.set_palette('Paired', 10)


In [None]:
# Reduced text sizes for the wide multi-panel figure.
plt.rcParams.update({
    'axes.titlesize': 23,
    'legend.fontsize': 21,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
    'axes.labelsize': 21,
})

_cluster_models = [
    'lemp-paper-Netflix-noav-50',
    'nomad-Netflix-50-reg-0.05',
    'nomad-R2-10-reg-0.001',
    'nomad-R2-100-reg-0',
]
num_clusters_vs_runtime(
    simdex_df, _cluster_models,
    figsize=(25, 5),
    y_title=-0.3,
    bbox_to_anchor=(0, 0, 1, 1.05),
    min_value=0,
    max_value=512,
)

# Put the text sizes back.
plt.rcParams.update({
    'axes.titlesize': 30,
    'legend.fontsize': 29,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25,
    'axes.labelsize': 29,
})

In [None]:
# Reduced text sizes for the wide multi-panel figure.
plt.rcParams.update({
    'axes.titlesize': 23,
    'legend.fontsize': 21,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
    'axes.labelsize': 21,
})

# TODO: lemp-paper-Netflix-noav-50 and nomad-R2-50-reg-0.000001 don't have all the clusters
_batch_models = [
    'nomad-Netflix-50-reg-0.05',
    'lemp-paper-Netflix-noav-50',
    'nomad-R2-50-reg-0.000001',
    'nomad-R2-100-reg-0',
]
batch_size_vs_runtime(
    simdex_df, _batch_models,
    figsize=(25, 5),
    bbox_to_anchor=(0, 0, 1, 1.05),
    y_title=-0.3,
)

# Put the text sizes back.
plt.rcParams.update({
    'axes.titlesize': 30,
    'legend.fontsize': 29,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25,
    'axes.labelsize': 29,
})

In [None]:
# Bigger titles and thicker lines for this figure only.
plt.rcParams.update({'axes.titlesize': 32, 'lines.linewidth': 6})

_point_query_models = [
    'lemp-paper-Netflix-noav-50',
    'nomad-Netflix-50-reg-0.05',
    'nomad-R2-50-reg-0.001',
    'nomad-KDD-50-reg-1',
]
with sns.color_palette(f_u_palette):
    point_query_time(
        _point_query_models,
        csv_dir='point-query-stats/',
        figsize=(32, 6.5),
        sample_fraction=1.0,
        y_title=-0.38,
    )

# Put title size and line width back.
plt.rcParams.update({'axes.titlesize': 30, 'lines.linewidth': 4})

In [None]:
# Text and line settings for the factor-analysis figure (not restored afterwards).
plt.rcParams.update({
    'font.size': 25,
    'axes.titlesize': 26,
    'axes.labelsize': 16,
    'xtick.labelsize': 17,
    'ytick.labelsize': 17,
    'legend.fontsize': 17,
    'lines.markersize': 11,
    'lines.linewidth': 2.50,
})

factor_analysis(figsize=(8, 3))

In [None]:
# Bar chart of total computation time, overlaid with hatched bars for index
# construction time, for LEMP and FEXIPRO on the "lemp-paper-Netflix-noav"
# models at K == 1.

# --- LEMP rows --------------------------------------------------------------
lemp_model_df = lemp_df.query('model.str.contains("lemp-paper-Netflix-noav") and K == 1', engine='python')
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of lemp_model_df (chained assignment).
lemp_data = lemp_model_df[['model', 'num_latent_factors', 'comp_time']].copy()
# For LEMP the index time is preprocessing plus index construction.
lemp_data['index_time'] = lemp_model_df['preproc_time'] + lemp_model_df['index_time']
lemp_data['algo'] = 'LEMP'

# --- figure settings: font sizes reduced by a common offset -----------------
shrink_factor = 7
plt.rcParams['figure.figsize'] = (10, 3)
plt.rcParams['font.size'] = 28 - shrink_factor
plt.rcParams['font.weight'] = 500
plt.rcParams['axes.titlesize'] = 30 - shrink_factor
plt.rcParams['axes.labelsize'] = 26 - shrink_factor
plt.rcParams['xtick.labelsize'] = 25 - shrink_factor
plt.rcParams['ytick.labelsize'] = 25 - shrink_factor
plt.rcParams['legend.fontsize'] = 26 - shrink_factor
plt.rcParams['lines.markersize'] = 11
plt.rcParams['lines.linewidth'] = 4

# --- FEXIPRO rows -----------------------------------------------------------
fexipro_model_df = fexipro_df.query('model.str.contains("lemp-paper-Netflix-noav") and K == 1', engine='python')
# Same .copy() reasoning as for lemp_data above.
fexipro_data = fexipro_model_df[['model', 'num_latent_factors', 'comp_time']].copy()
# For FEXIPRO only the preprocessing time is used as the index time.
fexipro_data['index_time'] = fexipro_model_df['preproc_time']
fexipro_data['algo'] = 'FEXIPRO'

# Stack both algorithms, map model ids through LABEL_DICT and order by the
# number of latent factors (non-mutating chain instead of inplace=True).
data = (
    pd.concat([lemp_data, fexipro_data])
    .replace({"model": LABEL_DICT})
    .sort_values(by='num_latent_factors')
)

print((data['comp_time'] - data['index_time']) / data['index_time'])

# Two barplots on the same axes: plain bars for comp_time, then hatched bars
# for index_time drawn over them.
sns.barplot(x='model', y='comp_time', hue='algo', data=data, edgecolor='black')
ax = sns.barplot(x='model', y='index_time', hue='algo', data=data, hatch='\\', edgecolor='black')
# Drop seaborn's automatic legend; a hand-built one is added below.
ax.legend_.remove()

ax.set_yscale('log')
ax.set_xlabel('')
ax.set_ylabel('Time (s), log scale')
# Strip the '-DSGD' suffix from the tick labels.
xticklabels = [label.get_text().replace('-DSGD', '',) for label in ax.get_xticklabels()]
ax.set_xticklabels(xticklabels)

# Proxy patches for the custom legend.
lemp_legend = plt.Rectangle((0,0),1,1, edgecolor='none')
fexipro_legend = plt.Rectangle((0,0),1,1, fc=palette[1], edgecolor='none')
index_time_legend = plt.Rectangle((0,0),1,1, fill=False, hatch='\\')

# 'lower right' is the valid matplotlib location string; 'bottom right' is not
# one of the accepted legend locations.
legend = ax.legend([index_time_legend, lemp_legend, fexipro_legend],
                    ['Index Construction', 'LEMP', 'FEXIPRO'],
                    loc='lower right',
                    ncol=1,
                    handletextpad=0.5,
                    columnspacing=1.25,
                    bbox_to_anchor=(0.48, 0.58))

def change_width(ax, new_value, shift_indices=(0, 1, 2, 6, 7, 8)):
    """Set every bar in ``ax.patches`` to ``new_value`` wide and shift selected bars.

    Parameters
    ----------
    ax : object with a ``patches`` sequence
        Each patch must provide ``get_width``/``set_width`` and
        ``get_x``/``set_x`` (e.g. the rectangles of a bar plot).
    new_value : float
        Width assigned to every patch.
    shift_indices : iterable of int, optional
        Positions in ``ax.patches`` whose x is moved by
        ``old_width - new_value`` after resizing. The default reproduces the
        previously hard-coded positions 0, 1, 2, 6, 7 and 8 (presumably the
        first hue group of two overlaid three-category bar plots — TODO confirm).

    Returns
    -------
    None
        Patches are modified in place.
    """
    shifted = frozenset(shift_indices)
    for i, patch in enumerate(ax.patches):
        # Amount by which the bar shrinks (negative if it grows).
        diff = patch.get_width() - new_value

        # Resize the bar.
        patch.set_width(new_value)

        # Move the selected bars by the width difference.
        if i in shifted:
            patch.set_x(patch.get_x() + diff)
# Narrow the bars to a width of 0.3, tidy the axes and save the figure together
# with the custom legend.
change_width(ax, .3)
sns.despine()
save_figure('index-construction', (legend,))

# Reset the tick label sizes.
plt.rcParams.update({'xtick.labelsize': 25, 'ytick.labelsize': 25})

In [None]:
# Per-user statistics: items visited against theta_b (box plot, top panel)
# and against theta_uc (scatter, bottom panel).
df = pd.read_csv('user-stats.csv')
_items_visited = df['num_items_visited']

fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1)
sns.boxplot(x=df['theta_b'], y=_items_visited, ax=ax0)
ax1.scatter(x=df['theta_uc'], y=_items_visited, s=5)
sns.despine()

# Cell output: per-cluster medians, ordered by theta_b.
(
    df
    .groupby('cluster_id')
    .agg('median')
    .sort_values(by='theta_b')
)