In [None]:
from __future__ import division

from collections import OrderedDict
import time
import copy
import pickle
import os
import random

import pandas as pd
import numpy as np

import sched

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import matplotlib as mpl
# Write saved figures at 300 dpi.
mpl.rc('savefig', dpi=300)
# Typeset figure text with LaTeX.
# NOTE(review): presumably requires a working LaTeX installation on the machine running the notebook - confirm.
mpl.rc('text', usetex=True)

In [None]:
def sample_arrival_times(all_items, arrival_rate, start_time):
    """
    Draw an arrival timestamp for every item from a Poisson arrival process.

    The result is meant to be used as init_data['arrival_time_of_item'],
    which is handed to the scheduler constructor.

    :param set[str] all_items: A set of item ids
    :param float arrival_rate: The arrival rate for the Poisson process
    :param int start_time: Start time (unix epoch) for the arrival process
    :rtype: OrderedDict
    :return: Maps each item id to its arrival timestamp; items appear in
        (randomly shuffled) arrival order, so timestamps are non-decreasing
    """

    # Items arrive in a uniformly random order
    arrival_order = list(all_items)
    random.shuffle(arrival_order)

    # Gaps between consecutive arrivals are exponential with mean 1 / arrival_rate
    gaps = np.random.exponential(1 / arrival_rate, len(arrival_order))

    # Accumulate the gaps, truncate to whole seconds, and offset by the start time
    elapsed = np.cumsum(gaps, axis=0).astype(int)
    timestamps = start_time + elapsed

    return OrderedDict(zip(arrival_order, timestamps))

Sanity check

In [None]:
# Smoke test: a scheduler that only knows about item 0 (arriving "now")
# should return that item when asked for the next one.
init_data = {
    'arrival_time_of_item' : {0: int(time.time())},
    'review_rates' : [0.25, 0.25, 0.25, 0.25]
}

scheduler = sched.StatefulLQNScheduler(init_data)

# Not read by the assertion below.
history = []

assert scheduler.next_item() == 0

Simulations

In [None]:
# Difficulty shared by all items when using_global_difficulty is True; otherwise it is the
# scale (mean) of the exponential distribution that per-item difficulties are sampled from.
global_item_difficulty = 0.0076899999999998905
# True: every item gets global_item_difficulty. False: difficulties are sampled per item.
using_global_difficulty = True

In [None]:
num_items = 50
if using_global_difficulty:
    # every item shares the same difficulty
    difficulty_of_item = np.ones(num_items) * global_item_difficulty
else:
    # per-item difficulties, exponentially distributed with mean global_item_difficulty
    difficulty_of_item = np.random.exponential(global_item_difficulty, num_items)

In [None]:
# Rate of the Poisson process that introduces new items (passed to sample_arrival_times).
arrival_rate = 0.05
# Number of review timesteps generated for the simulation loop.
num_timesteps_in_sim = 1000

In [None]:
# Items are identified by the integers 0 .. num_items - 1.
all_items = range(num_items)
# Item arrivals start at the current wall-clock time (unix epoch seconds).
start_time = int(time.time())
init_data = {
    'arrival_time_of_item' : sample_arrival_times(all_items, arrival_rate, start_time),
    'review_rates' : [0.25, 0.25, 0.25, 0.25]
}
scheduler = sched.StatefulLQNScheduler(init_data)

In [None]:
# One deck per entry in review_rates.
num_decks = len(init_data['review_rates'])

In [None]:
# Rate of the simulated review process: gaps between reviews are exponential with mean 1 / work_rate.
work_rate = 0.19020740740740741
inter_arrival_times = np.random.exponential(1 / work_rate, num_timesteps_in_sim)
# Review timestamps: cumulative gaps truncated to whole seconds, offset from a fresh
# time.time() reading (not the start_time used for the item arrivals).
timesteps = int(time.time()) + np.cumsum(inter_arrival_times, axis=0).astype(int)

In [None]:
history = []

# Every item starts in deck 1 with a "last seen" timestamp of 0
deck_of_item = dict.fromkeys(all_items, 1)
latest_timestamp_of_item = dict.fromkeys(all_items, 0)

for current_time in timesteps:
    try:
        next_item = scheduler.next_item(current_time=current_time)
    except sched.ExhaustedError:
        # the scheduler has nothing to offer at this timestep
        continue

    # Clocked delay: time elapsed since this item was last reviewed
    delay = current_time - latest_timestamp_of_item[next_item]
    latest_timestamp_of_item[next_item] = current_time

    # Recall succeeds with probability exp(-difficulty * delay / deck)
    deck = deck_of_item[next_item]
    outcome = int(np.random.random() < np.exp(-difficulty_of_item[next_item] * delay / deck))

    # Success moves the item up one deck, failure moves it down one (never below deck 1)
    deck_of_item[next_item] = max(1, deck + (1 if outcome else -1))

    history.append({'item_id' : next_item, 'outcome' : outcome, 'timestamp' : current_time})
    scheduler.update(next_item, outcome, current_time)

In [None]:
# One row per simulated review, with columns item_id, outcome and timestamp.
df = pd.DataFrame(history)

In [None]:
# Fraction of simulated reviews whose outcome was 1.
np.mean(df['outcome'])

In [None]:
def deck_promotion_rates(init_data, history):
    """
    Compute the observed rates at which items move from deck i to deck i+1

    Every item is assumed to start in deck 1. The history is replayed in order:
    an outcome of 1 moves the item up one deck (and counts as a promotion out of
    its current deck if that deck is one of the tracked decks), an outcome of 0
    moves it down one deck, but never below deck 1.

    :param dict init_data: Must contain 'arrival_time_of_item' (its keys are the
        item ids) and 'review_rates' (one entry per deck)
    :param list[dict] history: The logs for a single user, in chronological order;
        each entry has the keys 'item_id', 'outcome' (0 or 1) and 'timestamp'
    :rtype: dict[int, float]
    :return: Maps each deck (1 .. number of decks) to its average promotion rate
        (items per second), computed as promotions / (1 + duration of the history).
        All rates are 0.0 when the history is empty.
    """

    num_decks = len(init_data['review_rates'])
    num_promotions_of_deck = {deck: 0 for deck in range(1, num_decks + 1)}

    # Without any interactions there is no duration to normalize by
    if len(history) == 0:
        return {deck: 0.0 for deck in num_promotions_of_deck}

    deck_of_item = {item: 1 for item in init_data['arrival_time_of_item']}

    for ixn in history:
        item = ixn['item_id']
        outcome = ixn['outcome']
        current_deck = deck_of_item[item]
        if outcome == 1:
            # Items may climb past the last deck; only tracked decks are counted
            if 1 <= current_deck <= num_decks:
                num_promotions_of_deck[current_deck] += 1
            deck_of_item[item] += 1
        elif outcome == 0 and current_deck > 1:
            deck_of_item[item] -= 1

    timestamps = [ixn['timestamp'] for ixn in history]
    duration = max(timestamps) - min(timestamps)

    # The +1 avoids a zero denominator when all interactions share one timestamp;
    # float() keeps this a true division regardless of the division semantics in effect
    denominator = float(1 + duration)
    return {deck: num_promotions / denominator for deck, num_promotions in num_promotions_of_deck.items()}

In [None]:
# Per-deck promotion rates observed in the simulation above.
deck_promotion_rates(init_data, history)

In [None]:
def run_sim(arrival_rate, num_items, difficulty_of_item, review_rates, work_rate, num_timesteps_in_sim, expected_delays=None):
    """
    Simulate one review session against a fresh scheduler and measure its throughput

    :param float arrival_rate: Arrival rate used to sample item arrival times
    :param int num_items: Number of items; item ids are 0 .. num_items - 1
    :param difficulty_of_item: Indexable by item id; the item's difficulty
    :param review_rates: One review rate per deck, passed to the scheduler
    :param float work_rate: Rate of the review process (gaps between reviews are
        exponential with mean 1 / work_rate)
    :param int num_timesteps_in_sim: Number of review timesteps to generate
    :param expected_delays: Optional, indexed by deck - 1. When given, the delay used
        for the recall probability is looked up here instead of being measured
        from the item's previous review time
    :return: The promotion rate out of the last deck, or 0 if no review took place
    """
    item_ids = range(num_items)
    session_start = int(time.time())
    init_data = {
        'arrival_time_of_item' : sample_arrival_times(item_ids, arrival_rate, session_start),
        'review_rates' : review_rates
    }
    last_deck = len(init_data['review_rates'])

    scheduler = sched.StatefulLQNScheduler(init_data)

    review_log = []
    deck_of_item = dict.fromkeys(item_ids, 1)
    last_seen_at = dict.fromkeys(item_ids, 0)

    # Review timestamps: cumulative exponential gaps, truncated to whole seconds
    gaps = np.random.exponential(1 / work_rate, num_timesteps_in_sim)
    review_times = int(time.time()) + np.cumsum(gaps, axis=0).astype(int)

    for now in review_times:
        try:
            item = scheduler.next_item(current_time=now)
        except sched.ExhaustedError:
            # the scheduler has nothing to offer at this timestep
            continue

        deck = deck_of_item[item]

        if expected_delays is not None:
            delay = expected_delays[deck-1]
        else:
            # clocked delay: time since this item's previous review
            delay = now - last_seen_at[item]

        last_seen_at[item] = now

        # Recall succeeds with probability exp(-difficulty * delay / deck)
        recalled = np.random.random() < np.exp(-difficulty_of_item[item] * delay / deck)
        outcome = 1 if recalled else 0

        # Up one deck on success, down one on failure (never below deck 1)
        deck_of_item[item] = max(1, deck + 2 * outcome - 1)

        review_log.append({'item_id' : item, 'outcome' : outcome, 'timestamp' : now})
        scheduler.update(item, outcome, now)

    if not review_log:
        return 0
    return deck_promotion_rates(init_data, review_log)[last_deck]

In [None]:
# Settings shared by the run_sim experiments that follow.
# Number of independent simulations per configuration.
num_sim_repeats = 10
# Replaces the deck count that was derived from init_data['review_rates'] earlier in the notebook.
num_decks = 5
work_rate = 0.19020740740740741
num_timesteps_in_sim = 500

In [None]:
num_items = 50
if using_global_difficulty:
    # every item shares the same difficulty
    difficulty_of_item = np.ones(num_items) * global_item_difficulty
else:
    # per-item difficulties, exponentially distributed with mean global_item_difficulty
    difficulty_of_item = np.random.exponential(global_item_difficulty, num_items)

In [None]:
# Review rate of deck i is proportional to 1 / sqrt(i), for i = 1 .. num_decks ...
review_rates = 1 / np.sqrt(np.arange(1, num_decks + 1, 1))
# ... normalized so that the rates sum to 1.
review_rates /= review_rates.sum()

In [None]:
# Single trial run with an arrival rate of 1 and clocked delays (expected_delays is left at its default).
run_sim(1., num_items, difficulty_of_item, review_rates, work_rate, num_timesteps_in_sim)

In [None]:
def std_err(x):
    """
    Standard error of the mean of x, ignoring NaN entries

    The (population) standard deviation of the non-NaN values is divided by the
    square root of the number of non-NaN values, so that NaNs are excluded from
    both the numerator and the sample count.

    :param x: A sequence of numbers, possibly containing NaN
    :rtype: float
    :return: The standard error, or NaN if x has no non-NaN values
    """
    values = np.asarray(x, dtype=float)
    num_valid = np.count_nonzero(~np.isnan(values))
    if num_valid == 0:
        return np.nan
    return np.nanstd(values) / np.sqrt(num_valid)

Compare simulations with clocked delay to simulations with expected delay

In [None]:
# 19 evenly spaced arrival rates from 0.001 to 0.01 inclusive (step 0.0005).
# linspace states the number of points and the inclusive endpoint explicitly, instead of
# relying on a float-step arange with a nudged upper bound.
arrival_rates = np.linspace(0.001, 0.01, 19)

In [None]:
# from lqn_properties.ipynb
# One row per entry of arrival_rates (same order); each row holds one delay per deck,
# which run_sim looks up as expected_delays[deck - 1].
expected_delays = [[17.453206595871837,24.868379623154308,30.631707724309987,35.53633858640643,39.8054641266041],
[17.696341041174712,25.31490993777629,31.276758298416564,36.37553028823838,40.78283198316578],
[17.947881769452124,25.78067142450394,31.953797432510804,37.26043652786331,41.812311816790256],
[18.208364768951995,26.267143435291946,32.665607540202345,38.19531662890125,42.898426132100084],
[18.478380664451624,26.775982579619015,33.41534176715943,39.18502270643726,44.046263912657935],
[18.758582853512998,27.30905309733018,34.206593538420954,40.235113303058135,45.26157632076017],
[19.049697309083374,27.868464254224122,35.04348345124263,41.35199548098476,46.55089326271093],
[19.35253449865516,28.456616864290755,35.93076904050019,42.54310435191588,47.92166651275759],
[19.66800402593497,29.07626183577427,36.873985131833074,43.817132512491916,49.382446989802],
[19.997132824152438,29.730574800074148,37.879625741295946,45.184326975412034,50.94310644588686],
[20.341088056711417,30.423252620693738,38.95538339101757,46.65687886094735,52.6151176192934],
[20.70120637006701,31.158640242385705,40.11046932202357,48.24944286621728,54.41191239832036],
[21.07903189611681,31.941900530605306,41.356050198302995,49.97984197003195,56.34934564567404],
[21.47636659403448,32.779246546119,42.70585675343706,51.87004256720628,58.44630453912185],
[21.89533840561766,33.67826630244186,44.17704656829668,53.94752082746742,60.72551543490739],
[22.33849597849283,34.64839565604334,45.79151672701947,56.2473354003538,63.21468259629606],
[22.80894779957474,35.70161481687762,47.577787506248264,58.815000811359255,65.94796098381707],
[23.31056095074395,36.85353870643108,49.57414388140877,61.71127038604331,68.96819946875287],
[23.84828846657145,38.12520137131827,51.83374774747118,65.01964193568408,72.33010917973898]]

In [None]:
# The two sequences are zipped together below, so they must have the same length.
assert len(expected_delays) == len(arrival_rates)

In [None]:
# Clocked-delay simulations: one row per arrival rate, one entry per repeated run.
# The review rate passed to run_sim is what is left of work_rate after subtracting the arrival rate.
# range (rather than the Python-2-only xrange) iterates identically and also runs on Python 3.
ys = [
    [
        run_sim(rate, num_items, difficulty_of_item, review_rates, work_rate - rate, num_timesteps_in_sim)
        for _ in range(num_sim_repeats)
    ]
    for rate in arrival_rates
]

In [None]:
# Expected-delay simulations: same layout as the clocked-delay results, but each arrival rate
# is paired with its row of per-deck expected delays.
# range (rather than the Python-2-only xrange) iterates identically and also runs on Python 3.
exp_ys = [
    [
        run_sim(rate, num_items, difficulty_of_item, review_rates, work_rate - rate, num_timesteps_in_sim,
                expected_delays=delays_for_rate)
        for _ in range(num_sim_repeats)
    ]
    for rate, delays_for_rate in zip(arrival_rates, expected_delays)
]

In [None]:
# Per-arrival-rate mean and standard error across the repeated runs,
# for the clocked-delay (ys) and expected-delay (exp_ys) simulations.
mean_ys = [np.mean(y) for y in ys]
std_err_ys = [std_err(y) for y in ys]
mean_exp_ys = [np.mean(y) for y in exp_ys]
std_err_exp_ys = [std_err(y) for y in exp_ys]

In [None]:
# The theoretical steady-state curve is the identity line (throughput == arrival rate)
identity_line = np.arange(arrival_rates[0], arrival_rates[-1], 0.0001)

plt.errorbar(arrival_rates, mean_exp_ys, yerr=std_err_exp_ys, label='Simulated (Expected Delay)')
plt.errorbar(arrival_rates, mean_ys, yerr=std_err_ys, label='Simulated (Clocked Delay)')
plt.plot(identity_line, identity_line, '--', label='Theoretical Steady-State Behavior')

plt.xlabel(r'Arrival Rate $\lambda_{ext}$ (Items Per Second)')
plt.ylabel(r'Throughput $\lambda_n$ (Items Per Second)')
plt.legend(loc='best')

# NOTE(review): assumes the figures/lqn directory already exists - confirm
plt.savefig(os.path.join('figures', 'lqn', 'clocked-vs-expected-delays.pdf'))
plt.show()

In [None]:
# Persist the raw simulation results behind the figure above.
# NOTE(review): assumes the results directory already exists - confirm.
with open(os.path.join('results', 'clocked-vs-expected-delays.pkl'), 'wb') as f:
    pickle.dump((arrival_rates, ys, exp_ys), f, pickle.HIGHEST_PROTOCOL)

Compare theoretical phase transition threshold to simulations

In [None]:
# Finer and wider sweep of arrival rates, from 0.001 up to about 0.15 in steps of 0.0001.
# NOTE(review): with a float step, whether a value close to 0.15 is included depends on
# floating-point rounding - check len(arrival_rates) if the exact count matters.
arrival_rates = np.arange(0.001, 0.15, 0.0001)

In [None]:
# Arrival rate at which the phase transition is predicted; drawn as a vertical line in the plots below.
theoretical_phase_transition_threshold = 0.013526062011718753 # from lqn_properties.ipynb

In [None]:
# Clocked-delay simulations over the wider sweep: one row per arrival rate, one entry per repeated run.
# range (rather than the Python-2-only xrange) iterates identically and also runs on Python 3.
ys = [
    [
        run_sim(rate, num_items, difficulty_of_item, review_rates, work_rate - rate, num_timesteps_in_sim)
        for _ in range(num_sim_repeats)
    ]
    for rate in arrival_rates
]

In [None]:
# Mean throughput and its standard error across the repeated runs, per arrival rate
mean_throughputs = [np.mean(y) for y in ys]
throughput_std_errs = [std_err(y) for y in ys]

plt.errorbar(arrival_rates, mean_throughputs, yerr=throughput_std_errs, label='Simulated (Clocked Delay)')
plt.axvline(x=theoretical_phase_transition_threshold, label=r'Phase Transition Threshold (Theoretical)', linestyle='--')

plt.xlabel(r'Arrival Rate $\lambda_{ext}$ (Items Per Second)')
plt.ylabel(r'Throughput $\lambda_n$ (Items Per Second)')
plt.legend(loc='best')

# NOTE(review): assumes the figures/lqn directory already exists - confirm
plt.savefig(os.path.join('figures', 'lqn', 'theoretical-vs-simulated-phase-transition.pdf'))
plt.show()

In [None]:
# Persist the raw simulation results and the threshold behind the figure above.
# NOTE(review): assumes the results directory already exists - confirm.
with open(os.path.join('results', 'theoretical-vs-simulated-phase-transition.pkl'), 'wb') as f:
    pickle.dump((arrival_rates, ys, theoretical_phase_transition_threshold), f, pickle.HIGHEST_PROTOCOL)

Compare simulations of different lengths (i.e., transient vs. steady-state behavior)

In [None]:
# Same arrival-rate sweep as in the phase-transition experiment.
arrival_rates = np.arange(0.001, 0.15, 0.0001)
# Session lengths to compare, each used as the number of review timesteps in run_sim.
sim_lengths = [500, 1000, 5000, 10000]

In [None]:
num_items = 500
if using_global_difficulty:
    # every item shares the same difficulty
    difficulty_of_item = np.ones(num_items) * global_item_difficulty
else:
    # per-item difficulties, exponentially distributed with mean global_item_difficulty
    difficulty_of_item = np.random.exponential(global_item_difficulty, num_items)

In [None]:
# Results indexed as ys[session length][arrival rate][repeat].
# range (rather than the Python-2-only xrange) iterates identically and also runs on Python 3.
ys = [
    [
        [
            run_sim(rate, num_items, difficulty_of_item, review_rates, work_rate - rate, sim_length)
            for _ in range(num_sim_repeats)
        ]
        for rate in arrival_rates
    ]
    for sim_length in sim_lengths
]

In [None]:
# One error-bar curve per simulated session length
for session_length, runs_per_rate in zip(sim_lengths, ys):
    curve_means = [np.mean(runs) for runs in runs_per_rate]
    curve_errs = [std_err(runs) for runs in runs_per_rate]
    curve_label = 'Simulated Session Length = %d Reviews' % session_length
    plt.errorbar(arrival_rates, curve_means, yerr=curve_errs, label=curve_label)

plt.axvline(x=theoretical_phase_transition_threshold, label=r'Phase Transition Threshold (Theoretical)', linestyle='--')

plt.xlabel(r'Arrival Rate $\lambda_{ext}$ (Items Per Second)')
plt.ylabel(r'Throughput $\lambda_n$ (Items Per Second)')
plt.legend(loc='best')

# NOTE(review): assumes the figures/lqn directory already exists - confirm
plt.savefig(os.path.join('figures', 'lqn', 'throughput-vs-arrival-rate-vs-simulated-session-length.pdf'))
plt.show()

In [None]:
# Persist the raw simulation results behind the figure above.
# NOTE(review): assumes the results directory already exists - confirm.
with open(os.path.join('results', 'throughput-vs-arrival-rate-vs-simulated-session-length.pkl'), 'wb') as f:
    pickle.dump((arrival_rates, ys, sim_lengths), f, pickle.HIGHEST_PROTOCOL)