# Filtering data
We want to emphasize the non-stationary character of the data. To do this we filter out articles that exist for less than 18 hours, because they tend to have constant popularity.

In [2]:
import os

# Later cells use paths relative to the project root (e.g. 'dataset/R6/...'),
# which is presumably one level above this notebook's folder.
# Only move up when the 'dataset' directory is not already visible, so that
# re-running this cell does not climb one directory higher every time.
if not os.path.isdir('dataset'):
    os.chdir('..')
os.getcwd()

'/Users/sbokupripeku/git/work/examples/costly_nonstationary_bandits'

In [8]:
%load_ext autoreload
%autoreload 2

from datetime import timedelta
import time
import numpy as np
import pickle

import dataset
from plotting.average_ctr import get_average_ctr_plot

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Let's find the short lived articles
We do this by checking already gathered average ctr data.
The timestamps were averaged over 4000 points, so it's an approximation. However, there are usually many more than 4000 timestamps in one hour, so the error is small compared to the 18-hour threshold.

In [10]:
# Load the previously gathered average CTR data.
# Each value is unpacked below as a (ctrs, timestamps) pair keyed by article id.
with open('total_click_through_rates.pickle', 'rb') as ctr_file:
    article_ctrs = pickle.load(ctr_file)

In [11]:
# Lifetime of an article = last averaged timestamp minus the first one.
# Articles that have no timestamps at all are collected in `zero_length`.
article_lives = []
zero_length = []
for art_id, (_, timestamps) in article_ctrs.items():
    if timestamps:
        article_lives.append((art_id, timestamps[-1] - timestamps[0]))
    else:
        zero_length.append(art_id)

In [12]:
# Order the (article_id, lifetime) pairs from shortest to longest lifetime
# and show the ten shortest.
sorted_lives = sorted(article_lives, key=lambda entry: entry[1])
sorted_lives[:10]

[('109527', 0.0),
 ('109526', 1027.5750000476837),
 ('109781', 4639.049999952316),
 ('109688', 5115.450000047684),
 ('109523', 5183.549999952316),
 ('109593', 5207.924999952316),
 ('109677', 6392.700000047684),
 ('109759', 6874.050000190735),
 ('109695', 7371.225000143051),
 ('109570', 7505.399999856949)]

In [13]:
# Articles that lived for less than 18 hours (threshold expressed in seconds).
min_life_seconds = 18 * 3600
short_lived_ids = [entry[0] for entry in sorted_lives if entry[1] < min_life_seconds]

In [16]:
# Article ids to exclude: articles without any timestamps plus the short-lived ones.
# '109505' was marked by hand as zero-length; it is kept explicitly, and the
# `zero_length` list computed earlier is merged in so that any other article
# without timestamps is excluded as well instead of relying on the hard-coded id.
# dict.fromkeys drops duplicates while preserving the order of first appearance.
filtered_ids = list(dict.fromkeys(['109505', *zero_length, *short_lived_ids]))

# Use a set for the membership test; `filtered_ids` itself stays a list because
# it is pickled and passed on to the dataset loader further down.
excluded_ids = set(filtered_ids)
filtered_ctrs = {k: v for k, v in article_ctrs.items() if k not in excluded_ids}

In [18]:
# Send plotly figures to the 'browser' renderer, then plot the average CTR of
# the articles that survived the filtering.
import plotly.io as pio
pio.renderers.default = 'browser'
ctr_figure = get_average_ctr_plot(filtered_ctrs)
ctr_figure

In [77]:
# Let's take May 4 as the small data set, May 4-5 as medium, and May 4-6 as large.

In [81]:
# Persist the list of excluded article ids to disk.
with open('filtered_article_ids.pickle', 'wb') as ids_file:
    pickle.dump(filtered_ids, ids_file)

## Saving subsets of data
Here we save subsets of data for later use.
Small data for quick testing, medium and big to verify results on a larger timescale.

In [19]:
%%time
# Small subset: events from the 2009-05-04 file only, with the filtered
# article ids passed to the loader.
small_files = ["dataset/R6/ydata-fp-td-clicks-v1_0.20090504"]
small_data = dataset.Dataset()
small_data.fill_yahoo_events(filenames=small_files, filtered_ids=filtered_ids)

3533535 events with 27 articles, from files  ['dataset/R6/ydata-fp-td-clicks-v1_0.20090504']
CPU times: user 4min 51s, sys: 16.1 s, total: 5min 7s
Wall time: 5min 22s


In [20]:
%%time
# Store the small subset, then drop the in-memory copy to free kernel memory.
with open("dataset/R6/subsample/data_04.pickle", "wb") as small_file:
    pickle.dump(small_data, small_file)
del small_data

CPU times: user 8.67 s, sys: 6.12 s, total: 14.8 s
Wall time: 23.3 s


In [21]:
%%time
# Medium subset: events from the 2009-05-04 and 2009-05-05 files, with the
# filtered article ids passed to the loader.
medium_files = [
    "dataset/R6/ydata-fp-td-clicks-v1_0.20090504",
    "dataset/R6/ydata-fp-td-clicks-v1_0.20090505",
]
medium_data = dataset.Dataset()
medium_data.fill_yahoo_events(filenames=medium_files, filtered_ids=filtered_ids)

6776711 events with 37 articles, from files  ['dataset/R6/ydata-fp-td-clicks-v1_0.20090504', 'dataset/R6/ydata-fp-td-clicks-v1_0.20090505']
CPU times: user 9min 38s, sys: 35.1 s, total: 10min 13s
Wall time: 11min 8s


In [22]:
%%time
# Store the medium subset, then drop the in-memory copy to free kernel memory.
with open("dataset/R6/subsample/data_04_05.pickle", "wb") as medium_file:
    pickle.dump(medium_data, medium_file)
del medium_data

CPU times: user 14.2 s, sys: 14.8 s, total: 29 s
Wall time: 1min 8s


In [23]:
%%time
# Large subset: events from the 2009-05-04, 2009-05-05 and 2009-05-06 files,
# with the filtered article ids passed to the loader.
big_files = [
    "dataset/R6/ydata-fp-td-clicks-v1_0.20090504",
    "dataset/R6/ydata-fp-td-clicks-v1_0.20090505",
    "dataset/R6/ydata-fp-td-clicks-v1_0.20090506",
]
big_data = dataset.Dataset()
big_data.fill_yahoo_events(filenames=big_files, filtered_ids=filtered_ids)

10036576 events with 48 articles, from files  ['dataset/R6/ydata-fp-td-clicks-v1_0.20090504', 'dataset/R6/ydata-fp-td-clicks-v1_0.20090505', 'dataset/R6/ydata-fp-td-clicks-v1_0.20090506']
CPU times: user 14min 33s, sys: 55 s, total: 15min 28s
Wall time: 16min 21s


In [24]:
%%time
# Store the large subset, then drop the in-memory copy to free kernel memory.
with open("dataset/R6/subsample/data_04_05_06.pickle", "wb") as big_file:
    pickle.dump(big_data, big_file)
del big_data

CPU times: user 21.1 s, sys: 17.8 s, total: 38.8 s
Wall time: 54.7 s
