# Filtering data
We want to emphasize the non-stationary character of the data. To do this, we filter out articles that exist for less than 18 hours, because their popularity tends to stay roughly constant.

In [1]:
import os
# Move the working directory two levels up so that the relative paths used in the
# following cells (e.g. 'dataset/r6b/...') are resolved from there.
# NOTE(review): the path is relative to the *current* directory, so re-running this
# cell moves up another two levels. Run it exactly once per kernel session.
os.chdir('../..')
# Show the resulting working directory as the cell output.
os.getcwd()

'/Users/sbokupripeku/git/work/examples/costly_nonstationary_bandits'

In [2]:
%load_ext autoreload
%autoreload 2

from datetime import timedelta
import time
import numpy as np
import pickle

import dataset
from plotting.average_ctr import get_average_ctr_plot

### Let's find the short-lived articles
We do this by checking already gathered average ctr data.
The timestamps were averaged over 4000 points, so it's an approximation. But usually there are many more than 4000 timestamps in one hour.

In [3]:
# Load the previously gathered click-through-rate data. The next cell iterates it
# as a mapping of article_id -> (ctrs, timestamps).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('dataset/r6b/total_click_through_rates.pickle', mode='rb') as ctr_file:
    article_ctrs = pickle.load(ctr_file)

In [4]:
# For every article, compute its "life": the difference between its last and its
# first recorded timestamp. Articles without any timestamps are collected
# separately in `zero_length` and get no lifetime entry.
article_lives = []
zero_length = []
for article_id, (_, timestamps) in article_ctrs.items():
    if timestamps:
        article_lives.append((article_id, timestamps[-1] - timestamps[0]))
    else:
        zero_length.append(article_id)

In [6]:
# Order the (article_id, life) pairs by ascending lifetime and display the ten
# longest-lived articles as the cell output.
sorted_lives = sorted(article_lives, key=lambda entry: entry[1])
sorted_lives[-10:]

[('id-583860', 126701.17425012589),
 ('id-569364', 128819.20175004005),
 ('id-583812', 133423.28824996948),
 ('id-575000', 136402.3440001011),
 ('id-555317', 139422.9059998989),
 ('id-579677', 144995.2582499981),
 ('id-607161', 150649.08350014687),
 ('id-605518', 175181.74850010872),
 ('id-605378', 177596.73150014877),
 ('id-600430', 179014.5910000801)]

In [7]:
# Lifetime threshold: 18 hours expressed in seconds
# (assumes the timestamps are in seconds - TODO confirm).
min_life_seconds = 18 * 3600
# Ids of all articles whose lifetime is strictly below the threshold.
short_lived_ids = [entry[0] for entry in sorted_lives if entry[1] < min_life_seconds]

In [8]:
# Ids of the articles to EXCLUDE from the data (currently only the short-lived ones).
# Kept as a list because it is pickled and passed to the dataset loader in later cells.
filtered_ids = list(short_lived_ids)
# Use a set for the membership test: `k not in <list>` is a linear scan per article,
# which makes the comprehension below quadratic for no benefit.
excluded_ids = set(filtered_ids)
# CTR data of the articles that survive the filter.
filtered_ctrs = {k: v for k, v in article_ctrs.items() if k not in excluded_ids}

In [9]:
import plotly.io as pio
# Select plotly's 'browser' renderer as the default for figures shown from here on.
pio.renderers.default = 'browser'
# Plot the CTR data of the articles that were kept after filtering; the value
# returned by the call is the cell output.
get_average_ctr_plot(filtered_ctrs)

In [10]:
# Number of article ids that are excluded by the lifetime filter.
len(filtered_ids)

284

In [11]:
# Persist the list of excluded article ids so it can be reused without
# recomputing the lifetimes.
with open('dataset/r6b/filtered_article_ids.pickle', mode='wb') as ids_file:
    pickle.dump(filtered_ids, ids_file)

## Saving subsets of data
Here we save subsets of data for later use.
Small data for quick testing, medium and big to verify results on a larger timescale.

In [15]:
%%time
small_data_06 = dataset.Dataset()
small_data_06.fill_yahoo_events_second_version_r6b(
    filenames=["dataset/r6b/ydata-fp-td-clicks-v2_0.20111006"],
    filtered_ids=filtered_ids
)

1171318 events with 58 articles, from files  ['dataset/r6b/ydata-fp-td-clicks-v2_0.20111006']
CPU times: user 4min 37s, sys: 1.98 s, total: 4min 39s
Wall time: 14min 48s


In [16]:
# Number of events held by the subset loaded from the ...20111006 file.
len(small_data_06.events)

1171318

In [17]:
# Number of articles held by the subset loaded from the ...20111006 file.
len(small_data_06.articles)

58

In [20]:
%%time
import gc
with open("dataset/r6b/subsample/data_06.pickle", "wb") as f:
    gc.disable()
    pickle.dump(small_data_06, f, protocol=-1)
    gc.enable()
del small_data_06

CPU times: user 4.79 s, sys: 817 ms, total: 5.61 s
Wall time: 6.22 s


In [12]:
%%time
medium_data_06_07 = dataset.Dataset()
medium_data_06_07.fill_yahoo_events_second_version_r6b(
    filenames=[
        "dataset/r6b/ydata-fp-td-clicks-v2_0.20111006",
        "dataset/r6b/ydata-fp-td-clicks-v2_0.20111007",
    ],
    filtered_ids=filtered_ids
)

2834841 events with 98 articles, from files  ['dataset/r6b/ydata-fp-td-clicks-v2_0.20111006', 'dataset/r6b/ydata-fp-td-clicks-v2_0.20111007']
CPU times: user 12min 33s, sys: 5.89 s, total: 12min 39s
Wall time: 12min 44s


In [13]:
# Number of events held by the subset loaded from the ...20111006/07 files.
len(medium_data_06_07.events)

2834841

In [14]:
# Number of articles held by the subset loaded from the ...20111006/07 files.
len(medium_data_06_07.articles)

98

In [15]:
%%time
import gc
with open("dataset/r6b/subsample/data_06_07.pickle", "wb") as f:
    gc.disable()
    pickle.dump(medium_data_06_07, f, protocol=-1)
    gc.enable()
del medium_data_06_07

CPU times: user 13.3 s, sys: 5.13 s, total: 18.5 s
Wall time: 21.4 s


In [16]:
%%time
small_data_10 = dataset.Dataset()
small_data_10.fill_yahoo_events_second_version_r6b(
    filenames=[
        "dataset/r6b/ydata-fp-td-clicks-v2_0.20111010",
    ],
    filtered_ids=filtered_ids
)

1751665 events with 50 articles, from files  ['dataset/r6b/ydata-fp-td-clicks-v2_0.20111010']
CPU times: user 5min 15s, sys: 1.85 s, total: 5min 17s
Wall time: 5min 19s


In [17]:
# Number of events held by the subset loaded from the ...20111010 file.
len(small_data_10.events)

1751665

In [18]:
# Number of articles held by the subset loaded from the ...20111010 file.
len(small_data_10.articles)

50

In [19]:
%%time
import gc
with open("dataset/r6b/subsample/data_10.pickle", "wb") as f:
    gc.disable()
    pickle.dump(small_data_10, f, protocol=-1)
    gc.enable()
del small_data_10

CPU times: user 7.94 s, sys: 1.96 s, total: 9.91 s
Wall time: 11 s


In [20]:
%%time
medium_data_10_11 = dataset.Dataset()
medium_data_10_11.fill_yahoo_events_second_version_r6b(
    filenames=[
        "dataset/r6b/ydata-fp-td-clicks-v2_0.20111010",
        "dataset/r6b/ydata-fp-td-clicks-v2_0.20111011",
    ],
    filtered_ids=filtered_ids
)

3033740 events with 79 articles, from files  ['dataset/r6b/ydata-fp-td-clicks-v2_0.20111010', 'dataset/r6b/ydata-fp-td-clicks-v2_0.20111011']
CPU times: user 10min 21s, sys: 6.11 s, total: 10min 28s
Wall time: 10min 33s


In [21]:
# Number of events held by the subset loaded from the ...20111010/11 files.
len(medium_data_10_11.events)

3033740

In [22]:
# Number of articles held by the subset loaded from the ...20111010/11 files.
len(medium_data_10_11.articles)

79

In [None]:
%%time
import gc
with open("dataset/r6b/subsample/data_10_11.pickle", "wb") as f:
    gc.disable()
    pickle.dump(medium_data_10_11, f, protocol=-1)
    gc.enable()
del medium_data_10_11

CPU times: user 14 s, sys: 5.96 s, total: 20 s
Wall time: 25.9 s
