In [1]:
import pandas as pd
import numpy as np
import sqlite3
import os.path
from sqlalchemy import create_engine
from matplotlib import pyplot as plt
pd.set_option("display.max_rows",15)
%matplotlib inline

In [2]:
conn = create_engine('sqlite:///../dataset/ijcai_cff_2017.db')

In [3]:
train_data_raw = pd.read_sql_query("select shop_id, \
(total_view + total_pay) as total_today, date(ts) as ts from train_data where ts > '2016-07-01'", 
                               con = conn)
train_data_raw

Unnamed: 0,shop_id,total_today,ts
0,1,268.0,2016-07-01
1,1,218.0,2016-07-02
2,1,224.0,2016-07-03
3,1,260.0,2016-07-04
4,1,267.0,2016-07-05
5,1,243.0,2016-07-06
6,1,272.0,2016-07-07
...,...,...,...
153911,2000,195.0,2016-10-25
153912,2000,197.0,2016-10-26


In [4]:
train_data = pd.read_sql_query("select shop_id, \
(total_view + total_pay) as input_foot_traffic from train_data where ts > '2016-06-01'", 
                               con = conn)

time_split_period = 15
time_periods_total = int(train_data.shape[0] / time_split_period) + 1
time_day_arr = np.tile(np.arange(0,15), time_periods_total)
time_period_arr = [np.full(shape=time_split_period, fill_value=i ) for i in range(time_periods_total)]
time_period_arr = np.concatenate(time_period_arr)
print (time_period_arr[0])

train_data["day"] = time_day_arr[0:train_data.shape[0]]
train_data["period"] = time_period_arr[0:train_data.shape[0]]
train_data['output_foot_traffic'] = train_data.input_foot_traffic.shift(-1)

train_data



0.0




Unnamed: 0,shop_id,input_foot_traffic,day,period,output_foot_traffic
0,1,,0,0.0,
1,1,,1,0.0,
2,1,,2,0.0,
3,1,,3,0.0,
4,1,,4,0.0,
5,1,,5,0.0,
6,1,,6,0.0,
...,...,...,...,...,...
187300,2000,195.0,10,12486.0,197.0
187301,2000,197.0,11,12486.0,148.0


In [5]:
train_data_p = train_data.pivot_table(index = ['shop_id', 'period'],
                       columns = 'day',
                       values = ['input_foot_traffic','output_foot_traffic']
                      )

In [6]:
train_data_p

Unnamed: 0_level_0,Unnamed: 1_level_0,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,...,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic
Unnamed: 0_level_1,day,0,1,2,3,4,5,6,7,8,9,...,5,6,7,8,9,10,11,12,13,14
shop_id,period,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1,0.0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,,,,,,,178.0,233.0,250.0,234.0,...,178.0,233.0,250.0,234.0,230.0,237.0,191.0,254.0,261.0,268.0
1,2.0,268.0,218.0,224.0,260.0,267.0,243.0,272.0,320.0,262.0,275.0,...,272.0,320.0,262.0,275.0,297.0,243.0,220.0,237.0,265.0,288.0
1,3.0,288.0,233.0,259.0,266.0,248.0,262.0,,270.0,256.0,,...,,270.0,256.0,,225.0,281.0,236.0,290.0,280.0,225.0
1,4.0,225.0,264.0,233.0,424.0,419.0,539.0,267.0,282.0,302.0,343.0,...,267.0,282.0,302.0,343.0,265.0,284.0,273.0,294.0,285.0,257.0
1,5.0,257.0,272.0,267.0,310.0,261.0,253.0,266.0,281.0,272.0,288.0,...,266.0,281.0,272.0,288.0,353.0,322.0,332.0,302.0,296.0,304.0
1,6.0,304.0,320.0,419.0,397.0,393.0,349.0,393.0,364.0,317.0,383.0,...,393.0,364.0,317.0,383.0,364.0,337.0,296.0,312.0,339.0,337.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000,12481.0,124.0,134.0,153.0,102.0,143.0,43.0,39.0,190.0,131.0,185.0,...,39.0,190.0,131.0,185.0,207.0,163.0,80.0,63.0,188.0,216.0
2000,12482.0,216.0,169.0,184.0,161.0,53.0,76.0,177.0,195.0,187.0,168.0,...,177.0,195.0,187.0,168.0,171.0,98.0,130.0,259.0,212.0,210.0


In [7]:
train = train_data_p.dropna(how='any').reset_index().drop('period', axis=1)

  if __name__ == '__main__':


In [8]:
train.to_pickle("cff_15dayperiod.pkl")
train

Unnamed: 0_level_0,shop_id,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,input_foot_traffic,...,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic,output_foot_traffic
day,Unnamed: 1_level_1,0,1,2,3,4,5,6,7,8,...,5,6,7,8,9,10,11,12,13,14
0,1,268.0,218.0,224.0,260.0,267.0,243.0,272.0,320.0,262.0,...,272.0,320.0,262.0,275.0,297.0,243.0,220.0,237.0,265.0,288.0
1,1,225.0,264.0,233.0,424.0,419.0,539.0,267.0,282.0,302.0,...,267.0,282.0,302.0,343.0,265.0,284.0,273.0,294.0,285.0,257.0
2,1,257.0,272.0,267.0,310.0,261.0,253.0,266.0,281.0,272.0,...,266.0,281.0,272.0,288.0,353.0,322.0,332.0,302.0,296.0,304.0
3,1,304.0,320.0,419.0,397.0,393.0,349.0,393.0,364.0,317.0,...,393.0,364.0,317.0,383.0,364.0,337.0,296.0,312.0,339.0,337.0
4,1,337.0,238.0,352.0,339.0,356.0,230.0,254.0,226.0,242.0,...,254.0,226.0,242.0,272.0,297.0,258.0,250.0,254.0,194.0,206.0
5,1,206.0,336.0,349.0,255.0,327.0,312.0,291.0,275.0,294.0,...,291.0,275.0,294.0,283.0,291.0,270.0,299.0,281.0,254.0,279.0
6,1,279.0,294.0,284.0,232.0,267.0,206.0,231.0,250.0,229.0,...,231.0,250.0,229.0,268.0,235.0,276.0,195.0,250.0,278.0,283.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5460,1999,363.0,354.0,325.0,384.0,415.0,358.0,329.0,352.0,298.0,...,329.0,352.0,298.0,333.0,332.0,380.0,532.0,565.0,330.0,301.0
5461,2000,43.0,59.0,170.0,111.0,133.0,144.0,136.0,74.0,52.0,...,136.0,74.0,52.0,179.0,141.0,150.0,158.0,139.0,43.0,48.0
