Version III is mainly to change the cross-sectional averages to be market-wise, rather than global averages.
We also need to avoid the bleeding of the Weight and the response between the train and validation set.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import metrics, discriminant_analysis, preprocessing, linear_model, feature_selection, model_selection
from sklearn import decomposition, tree, utils, ensemble, neural_network, cluster
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import time
import random
import pickle

In [2]:
# Set seed for reproducibility
np.random.seed(0)
# Imports data
trainval = pd.read_csv('train.csv', index_col = 0)
test = pd.read_csv('test.csv', index_col = 0) 
trainvaltest = trainval.append(test)
#trainvaltest = trainval.drop(['y', 'Weight'], axis = 1).append(test)
#trainvaltest.sort_values(['Market', 'Day', 'Stock'], inplace = True)

## Time-series features

Time-series features we want to build:
- 3 day, 10 day and 20 day average.
- Temporal and cross-sectional zscore.
- Difference of x4
- Beta of the stocks computed on the training set
- Days of the week
- Moving average of the response, NOT including the current's day response, for sufficiently large windows Fill with the median the missing values.

In [21]:
dat_time_ser = trainvaltest.set_index(['Day','Stock']).unstack()
dat_time_ser.head()

Unnamed: 0_level_0,Market,Market,Market,Market,Market,Market,Market,Market,Market,Market,...,y,y,y,y,y,y,y,y,y,y
Stock,0,1,2,3,4,5,6,7,8,9,...,3013,3014,3015,3016,3017,3018,3019,3020,3021,3022
Day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,0.000122,-0.000205,7.987916e-06,,-8.5e-05,,2.6e-05,-8e-05,0.000608,-0.000262
2,2.0,,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,-0.000191,6.9e-05,-4.257532e-05,,0.000153,,1.8e-05,-0.001555,0.000595,0.000294
5,,,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,-0.000236,0.000302,1.908546e-07,,0.000218,,-4.4e-05,-0.000758,-0.00103,-0.000219
6,2.0,,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,4.2e-05,0.000385,6.049778e-05,,9e-06,,2e-05,-0.000676,0.000131,0.000192
7,2.0,,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,0.000136,0.000168,8.073224e-05,,-0.000189,,-2.8e-05,-0.000956,-7e-05,-0.000305


In [22]:
# Extracts the market/stock information
dat_time_ser.Market.fillna(method = 'bfill', inplace = True)
dat_time_ser.Market.fillna(method = 'ffill', inplace = True)
print dat_time_ser.Market.isnull().any().any()
market_by_stock = dat_time_ser.Market.iloc[0,:].astype('int32')
# Drops the non-time series features
#dat_time_ser.drop('Market', axis = 1, inplace = True)
dat_time_ser.head()

False


Unnamed: 0_level_0,Market,Market,Market,Market,Market,Market,Market,Market,Market,Market,...,y,y,y,y,y,y,y,y,y,y
Stock,0,1,2,3,4,5,6,7,8,9,...,3013,3014,3015,3016,3017,3018,3019,3020,3021,3022
Day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,2.0,2.0,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,0.000122,-0.000205,7.987916e-06,,-8.5e-05,,2.6e-05,-8e-05,0.000608,-0.000262
2,2.0,2.0,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,-0.000191,6.9e-05,-4.257532e-05,,0.000153,,1.8e-05,-0.001555,0.000595,0.000294
5,2.0,2.0,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,-0.000236,0.000302,1.908546e-07,,0.000218,,-4.4e-05,-0.000758,-0.00103,-0.000219
6,2.0,2.0,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,4.2e-05,0.000385,6.049778e-05,,9e-06,,2e-05,-0.000676,0.000131,0.000192
7,2.0,2.0,2.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,...,0.000136,0.000168,8.073224e-05,,-0.000189,,-2.8e-05,-0.000956,-7e-05,-0.000305


In [23]:
dat_time_ser = dat_time_ser.stack(level = 1)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Market,Weight,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,x6,y
Day,Stock,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,2.0,,,,,,,,,,,,,
1,1,2.0,,,,,,,,,,,,,
1,2,2.0,94.206094,0.007902,1.077168,0.431896,4.542679e-08,1.44554e-07,2.94471e-07,5.886592e-07,1e-06,0.000213,1.881395e-08,295.844177,-8.4e-05
1,3,4.0,10.851268,0.13783,1.35157,0.844386,3.574793e-07,1.327104e-06,3.013744e-06,5.162016e-06,1.1e-05,0.000945,2.026576e-07,61.315306,-5.6e-05
1,4,3.0,6.587559,0.001531,0.658089,0.223429,2.982251e-07,8.578938e-07,1.556813e-06,2.639667e-06,5e-06,0.001272,1.35375e-07,121.818091,-2.9e-05


In [24]:
# We want to impute x4 using backward and forward filling, due to its integrated character. 
filled_forward = dat_time_ser.loc[:,'x4'].fillna(method = 'ffill')
filled_backward = dat_time_ser.loc[:,'x4'].fillna(method = 'bfill')
dat_time_ser.loc[:,'x4'] = 0.5*(filled_forward + filled_backward)
dat_time_ser.loc[:,'x4'].fillna(method = 'bfill', inplace = True)
dat_time_ser.loc[:,'x4'].fillna(method = 'ffill', inplace = True)
dat_time_ser.x4.head()

Day  Stock
1    0        0.000213
     1        0.000213
     2        0.000213
     3        0.000945
     4        0.001272
Name: x4, dtype: float64

In [25]:
dat_time_ser = dat_time_ser.groupby('Market').apply(lambda x: x.unstack(level = 0))
dat_time_ser = dat_time_ser.drop('Market', axis = 1)
#dat_time_ser.set_index('Market', append=True, inplace = True)
#dat_time_ser = dat_time_ser.unstack(level = 0)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,...,y,y,y,y,y,y,y,y,y,y
Unnamed: 0_level_1,Day,1,2,5,6,7,8,9,12,13,14,...,715,716,719,720,721,722,726,727,728,729
Market,Stock,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1.0,10,37.10981,41.34865,47.34935,50.181863,38.358476,38.748158,36.672643,52.161345,,,...,,7e-05,0.000142,6.4e-05,,,,,8.4e-05,0.0001
1.0,12,15.01485,13.158087,20.065627,17.494523,19.548395,23.542634,18.610592,19.535299,,,...,,6.2e-05,8.7e-05,0.000123,,,,,6.1e-05,9.7e-05
1.0,15,5.296184,5.131705,4.987554,11.013197,11.236866,8.254293,6.278038,6.255788,,,...,,0.000497,0.004777,0.007505,,,,,-0.000945,-0.000243
1.0,20,10.055895,11.822827,12.596766,13.202567,12.350436,16.853482,11.293013,14.809028,,,...,,1.2e-05,5.9e-05,0.000232,,,,,3.6e-05,0.00019
1.0,22,1.375876,0.589242,0.804319,0.308438,0.601397,1.375876,,1.375876,,,...,,,,,,,,,,


In [26]:
# Stores the weights and the responses, replaces the nans with the stock medians
store_weight_y = dat_time_ser.loc[:,['Weight', 'y']].reset_index(level = 0)
medians = store_weight_y.median(axis = 1, level = 0)
store_weight_y.Weight.fillna(medians.Weight, axis = 0, inplace = True)
store_weight_y.y.fillna(medians.y, axis = 0, inplace = True)
store_weight_y.head()

Unnamed: 0_level_0,Market,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,...,y,y,y,y,y,y,y,y,y,y
Day,Unnamed: 1_level_1,1,2,5,6,7,8,9,12,13,...,715,716,719,720,721,722,726,727,728,729
Stock,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
10,1.0,37.10981,41.34865,47.34935,50.181863,38.358476,38.748158,36.672643,52.161345,1.37447,...,5.7e-05,7e-05,0.000142,6.4e-05,4e-06,-3.4e-05,0.000105,2.7e-05,8.4e-05,0.0001
12,1.0,15.01485,13.158087,20.065627,17.494523,19.548395,23.542634,18.610592,19.535299,1.37447,...,5.7e-05,6.2e-05,8.7e-05,0.000123,4e-06,-3.4e-05,0.000105,2.7e-05,6.1e-05,9.7e-05
15,1.0,5.296184,5.131705,4.987554,11.013197,11.236866,8.254293,6.278038,6.255788,1.37447,...,5.7e-05,0.000497,0.004777,0.007505,4e-06,-3.4e-05,0.000105,2.7e-05,-0.000945,-0.000243
20,1.0,10.055895,11.822827,12.596766,13.202567,12.350436,16.853482,11.293013,14.809028,1.37447,...,5.7e-05,1.2e-05,5.9e-05,0.000232,4e-06,-3.4e-05,0.000105,2.7e-05,3.6e-05,0.00019
22,1.0,1.375876,0.589242,0.804319,0.308438,0.601397,1.375876,11.574294,1.375876,1.37447,...,5.7e-05,1.3e-05,-6.5e-05,-2.3e-05,4e-06,-3.4e-05,0.000105,2.7e-05,0.000604,1.1e-05


In [27]:
# Resets the index 
store_weight_y = store_weight_y.set_index('Market', append=True)
store_weight_y = store_weight_y.set_index(dat_time_ser.index)
store_weight_y.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,...,y,y,y,y,y,y,y,y,y,y
Unnamed: 0_level_1,Day,1,2,5,6,7,8,9,12,13,14,...,715,716,719,720,721,722,726,727,728,729
Market,Stock,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1.0,10,37.10981,41.34865,47.34935,50.181863,38.358476,38.748158,36.672643,52.161345,1.37447,10.937339,...,5.7e-05,7e-05,0.000142,6.4e-05,4e-06,-3.4e-05,0.000105,2.7e-05,8.4e-05,0.0001
1.0,12,15.01485,13.158087,20.065627,17.494523,19.548395,23.542634,18.610592,19.535299,1.37447,10.937339,...,5.7e-05,6.2e-05,8.7e-05,0.000123,4e-06,-3.4e-05,0.000105,2.7e-05,6.1e-05,9.7e-05
1.0,15,5.296184,5.131705,4.987554,11.013197,11.236866,8.254293,6.278038,6.255788,1.37447,10.937339,...,5.7e-05,0.000497,0.004777,0.007505,4e-06,-3.4e-05,0.000105,2.7e-05,-0.000945,-0.000243
1.0,20,10.055895,11.822827,12.596766,13.202567,12.350436,16.853482,11.293013,14.809028,1.37447,10.937339,...,5.7e-05,1.2e-05,5.9e-05,0.000232,4e-06,-3.4e-05,0.000105,2.7e-05,3.6e-05,0.00019
1.0,22,1.375876,0.589242,0.804319,0.308438,0.601397,1.375876,11.574294,1.375876,1.37447,10.937339,...,5.7e-05,1.3e-05,-6.5e-05,-2.3e-05,4e-06,-3.4e-05,0.000105,2.7e-05,0.000604,1.1e-05


In [28]:
# Drops y and Weight from the time series
dat_time_ser = dat_time_ser.drop(['Weight', 'y'], axis = 1)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,x0,x0,x0,x0,x0,x0,x0,x0,x0,x0,...,x6,x6,x6,x6,x6,x6,x6,x6,x6,x6
Unnamed: 0_level_1,Day,1,2,5,6,7,8,9,12,13,14,...,715,716,719,720,721,722,726,727,728,729
Market,Stock,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1.0,10,0.015405,0.01205,0.013507,0.007336,0.001614,0.004875,0.009397,0.030015,0.018557,0.007122,...,260.406293,287.885556,291.285097,324.152142,383.24805,252.387601,,282.904197,323.967323,221.040778
1.0,12,0.005828,0.00615,0.014414,0.006769,0.014922,0.027731,0.011221,0.010253,0.002973,0.010413,...,212.457377,201.390841,199.81629,199.325062,192.695489,169.458138,,180.358456,195.619229,155.401912
1.0,15,0.000847,0.008434,0.002629,0.006751,0.007008,0.017706,0.003068,0.02519,0.012677,0.002342,...,1263.036548,500.0,500.0,500.0,500.0,500.0,,500.0,500.0,500.0
1.0,20,0.009568,0.008282,0.003071,0.006059,0.028286,0.023358,0.017426,0.036234,0.009732,0.035721,...,214.215736,232.603795,201.383264,181.720845,167.664915,141.283865,,107.536989,206.184317,160.623067
1.0,22,0.060166,0.011263,0.231143,0.199116,0.11493,0.775825,,0.191738,0.177439,0.163638,...,,,,,,,,,,


In [29]:
# Replace the nans and the values smaller than zero by the cross sectional median
medians = dat_time_ser.median(axis = 0, level = 0)
dat_time_ser[dat_time_ser.isnull()] = medians
dat_time_ser[dat_time_ser <= 0] = medians
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,x0,x0,x0,x0,x0,x0,x0,x0,x0,x0,...,x6,x6,x6,x6,x6,x6,x6,x6,x6,x6
Unnamed: 0_level_1,Day,1,2,5,6,7,8,9,12,13,14,...,715,716,719,720,721,722,726,727,728,729
Market,Stock,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
1.0,10,0.015405,0.01205,0.013507,0.007336,0.001614,0.004875,0.009397,0.030015,0.018557,0.007122,...,260.406293,287.885556,291.285097,324.152142,383.24805,252.387601,,282.904197,323.967323,221.040778
1.0,12,0.005828,0.00615,0.014414,0.006769,0.014922,0.027731,0.011221,0.010253,0.002973,0.010413,...,212.457377,201.390841,199.81629,199.325062,192.695489,169.458138,,180.358456,195.619229,155.401912
1.0,15,0.000847,0.008434,0.002629,0.006751,0.007008,0.017706,0.003068,0.02519,0.012677,0.002342,...,1263.036548,500.0,500.0,500.0,500.0,500.0,,500.0,500.0,500.0
1.0,20,0.009568,0.008282,0.003071,0.006059,0.028286,0.023358,0.017426,0.036234,0.009732,0.035721,...,214.215736,232.603795,201.383264,181.720845,167.664915,141.283865,,107.536989,206.184317,160.623067
1.0,22,0.060166,0.011263,0.231143,0.199116,0.11493,0.775825,0.002962,0.191738,0.177439,0.163638,...,166.731568,178.585027,156.708726,146.732123,157.973583,100.0,,125.120379,121.730996,114.045506


In [30]:
# Logs the features
dat_time_ser = dat_time_ser.stack(level = 1).apply(np.log)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,x6
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,5.327721
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,5.447034
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,5.111433
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,5.59616
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,5.614153


In [31]:
"""dat_time_ser = pd.concat([dat_time_ser, store_weight_y.stack()], axis = 1)
dat_time_ser.head()"""

'dat_time_ser = pd.concat([dat_time_ser, store_weight_y.stack()], axis = 1)\ndat_time_ser.head()'

### Difference of x4

In [32]:
dat_time_ser['x4dp'] = dat_time_ser.x4.groupby('Stock').diff().fillna(0.0)
dat_time_ser['x4dm'] = dat_time_ser.x4.groupby('Stock').diff(periods = -1).fillna(0.0)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,x6,x4dp,x4dm
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,5.327721,0.0,0.01961
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,5.447034,-0.01961,0.032847
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,5.111433,-0.032847,-0.046888
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,5.59616,0.046888,-0.028319
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,5.614153,0.028319,-0.199014


### Response and weight moving averages, excluding the current day's response.

In [33]:
store_weight_y_ma = store_weight_y.stack().unstack(level = [0,1])
store_weight_y_ma.head()

Unnamed: 0_level_0,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,...,y,y,y,y,y,y,y,y,y,y
Market,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
Stock,10,12,15,20,22,29,32,34,42,48,...,2925,2927,2937,2946,2949,2970,2976,2978,3010,3021
Day,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
1,37.10981,15.01485,5.296184,10.055895,1.375876,2.604613,2.527293,2.527293,2.508284,8.763009,...,-7.6e-05,-0.000956,-0.00022,0.000294,0.000294,0.000294,0.000449,0.000294,0.000256,0.000608
2,41.34865,13.158087,5.131705,11.822827,0.589242,3.976764,93.774632,93.774632,0.731633,10.362877,...,-0.000284,0.002578,-0.000374,-6.4e-05,-6.4e-05,-6.4e-05,0.000861,-6.4e-05,0.000415,0.000595
5,47.34935,20.065627,4.987554,12.596766,0.804319,2.507356,0.77619,0.77619,1.0337,13.349002,...,-9.5e-05,0.00063,-0.000161,0.001186,0.001186,0.001186,0.006186,0.001186,0.000391,-0.00103
6,50.181863,17.494523,11.013197,13.202567,0.308438,2.937648,54.82146,54.82146,0.99016,20.13938,...,-0.000138,0.001768,-7e-05,1.6e-05,1.6e-05,1.6e-05,0.001114,1.6e-05,2.4e-05,0.000131
7,38.358476,19.548395,11.236866,12.350436,0.601397,4.247283,26.605842,26.605842,0.702241,13.087248,...,-0.000132,0.004117,-0.000233,3.4e-05,3.4e-05,3.4e-05,0.00067,3.4e-05,0.0004,-7e-05


In [35]:
# Logs the weight
store_weight_y_ma.Weight = np.log(store_weight_y_ma.Weight)
store_weight_y_ma.head()

Unnamed: 0_level_0,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight,...,y,y,y,y,y,y,y,y,y,y
Market,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
Stock,10,12,15,20,22,29,32,34,42,48,...,2925,2927,2937,2946,2949,2970,2976,2978,3010,3021
Day,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
1,3.613881,2.70904,1.666986,2.308159,0.319091,0.957284,0.927149,0.927149,0.919599,2.170539,...,-7.6e-05,-0.000956,-0.00022,0.000294,0.000294,0.000294,0.000449,0.000294,0.000256,0.000608
2,3.72204,2.577037,1.635438,2.470032,-0.528919,1.380468,4.540894,4.540894,-0.312477,2.33823,...,-0.000284,0.002578,-0.000374,-6.4e-05,-6.4e-05,-6.4e-05,0.000861,-6.4e-05,0.000415,0.000595
5,3.857553,2.999008,1.606946,2.53344,-0.21776,0.919229,-0.253358,-0.253358,0.033145,2.591442,...,-9.5e-05,0.00063,-0.000161,0.001186,0.001186,0.001186,0.006186,0.001186,0.000391,-0.00103
6,3.915654,2.861888,2.399094,2.580411,-1.176234,1.077609,4.004082,4.004082,-0.009888,3.002677,...,-0.000138,0.001768,-7e-05,1.6e-05,1.6e-05,1.6e-05,0.001114,1.6e-05,2.4e-05,0.000131
7,3.646976,2.972893,2.4192,2.513691,-0.5085,1.446279,3.281131,3.281131,-0.353479,2.571638,...,-0.000132,0.004117,-0.000233,3.4e-05,3.4e-05,3.4e-05,0.00067,3.4e-05,0.0004,-7e-05


In [38]:
# The shift is crucial to avoid leaking the current response and weight in the features
maWy10p = store_weight_y_ma.rolling(window = 10).mean().shift(1)
maWy10p.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
maWy10p = maWy10p.stack(level = [1,2])
maWy10p.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Weight,y
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,10,2.694243,7.6e-05
1,1.0,12,2.480752,6.7e-05
1,1.0,15,1.78767,0.000128
1,1.0,20,2.117277,7.4e-05
1,1.0,22,1.150932,5.5e-05


In [40]:
maWy10m = store_weight_y_ma.loc[::-1, :].rolling(window = 10).mean().shift(1).loc[::-1, :]
maWy10m.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
maWy10m = maWy10m.stack(level = [1,2])
maWy10m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Weight,y
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,10,3.104281,5.2e-05
1,1.0,12,2.515273,-2.1e-05
1,1.0,15,1.852911,1.6e-05
1,1.0,20,2.272869,3.1e-05
1,1.0,22,0.534269,-5.3e-05


In [41]:
maWy20p = store_weight_y_ma.rolling(window = 20).mean().shift(1)
maWy20p.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
maWy20p = maWy20p.stack(level = [1,2])
maWy20p.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Weight,y
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,10,2.691161,7.9e-05
1,1.0,12,2.48069,7.5e-05
1,1.0,15,1.811644,0.000135
1,1.0,20,2.126612,7.6e-05
1,1.0,22,1.133931,6.5e-05


In [42]:
maWy20m = store_weight_y_ma.loc[::-1, :].rolling(window = 20).mean().shift(1).loc[::-1, :]
maWy20m.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
maWy20m = maWy20m.stack(level = [1,2])
maWy20m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Weight,y
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,10,2.935098,9.3e-05
1,1.0,12,2.423287,6.6e-05
1,1.0,15,1.910434,0.000127
1,1.0,20,2.2081,8e-05
1,1.0,22,0.864267,2.2e-05


In [43]:
maWy40p = store_weight_y_ma.rolling(window = 40).mean().shift(1)
maWy40p.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
maWy40p = maWy40p.stack(level = [1,2])
maWy40p.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Weight,y
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,10,2.700884,8.3e-05
1,1.0,12,2.477155,8e-05
1,1.0,15,1.843086,0.000141
1,1.0,20,2.113346,7.3e-05
1,1.0,22,1.19172,3.3e-05


In [44]:
maWy40m = store_weight_y_ma.loc[::-1, :].rolling(window = 40).mean().shift(1).loc[::-1, :]
maWy40m.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
maWy40m = maWy40m.stack(level = [1,2])
maWy40m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Weight,y
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,10,2.746824,0.000295
1,1.0,12,2.322506,0.000293
1,1.0,15,1.862669,0.000335
1,1.0,20,2.048508,0.000288
1,1.0,22,0.633811,0.000332


### Temporal and cross-sectional (robust) z-scores

In [45]:
dat_time_ser_rest = dat_time_ser#.drop(['x3A', 'x3B', 'x3C', 'x3D', 'x4dp', 'x4dm','Weight','y'], axis = 1)
temporal_zscore = np.divide(dat_time_ser_rest.groupby('Stock').quantile(q = 0.5, axis = 0),
                            (dat_time_ser_rest.groupby('Stock').quantile(q = 0.75, axis = 0) - \
                             dat_time_ser_rest.groupby('Stock').quantile(q = 0.25, axis = 0) + .1))
#temporal_zscore.drop(['Weight','y'], axis = 1, inplace = True)
temporal_zscore.head()

0.5,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,x6,x4dp,x4dm
Stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,-3.371737,0.445222,-1.149705,-3.412466,-3.785514,-4.140206,-4.801077,-6.200626,-2.218948,-2.522261,11.106353,0.0,0.0
1,-11.262339,1.006219,-2.26666,-25.330481,-25.530832,-24.859115,-25.92847,-24.120546,-3.139791,-25.986271,26.423019,0.002592,-0.002592
2,-2.703689,1.122333,-0.77832,-32.806612,-29.534019,-27.835014,-26.829462,-24.664048,-29.765259,-22.626759,14.91166,0.00043,-0.00043
3,-1.906456,0.704663,-0.53687,-21.386213,-21.746491,-20.110259,-17.21669,-12.848902,-24.96632,-27.785887,11.973229,0.000774,-0.000774
4,-3.595575,-0.401526,-2.280502,-20.601851,-20.070545,-18.395708,-15.151109,-11.802252,-17.14556,-14.739919,46.051702,0.0,0.0


In [46]:
crosssect_zscore = np.divide(dat_time_ser_rest.groupby(['Day', 'Market']).quantile(q = 0.5, axis = 0),
                            (dat_time_ser_rest.groupby(['Day', 'Market']).quantile(q = 0.75, axis = 0) - \
                             dat_time_ser_rest.groupby(['Day', 'Market']).quantile(q = 0.25, axis = 0) + .1))
#crosssect_zscore.drop(['Weight','y'], axis = 1, inplace = True)
crosssect_zscore.head()

Unnamed: 0_level_0,0.5,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,x6,x4dp,x4dm
Day,Market,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1.0,-2.4441,-1.058218,-1.06326,-8.351639,-8.610257,-8.481755,-8.139073,-8.805337,-4.536242,-7.850662,5.336387,0.0,0.038021
1,2.0,-3.761807,0.484059,-0.435237,-8.465572,-9.801897,-10.192717,-9.921393,-9.867651,-3.954504,-5.794738,5.797247,0.0,-0.026104
1,3.0,-2.049663,0.200831,-0.206597,-8.712979,-8.788469,-8.835433,-8.189879,-7.992245,-5.238446,-7.932177,5.806199,0.0,0.05519
1,4.0,-2.72875,1.07464,-0.48155,-8.920501,-9.188659,-9.111736,-9.088547,-10.957661,-4.132708,-8.007711,5.376737,0.0,0.07666
2,1.0,-3.207247,-1.101103,-0.920506,-8.980738,-8.546057,-8.48979,-8.060831,-7.66163,-4.380699,-7.51879,4.645397,-0.038021,0.0


### Beta of the stocks, assuming y is a return

In [47]:
ret_by_stocks = store_weight_y.y
ret_by_stocks.head()

Unnamed: 0_level_0,Day,1,2,5,6,7,8,9,12,13,14,...,715,716,719,720,721,722,726,727,728,729
Market,Stock,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1.0,10,0.000101,7.5e-05,8e-05,5e-05,6.4e-05,1.569282e-05,6.4e-05,3.5e-05,4e-05,-2.3e-05,...,5.7e-05,7e-05,0.000142,6.4e-05,4e-06,-3.4e-05,0.000105,2.7e-05,8.4e-05,0.0001
1.0,12,-2.8e-05,2.8e-05,-8.6e-05,-4.8e-05,-5.3e-05,-5.005148e-05,-0.000127,-1.2e-05,4e-05,-2.3e-05,...,5.7e-05,6.2e-05,8.7e-05,0.000123,4e-06,-3.4e-05,0.000105,2.7e-05,6.1e-05,9.7e-05
1.0,15,0.000295,0.000419,7.3e-05,-5.4e-05,2.7e-05,-3.676836e-05,-4e-05,-0.000362,4e-05,-2.3e-05,...,5.7e-05,0.000497,0.004777,0.007505,4e-06,-3.4e-05,0.000105,2.7e-05,-0.000945,-0.000243
1.0,20,0.000272,-1.7e-05,4.4e-05,8.5e-05,-6e-06,1.017337e-07,-1.3e-05,7.5e-05,4e-05,-2.3e-05,...,5.7e-05,1.2e-05,5.9e-05,0.000232,4e-06,-3.4e-05,0.000105,2.7e-05,3.6e-05,0.00019
1.0,22,0.002172,0.00034,0.000532,0.002465,-0.002195,0.0001777248,2e-05,-0.00201,4e-05,-2.3e-05,...,5.7e-05,1.3e-05,-6.5e-05,-2.3e-05,4e-06,-3.4e-05,0.000105,2.7e-05,0.000604,1.1e-05


In [48]:
market_ret = ret_by_stocks.mean(axis = 0)
beta = ret_by_stocks.corrwith(market_ret, axis = 1)
beta[beta.isnull()] = beta.median()
beta.head()

Market  Stock
1.0     10       0.984084
        12       0.985276
        15       0.400109
        20       0.962733
        22       0.339055
dtype: float64

### Moving means/medians over 3, 10 and 20 days

In [49]:
dat_time_ser_ma = dat_time_ser.unstack(level = [0,1])
dat_time_ser_ma.head()

Unnamed: 0_level_0,x0,x0,x0,x0,x0,x0,x0,x0,x0,x0,...,x4dm,x4dm,x4dm,x4dm,x4dm,x4dm,x4dm,x4dm,x4dm,x4dm
Market,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
Stock,10,12,15,20,22,29,32,34,42,48,...,2925,2927,2937,2946,2949,2970,2976,2978,3010,3021
Day,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
1,-4.173071,-5.145121,-7.073798,-4.649299,-2.810641,-6.286277,-5.549871,-5.549871,-8.427428,-3.38557,...,0.159364,0.136227,0.335034,0.075087,0.055556,0.098977,0.140901,0.425349,0.271812,0.428318
2,-4.41868,-5.091345,-4.775487,-4.793614,-4.486232,-6.319236,-5.892401,-5.892401,-7.830081,-4.947875,...,-0.120573,-0.050857,-0.259914,0.089663,0.047056,-0.246349,-0.108352,-0.794743,-0.216398,0.141474
5,-4.304582,-4.239538,-5.941207,-5.785892,-1.46472,-8.559716,-5.778808,-5.778808,-7.177552,-3.473304,...,0.240179,-0.400632,-0.028127,0.108339,0.035092,0.714957,0.056203,-0.444948,0.195747,0.099706
6,-4.914936,-4.995354,-4.99803,-5.106227,-1.613869,-8.816637,-5.70336,-5.70336,-7.20332,-4.612884,...,-0.064994,0.413734,0.167851,-0.278258,-0.175892,0.072034,0.635055,0.654301,-0.092928,0.02134
7,-6.428771,-4.204915,-4.960672,-3.565387,-2.163433,-6.394405,-5.860735,-5.860735,-8.266501,-1.270251,...,0.118286,0.012639,-0.025809,-0.467947,-0.766607,-0.31071,-0.850983,-0.270963,0.479803,-0.332362


In [50]:
ma3p = dat_time_ser_ma.rolling(window = 3).median()
ma3p.x4.fillna(method = 'bfill', inplace = True)
ma3p.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
ma3p = ma3p.stack(level = [1,2])
ma3p.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x4dm,x4dp,x5,x6
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,10,-3.767075,-0.610031,-0.9905,-16.120921,-14.881511,-14.074383,-13.335174,-12.488028,-7.935925,0.000594,-0.000594,-16.574868,5.334518
1,1.0,12,-3.684104,-0.56396,-0.865825,-16.098026,-14.841465,-14.048966,-13.345301,-12.482659,-7.915593,0.001004,-0.001006,-16.107795,5.162323
1,1.0,15,-5.348332,-0.563726,-1.005988,-13.381208,-12.235496,-11.47631,-10.736201,-9.95647,-6.303032,-0.000654,0.000654,-14.288756,5.360661
1,1.0,20,-3.728913,-0.36478,-0.696935,-15.330522,-14.065793,-13.325094,-12.645989,-11.939235,-7.589947,-0.004055,0.004055,-15.42729,5.052703
1,1.0,22,-3.030118,-0.450515,-0.52235,-12.830713,-12.292969,-11.779158,-11.391901,-11.189172,-5.662103,0.0,0.0,-12.196305,4.60517


In [51]:
ma3m = dat_time_ser_ma.loc[::-1, :].rolling(window = 3).median().loc[::-1, :]
ma3m.x4.fillna(method = 'ffill', inplace = True)
ma3m.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
ma3m = ma3m.stack(level = [1,2])
ma3m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x4dm,x4dp,x5,x6
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,10,-4.304582,-0.604414,-0.757369,-17.384426,-16.218742,-15.403125,-14.693636,-13.780887,-8.174265,0.01961,-0.01961,-17.169408,5.327721
1,1.0,12,-5.091345,-0.746521,-0.936594,-16.10744,-15.051205,-14.382111,-13.740117,-12.725538,-7.9105,0.004729,-0.004729,-17.306205,5.012012
1,1.0,15,-5.941207,-0.71028,-1.00589,-14.56539,-13.556944,-12.910377,-12.24953,-11.442146,-6.746488,0.062846,0.0,-15.796174,4.997866
1,1.0,20,-4.793614,-0.576073,-0.442756,-15.703755,-14.621472,-13.862142,-13.233748,-12.53834,-7.421083,0.066515,0.0,-16.353977,4.918289
1,1.0,22,-2.810641,-0.287523,-0.48501,-12.754156,-12.491143,-12.765095,-12.476811,-12.476811,-4.913014,-0.172356,0.141651,-11.41994,4.60517


In [52]:
ma10p = dat_time_ser_ma.rolling(window = 10).median()
ma10p.x4.fillna(method = 'bfill', inplace = True)
ma10p.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
ma10p = ma10p.stack(level = [1,2])
ma10p.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x4dm,x4dp,x5,x6
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,10,-3.749143,-0.608005,-0.988152,-16.070853,-14.788179,-13.969688,-13.266309,-12.417454,-7.936621,0.000587,-0.000587,-16.596272,5.318689
1,1.0,12,-3.674508,-0.560636,-0.857165,-16.089741,-14.822624,-14.027247,-13.283156,-12.399047,-7.908422,0.001,-0.001,-16.135555,5.14453
1,1.0,15,-5.276553,-0.575513,-1.012437,-13.248206,-12.143267,-11.357585,-10.688478,-9.852727,-6.30178,-0.001804,0.001804,-14.197066,5.33828
1,1.0,20,-3.766967,-0.347922,-0.694661,-15.334226,-14.045613,-13.299729,-12.598067,-11.891316,-7.607112,-0.009114,0.009114,-15.413748,5.084783
1,1.0,22,-2.711858,-0.418877,-0.481877,-12.454187,-12.245864,-11.761289,-11.454482,-11.280798,-5.647074,0.0,0.0,-11.935012,4.60517


In [53]:
ma10m = dat_time_ser_ma.loc[::-1, :].rolling(window = 10).median().loc[::-1, :]
ma10m.x4.fillna(method = 'ffill', inplace = True)
ma10m.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
ma10m = ma10m.stack(level = [1,2])
ma10m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x4dm,x4dp,x5,x6
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,10,-4.543004,-0.577502,-0.896007,-17.151527,-16.014834,-15.267944,-14.625601,-13.75735,-8.190689,-0.00688,0.003281,-17.621529,5.272931
1,1.0,12,-4.572423,-0.730783,-1.033785,-16.195898,-15.026373,-14.312306,-13.625037,-12.750846,-8.114319,0.001182,-0.001182,-17.312087,5.011023
1,1.0,15,-4.979351,-0.721684,-1.022646,-14.70996,-13.561492,-12.822882,-12.071633,-11.036832,-7.117375,0.014403,-0.012716,-15.771345,5.13941
1,1.0,20,-4.341094,-0.564727,-0.534184,-15.8092,-14.633259,-13.86914,-13.210438,-12.378203,-7.942686,0.014634,-0.014634,-16.385704,4.895144
1,1.0,22,-1.769614,-0.388435,-0.408715,-12.053336,-11.52706,-11.468907,-11.468907,-11.390281,-4.974868,-0.070826,0.070826,-11.424758,4.60517


In [54]:
ma20p = dat_time_ser_ma.rolling(window = 20).median()
ma20p.x4.fillna(method = 'bfill', inplace = True)
ma20p.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
ma20p = ma20p.stack(level = [1,2])
ma20p.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x4dm,x4dp,x5,x6
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,10,-3.687377,-0.614919,-1.015332,-15.962516,-14.702412,-13.949666,-13.238376,-12.313196,-7.914428,0.000135,-0.000135,-16.539622,5.31928
1,1.0,12,-3.670296,-0.554463,-0.868733,-16.090438,-14.81646,-14.014457,-13.249261,-12.340425,-7.873858,0.001478,-0.001021,-16.172779,5.159791
1,1.0,15,-5.255827,-0.576191,-0.997183,-13.15649,-12.042352,-11.337175,-10.637023,-9.867836,-6.315144,-0.001595,0.001595,-14.127112,5.51522
1,1.0,20,-3.632411,-0.293089,-0.687789,-15.253266,-13.956661,-13.262466,-12.545715,-11.802381,-7.609337,-0.006309,0.006309,-15.360238,5.095924
1,1.0,22,-2.711858,-0.39773,-0.414126,-12.512524,-12.287217,-11.813394,-11.458459,-11.284732,-5.640695,0.0,0.0,-11.849233,4.60517


In [55]:
ma20m = dat_time_ser_ma.loc[::-1, :].rolling(window = 20).median().loc[::-1, :]
ma20m.x4.fillna(method = 'ffill', inplace = True)
ma20m.apply(lambda col: col.fillna(col.median(), inplace = True), axis = 0)
ma20m = ma20m.stack(level = [1,2])
ma20m.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x4dm,x4dp,x5,x6
Day,Market,Stock,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,10,-4.614101,-0.602813,-0.941601,-17.097251,-15.984399,-15.217275,-14.531859,-13.619579,-8.271525,-0.003868,0.000677,-17.395207,5.270586
1,1.0,12,-4.572423,-0.765436,-1.016663,-16.202627,-15.026373,-14.273943,-13.625037,-12.782926,-8.215867,0.001021,-0.001021,-17.209777,4.970574
1,1.0,15,-5.16561,-0.740134,-0.889023,-14.554078,-13.337477,-12.521921,-11.968805,-11.106735,-7.152239,0.001599,-0.001599,-15.458808,5.112083
1,1.0,20,-4.143383,-0.627638,-0.666401,-15.843276,-14.676084,-13.916837,-13.22393,-12.498595,-7.950358,-0.018127,-0.006013,-16.178144,4.892015
1,1.0,22,-1.783453,-0.484973,-0.503575,-11.877787,-11.409133,-11.659341,-11.468907,-11.431036,-4.974868,0.0,0.0,-11.353582,4.60517


In [56]:
list_ma = [ma3p, ma3m, ma10p, ma10m, ma20p, ma20m]
with open('mov_av.pickle', 'wb') as f:
    pickle.dump(list_ma, f)

In [20]:
with open('mov_av.pickle', 'rb') as f:
    list_ma = pickle.load(f)

In [21]:
ma3p = list_ma[0]
ma3m = list_ma[1]
ma10p = list_ma[2]
ma10m = list_ma[3]
ma20p = list_ma[4]
ma20m = list_ma[5]

### Imputation into dat_time_ser

In [57]:
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,x6,x4dp,x4dm
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,5.327721,0.0,0.01961
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,5.447034,-0.01961,0.032847
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,5.111433,-0.032847,-0.046888
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,5.59616,0.046888,-0.028319
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,5.614153,0.028319,-0.199014


In [58]:
onehot_markets = pd.get_dummies(market_by_stock)
onehot_markets.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            3013, 3014, 3015, 3016, 3017, 3018, 3019, 3020, 3021, 3022],
           dtype='int64', name=u'Stock', length=3023)

In [59]:
# Imputes the one-hot encoded market
onehot_markets = pd.get_dummies(market_by_stock, columns = ['m1','m2','m3','m4'])
for col in onehot_markets:
    new_col = "m" + str(col)
    dat_time_ser[new_col] = onehot_markets.loc[:,col].reindex(dat_time_ser.index, level = 1)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,x6,x4dp,x4dm,m1,m2,m3,m4
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,5.327721,0.0,0.01961,1,0,0,0
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,5.447034,-0.01961,0.032847,1,0,0,0
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,5.111433,-0.032847,-0.046888,1,0,0,0
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,5.59616,0.046888,-0.028319,1,0,0,0
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,5.614153,0.028319,-0.199014,1,0,0,0


In [60]:
# Imputes the temporal zscores into dat_time_ser
for col in temporal_zscore.columns:
    new_col = col + '_tzs'
    dat_time_ser[new_col] = temporal_zscore.loc[:,col].reindex(dat_time_ser.index, level = 1)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x3A_tzs,x3B_tzs,x3C_tzs,x3D_tzs,x3E_tzs,x4_tzs,x5_tzs,x6_tzs,x4dp_tzs,x4dm_tzs
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207


In [61]:
dat_time_ser_res = dat_time_ser.reset_index(level = 0)
crosssect_zscore_res = crosssect_zscore.reset_index(level = 1)
dat_time_ser_res.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Market,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,...,x3A_tzs,x3B_tzs,x3C_tzs,x3D_tzs,x3E_tzs,x4_tzs,x5_tzs,x6_tzs,x4dp_tzs,x4dm_tzs
Stock,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10,1,1.0,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207
10,2,1.0,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207
10,5,1.0,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207
10,6,1.0,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207
10,7,1.0,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,...,-11.654187,-10.158894,-9.770431,-9.07707,-8.128829,-9.66031,-13.487442,14.80933,-0.00207,0.00207


In [62]:
"""
# Imputes the cross sectional median into dat_time_ser
"""
for col in crosssect_zscore.columns:
    new_col = col + '_czs'
    dat_time_ser_res.loc[:,new_col] = 0
    dat_time_ser_res.loc[:,new_col].loc[dat_time_ser_res.Market == 1] = \
    crosssect_zscore_res.loc[crosssect_zscore_res.Market == 1].loc[:,col].reindex(dat_time_ser_res.loc[dat_time_ser_res.Market == 1].index, 
                                                                                  level = 1)
    dat_time_ser_res.loc[:,new_col].loc[dat_time_ser_res.Market == 2] = \
    crosssect_zscore_res.loc[crosssect_zscore_res.Market == 2].loc[:,col].reindex(dat_time_ser_res.loc[dat_time_ser_res.Market == 2].index, 
                                                                                  level = 1)
    dat_time_ser_res.loc[:,new_col].loc[dat_time_ser_res.Market == 3] = \
    crosssect_zscore_res.loc[crosssect_zscore_res.Market == 3].loc[:,col].reindex(dat_time_ser_res.loc[dat_time_ser_res.Market == 3].index, 
                                                                                  level = 1)
    dat_time_ser_res.loc[:,new_col].loc[dat_time_ser_res.Market == 4] = \
    crosssect_zscore_res.loc[crosssect_zscore_res.Market == 4].loc[:,col].reindex(dat_time_ser_res.loc[dat_time_ser_res.Market == 4].index, 
                                                                                  level = 1)
dat_time_ser = dat_time_ser_res.set_index('Market', append = True).swaplevel(i = 0, j = 2).swaplevel(i = 1, j = 2)
dat_time_ser.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x3A_czs,x3B_czs,x3C_czs,x3D_czs,x3E_czs,x4_czs,x5_czs,x6_czs,x4dp_czs,x4dm_czs
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,...,-8.351639,-8.610257,-8.481755,-8.139073,-8.805337,-4.536242,-7.850662,5.336387,0.0,0.038021
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,...,-8.980738,-8.546057,-8.48979,-8.060831,-7.66163,-4.380699,-7.51879,4.645397,-0.038021,0.0
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,...,-8.060269,-7.845817,-7.754526,-7.14344,-6.969692,-4.761012,-7.215386,5.893169,0.0,0.043033
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,...,-7.999158,-7.972742,-7.187555,-7.164909,-6.629463,-4.551323,-7.28225,5.538359,-0.043033,0.028425
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,...,-8.272854,-7.870011,-7.169904,-7.655091,-7.643407,-4.403443,-7.623935,6.011223,-0.028425,-0.031599


In [63]:
# Inputes beta into dat_time_ser
dat_time_ser['beta'] = beta.reset_index(level = 0, drop = True).reindex(dat_time_ser.index, level = 1)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x3B_czs,x3C_czs,x3D_czs,x3E_czs,x4_czs,x5_czs,x6_czs,x4dp_czs,x4dm_czs,beta
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,...,-8.610257,-8.481755,-8.139073,-8.805337,-4.536242,-7.850662,5.336387,0.0,0.038021,0.984084
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,...,-8.546057,-8.48979,-8.060831,-7.66163,-4.380699,-7.51879,4.645397,-0.038021,0.0,0.984084
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,...,-7.845817,-7.754526,-7.14344,-6.969692,-4.761012,-7.215386,5.893169,0.0,0.043033,0.984084
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,...,-7.972742,-7.187555,-7.164909,-6.629463,-4.551323,-7.28225,5.538359,-0.043033,0.028425,0.984084
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,...,-7.870011,-7.169904,-7.655091,-7.643407,-4.403443,-7.623935,6.011223,-0.028425,-0.031599,0.984084


In [65]:
ma3p = ma3p.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
ma3m = ma3m.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
ma10p = ma10p.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
ma10m = ma10m.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
ma20p = ma20p.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
ma20m = ma20m.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
maWy10p = maWy10p.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
maWy10m = maWy10m.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
maWy20p = maWy20p.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
maWy20m = maWy20m.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
maWy40p = maWy40p.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()
maWy40m = maWy40m.swaplevel(i = 0, j = 1).swaplevel(i = 1, j = 2).sort_index()

In [66]:
# Imputes the moving averages into dat_time_ser
for col in maWy10p:
    new_col = col + '_ma10p'
    dat_time_ser[new_col] = maWy10p.loc[:,col]
    new_col = col + '_ma10m'
    dat_time_ser[new_col] = maWy10m.loc[:,col]
    new_col = col + '_ma20p'
    dat_time_ser[new_col] = maWy20p.loc[:,col]
    new_col = col + '_ma20m'
    dat_time_ser[new_col] = maWy20m.loc[:,col]
    new_col = col + '_ma40p'
    dat_time_ser[new_col] = maWy40p.loc[:,col]
    new_col = col + '_ma40m'
    dat_time_ser[new_col] = maWy40m.loc[:,col]
print dat_time_ser.columns.tolist()

['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6', 'x4dp', 'x4dm', 'm1', 'm2', 'm3', 'm4', 'x0_tzs', 'x1_tzs', 'x2_tzs', 'x3A_tzs', 'x3B_tzs', 'x3C_tzs', 'x3D_tzs', 'x3E_tzs', 'x4_tzs', 'x5_tzs', 'x6_tzs', 'x4dp_tzs', 'x4dm_tzs', 'x0_czs', 'x1_czs', 'x2_czs', 'x3A_czs', 'x3B_czs', 'x3C_czs', 'x3D_czs', 'x3E_czs', 'x4_czs', 'x5_czs', 'x6_czs', 'x4dp_czs', 'x4dm_czs', 'beta', 'Weight_ma10p', 'Weight_ma10m', 'Weight_ma20p', 'Weight_ma20m', 'Weight_ma40p', 'Weight_ma40m', 'y_ma10p', 'y_ma10m', 'y_ma20p', 'y_ma20m', 'y_ma40p', 'y_ma40m']


In [67]:
# Imputes the moving averages into dat_time_ser
for col in ma3p:
    new_col = col + '_ma3p'
    dat_time_ser[new_col] = ma3p.loc[:,col]
    new_col = col + '_ma3m'
    dat_time_ser[new_col] = ma3m.loc[:,col]
    new_col = col + '_ma10p'
    dat_time_ser[new_col] = ma10p.loc[:,col]
    new_col = col + '_ma10m'
    dat_time_ser[new_col] = ma10m.loc[:,col]
    new_col = col + '_ma20p'
    dat_time_ser[new_col] = ma20p.loc[:,col]
    new_col = col + '_ma20m'
    dat_time_ser[new_col] = ma20m.loc[:,col]
print dat_time_ser.columns.tolist()

['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6', 'x4dp', 'x4dm', 'm1', 'm2', 'm3', 'm4', 'x0_tzs', 'x1_tzs', 'x2_tzs', 'x3A_tzs', 'x3B_tzs', 'x3C_tzs', 'x3D_tzs', 'x3E_tzs', 'x4_tzs', 'x5_tzs', 'x6_tzs', 'x4dp_tzs', 'x4dm_tzs', 'x0_czs', 'x1_czs', 'x2_czs', 'x3A_czs', 'x3B_czs', 'x3C_czs', 'x3D_czs', 'x3E_czs', 'x4_czs', 'x5_czs', 'x6_czs', 'x4dp_czs', 'x4dm_czs', 'beta', 'Weight_ma10p', 'Weight_ma10m', 'Weight_ma20p', 'Weight_ma20m', 'Weight_ma40p', 'Weight_ma40m', 'y_ma10p', 'y_ma10m', 'y_ma20p', 'y_ma20m', 'y_ma40p', 'y_ma40m', 'x0_ma3p', 'x0_ma3m', 'x0_ma10p', 'x0_ma10m', 'x0_ma20p', 'x0_ma20m', 'x1_ma3p', 'x1_ma3m', 'x1_ma10p', 'x1_ma10m', 'x1_ma20p', 'x1_ma20m', 'x2_ma3p', 'x2_ma3m', 'x2_ma10p', 'x2_ma10m', 'x2_ma20p', 'x2_ma20m', 'x3A_ma3p', 'x3A_ma3m', 'x3A_ma10p', 'x3A_ma10m', 'x3A_ma20p', 'x3A_ma20m', 'x3B_ma3p', 'x3B_ma3m', 'x3B_ma10p', 'x3B_ma10m', 'x3B_ma20p', 'x3B_ma20m', 'x3C_ma3p', 'x3C_ma3m', 'x3C_ma10p', 'x3C_ma10m', 'x3C_ma20p', 'x3C_ma20m', 'x

### Days of the week

In [68]:
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x5_ma10p,x5_ma10m,x5_ma20p,x5_ma20m,x6_ma3p,x6_ma3m,x6_ma10p,x6_ma10m,x6_ma20p,x6_ma20m
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,...,-16.596272,-17.621529,-16.539622,-17.395207,5.334518,5.327721,5.318689,5.272931,5.31928,5.270586
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,...,-16.596272,-17.621529,-16.539622,-17.395207,5.334518,5.447034,5.318689,5.201303,5.31928,5.23848
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,...,-16.596272,-17.381103,-16.539622,-17.235227,5.327721,5.59616,5.318689,5.147949,5.31928,5.228148
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,...,-16.596272,-17.381103,-16.539622,-17.285763,5.447034,5.59616,5.318689,5.146034,5.31928,5.23848
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,...,-16.596272,-17.465642,-16.539622,-17.285763,5.59616,5.421553,5.318689,5.146034,5.31928,5.228148


In [69]:
days = dat_time_ser.reset_index().Day
days.index = dat_time_ser.index
days.head()

Market  Stock  Day
1.0     10     1      1
               2      2
               5      5
               6      6
               7      7
Name: Day, dtype: int64

In [70]:
dat_time_ser['dwmond'] = (np.mod(days, 7) ==  5).astype('int32')
dat_time_ser['dwtues'] = (np.mod(days, 7) ==  6).astype('int32')
dat_time_ser['dwwedn'] = (np.mod(days, 7) ==  0).astype('int32')
dat_time_ser['dwthur'] = (np.mod(days, 7) ==  1).astype('int32')
dat_time_ser['dwfrid'] = (np.mod(days, 7) ==  2).astype('int32')

In [71]:
dat_time_ser.columns.tolist()

['x0',
 'x1',
 'x2',
 'x3A',
 'x3B',
 'x3C',
 'x3D',
 'x3E',
 'x4',
 'x5',
 'x6',
 'x4dp',
 'x4dm',
 'm1',
 'm2',
 'm3',
 'm4',
 'x0_tzs',
 'x1_tzs',
 'x2_tzs',
 'x3A_tzs',
 'x3B_tzs',
 'x3C_tzs',
 'x3D_tzs',
 'x3E_tzs',
 'x4_tzs',
 'x5_tzs',
 'x6_tzs',
 'x4dp_tzs',
 'x4dm_tzs',
 'x0_czs',
 'x1_czs',
 'x2_czs',
 'x3A_czs',
 'x3B_czs',
 'x3C_czs',
 'x3D_czs',
 'x3E_czs',
 'x4_czs',
 'x5_czs',
 'x6_czs',
 'x4dp_czs',
 'x4dm_czs',
 'beta',
 'Weight_ma10p',
 'Weight_ma10m',
 'Weight_ma20p',
 'Weight_ma20m',
 'Weight_ma40p',
 'Weight_ma40m',
 'y_ma10p',
 'y_ma10m',
 'y_ma20p',
 'y_ma20m',
 'y_ma40p',
 'y_ma40m',
 'x0_ma3p',
 'x0_ma3m',
 'x0_ma10p',
 'x0_ma10m',
 'x0_ma20p',
 'x0_ma20m',
 'x1_ma3p',
 'x1_ma3m',
 'x1_ma10p',
 'x1_ma10m',
 'x1_ma20p',
 'x1_ma20m',
 'x2_ma3p',
 'x2_ma3m',
 'x2_ma10p',
 'x2_ma10m',
 'x2_ma20p',
 'x2_ma20m',
 'x3A_ma3p',
 'x3A_ma3m',
 'x3A_ma10p',
 'x3A_ma10m',
 'x3A_ma20p',
 'x3A_ma20m',
 'x3B_ma3p',
 'x3B_ma3m',
 'x3B_ma10p',
 'x3B_ma10m',
 'x3B_ma20p',
 'x3B_m

### Response and weight

In [77]:
dat_time_ser['y'] = store_weight_y.stack()['y']
dat_time_ser['Weight'] = store_weight_y.stack()['Weight']
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,...,5.272931,5.31928,5.270586,0,0,0,1,0,0.000101,37.10981
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,...,5.201303,5.31928,5.23848,0,0,0,0,1,7.5e-05,41.34865
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,...,5.147949,5.31928,5.228148,1,0,0,0,0,8e-05,47.34935
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,...,5.146034,5.31928,5.23848,0,1,0,0,0,5e-05,50.181863
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,...,5.146034,5.31928,5.228148,0,0,1,0,0,6.4e-05,38.358476


### Saving the data

In [78]:
dat_time_ser.to_csv('dat_time_ser2.csv')

In [50]:
del list_ma
del ma3p
del ma3m
del ma10p
del ma10m
del ma20p
del ma20m
del beta
del crosssect_zscore
del crosssect_zscore_res
del dat_time_ser_res
del temporal_zscore
del dat_time_ser_ma
del store_weight_y_ma

In [3]:
dat_time_ser = pd.read_csv('dat_time_ser2.csv', index_col=['Market', 'Stock', 'Day'])
dat_time_ser.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,...,5.272931,5.31928,5.270586,0,0,0,1,0,0.000101,37.10981
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,...,5.201303,5.31928,5.23848,0,0,0,0,1,7.5e-05,41.34865
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,...,5.147949,5.31928,5.228148,1,0,0,0,0,8e-05,47.34935
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,...,5.146034,5.31928,5.23848,0,1,0,0,0,5e-05,50.181863
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,...,5.146034,5.31928,5.228148,0,0,1,0,0,6.4e-05,38.358476


## Train - validation - test split

In [4]:
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
Market,Stock,Day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.0,10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,...,5.272931,5.31928,5.270586,0,0,0,1,0,0.000101,37.10981
1.0,10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,...,5.201303,5.31928,5.23848,0,0,0,0,1,7.5e-05,41.34865
1.0,10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,...,5.147949,5.31928,5.228148,1,0,0,0,0,8e-05,47.34935
1.0,10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,...,5.146034,5.31928,5.23848,0,1,0,0,0,5e-05,50.181863
1.0,10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,...,5.146034,5.31928,5.228148,0,0,1,0,0,6.4e-05,38.358476


In [5]:
dat_time_ser = dat_time_ser.reset_index(level = 0, drop = True)
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
Stock,Day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10,1,-4.173071,-0.604414,-0.707595,-16.822898,-15.768382,-15.040206,-14.46108,-13.775494,-8.154655,-17.169408,...,5.272931,5.31928,5.270586,0,0,0,1,0,0.000101,37.10981
10,2,-4.41868,-0.469946,-0.757369,-17.384426,-16.218742,-15.486009,-14.693636,-13.780887,-8.174265,-17.846112,...,5.201303,5.31928,5.23848,0,0,0,0,1,7.5e-05,41.34865
10,5,-4.304582,-0.623238,-1.08243,-17.466539,-16.316185,-15.403125,-14.924859,-13.859143,-8.207112,-16.247488,...,5.147949,5.31928,5.228148,1,0,0,0,0,8e-05,47.34935
10,6,-4.914936,-0.347112,-0.66099,-17.088707,-16.029153,-15.274344,-14.608432,-13.668939,-8.160224,-17.150688,...,5.146034,5.31928,5.23848,0,1,0,0,0,5e-05,50.181863
10,7,-6.428771,-0.31957,-0.606628,-17.105795,-15.944975,-15.193404,-14.562069,-13.550648,-8.131905,-17.611517,...,5.146034,5.31928,5.228148,0,0,1,0,0,6.4e-05,38.358476


In [6]:
dat_time_ser = dat_time_ser.swaplevel().sort_index()
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
Day,Stock,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,-7.637302,0.432007,-0.313051,-13.812798,-12.804765,-12.228427,-11.78404,-11.30572,-8.456395,-13.96533,...,6.291569,5.895277,5.857933,0,0,0,1,0,0.000294,2.527293
1,1,-7.637302,0.432007,-0.313051,-13.812798,-12.804765,-12.228427,-11.78404,-11.30572,-8.456395,-13.96533,...,6.345323,5.978213,6.195895,0,0,0,1,0,0.000294,2.527293
1,2,-4.840598,0.074336,-0.839569,-16.907164,-15.749613,-15.038085,-14.345418,-13.660137,-8.456395,-17.788667,...,5.659978,5.168854,5.598531,0,0,0,1,0,-8.4e-05,94.206094
1,3,-1.981737,0.301267,-0.169146,-14.844188,-13.532512,-12.712327,-12.174183,-11.426019,-6.964806,-15.411748,...,4.092286,3.713572,3.925389,0,0,0,1,0,-5.6e-05,10.851268
1,4,-6.481628,-0.418415,-1.498661,-15.025417,-13.968786,-13.37287,-12.844858,-12.217066,-6.667032,-15.815217,...,5.010635,4.60517,4.60517,0,0,0,1,0,-2.9e-05,6.587559


In [7]:
"""# Days in the training and validation set
days_trainval = np.sort(trainval.loc[:,['Day']].drop_duplicates().values.reshape(-1))
# Drops the entries that were not in the original trainval set. 
# Important to avoid the bleeding of the validation set into the train set.
dat_trainval = dat_time_ser.loc[days_trainval,:]
ref = trainval.set_index(['Day','Stock'])
dat_trainval = dat_trainval.reindex(ref.index).unstack(level = 1)
dat_trainval.head()"""

"# Days in the training and validation set\ndays_trainval = np.sort(trainval.loc[:,['Day']].drop_duplicates().values.reshape(-1))\n# Drops the entries that were not in the original trainval set. \n# Important to avoid the bleeding of the validation set into the train set.\ndat_trainval = dat_time_ser.loc[days_trainval,:]\nref = trainval.set_index(['Day','Stock'])\ndat_trainval = dat_trainval.reindex(ref.index).unstack(level = 1)\ndat_trainval.head()"

In [8]:
# Days in the training and validation set
days_trainval = np.sort(trainval.loc[:,['Day']].drop_duplicates().values.reshape(-1))
# Drops the entries that were not in the original trainval set. 
# Important to avoid the bleeding of the validation set into the train set.
dat_trainval = dat_time_ser.loc[days_trainval,:]
ref = trainval.set_index(['Day','Stock'])
dat_trainval = dat_trainval.reindex(ref.index, fill_value=np.nan)
# Sorting is necessary to slice...
dat_trainval.sort_index(inplace=True)
dat_trainval.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
Day,Stock,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2,-4.840598,0.074336,-0.839569,-16.907164,-15.749613,-15.038085,-14.345418,-13.660137,-8.456395,-17.788667,...,5.659978,5.168854,5.598531,0,0,0,1,0,-8.4e-05,94.206094
1,3,-1.981737,0.301267,-0.169146,-14.844188,-13.532512,-12.712327,-12.174183,-11.426019,-6.964806,-15.411748,...,4.092286,3.713572,3.925389,0,0,0,1,0,-5.6e-05,10.851268
1,4,-6.481628,-0.418415,-1.498661,-15.025417,-13.968786,-13.37287,-12.844858,-12.217066,-6.667032,-15.815217,...,5.010635,4.60517,4.60517,0,0,0,1,0,-2.9e-05,6.587559
1,5,-10.147385,1.625753,1.703713,-11.233827,-10.30764,-10.277359,-9.523156,-10.126012,-4.056336,-10.815007,...,7.549609,6.016157,7.696213,0,0,0,1,0,0.000608,1.270334
1,6,0.138825,1.490618,1.164492,-15.401179,-14.518996,-13.718036,-13.304289,-12.988578,-6.900665,-15.396323,...,4.60517,4.620855,4.620855,0,0,0,1,0,-0.000182,66.654755


In [9]:
days_valid_mon = random.sample(days_trainval[np.mod(days_trainval,7) == 5], 12)
days_valid_tue = random.sample(days_trainval[np.mod(days_trainval,7) == 6], 12)
days_valid_wed = random.sample(days_trainval[np.mod(days_trainval,7) == 0], 12)
days_valid_thu = random.sample(days_trainval[np.mod(days_trainval,7) == 1], 12)
days_valid_fri = random.sample(days_trainval[np.mod(days_trainval,7) == 2], 12)
days_valid = np.concatenate((days_valid_mon,
                             days_valid_tue,
                             days_valid_wed,
                             days_valid_thu,
                             days_valid_fri), axis = 0)
days_valid.sort()
days_train = list(set(days_trainval).difference(set(days_valid)))
days_train.sort()

In [10]:
print days_train
print days_valid

[1, 2, 6, 7, 8, 9, 19, 20, 21, 22, 30, 41, 47, 48, 49, 54, 55, 56, 57, 58, 61, 62, 64, 65, 82, 85, 86, 89, 93, 97, 111, 113, 114, 117, 124, 127, 128, 131, 135, 138, 140, 145, 147, 148, 160, 161, 167, 168, 169, 173, 174, 183, 184, 187, 190, 197, 198, 201, 202, 203, 211, 215, 216, 219, 222, 223, 225, 226, 229, 230, 236, 237, 244, 245, 247, 264, 265, 266, 267, 268, 272, 274, 279, 280, 285, 288, 292, 295, 299, 301, 307, 313, 315, 322, 323, 327, 329, 334, 336, 344, 348, 349, 359, 362, 371, 378, 379, 384, 385, 390, 391, 392, 394, 397, 399, 404, 405, 408, 418, 419, 421, 427, 428, 429, 434, 436, 439, 446, 449, 455, 461, 462, 463, 469, 470, 471, 475, 478, 484, 497, 499, 503, 505, 506, 509, 518, 526, 527, 530, 532, 533, 534, 538, 539, 545, 548, 552, 553, 554, 567, 580, 582, 587, 589, 594, 595, 601, 602, 608, 611, 614, 615, 616, 638, 643, 649, 651, 658, 660, 663, 667, 671, 677, 678, 680, 687, 691, 692, 699, 702, 709, 712, 713, 714, 716, 720, 728, 729]
[  5  12  37  43  44  68  71  78  92  98 104 

In [11]:
train_f = dat_trainval.loc[days_train].reset_index()
train_f.head()

Unnamed: 0,Day,Stock,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
0,1,2,-4.840598,0.074336,-0.839569,-16.907164,-15.749613,-15.038085,-14.345418,-13.660137,...,5.659978,5.168854,5.598531,0,0,0,1,0,-8.4e-05,94.206094
1,1,3,-1.981737,0.301267,-0.169146,-14.844188,-13.532512,-12.712327,-12.174183,-11.426019,...,4.092286,3.713572,3.925389,0,0,0,1,0,-5.6e-05,10.851268
2,1,4,-6.481628,-0.418415,-1.498661,-15.025417,-13.968786,-13.37287,-12.844858,-12.217066,...,5.010635,4.60517,4.60517,0,0,0,1,0,-2.9e-05,6.587559
3,1,5,-10.147385,1.625753,1.703713,-11.233827,-10.30764,-10.277359,-9.523156,-10.126012,...,7.549609,6.016157,7.696213,0,0,0,1,0,0.000608,1.270334
4,1,6,0.138825,1.490618,1.164492,-15.401179,-14.518996,-13.718036,-13.304289,-12.988578,...,4.60517,4.620855,4.620855,0,0,0,1,0,-0.000182,66.654755


In [12]:
valid_f = dat_trainval.loc[days_valid].reset_index()
valid_f.head()

Unnamed: 0,Day,Stock,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
0,5,2,-6.364067,0.009311,-0.705024,-16.980221,-15.642327,-14.816962,-14.177621,-13.421765,...,5.592571,5.168854,5.585223,1,0,0,0,0,-9.2e-05,57.140416
1,5,3,-3.79935,0.624632,1.025426,-14.338353,-13.421543,-13.076468,-12.530978,-11.92424,...,3.950727,3.713572,3.871872,1,0,0,0,0,0.000268,10.851268
2,5,4,-4.578321,-0.34849,-0.86997,-14.881015,-13.824217,-13.002172,-12.578424,-12.268125,...,5.010635,4.60517,4.60517,1,0,0,0,0,1e-05,7.241641
3,5,5,-10.197151,-0.407699,-0.003797,-10.274302,-9.59475,-10.191462,-10.282215,-10.287542,...,7.599651,6.016157,7.696213,1,0,0,0,0,0.007035,0.619938
4,5,6,0.967255,1.96444,1.220071,-14.944758,-13.878322,-13.039083,-12.590478,-12.289489,...,4.60517,4.620855,4.620855,1,0,0,0,0,9.6e-05,94.802839


In [13]:
dat_time_ser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
Day,Stock,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,-7.637302,0.432007,-0.313051,-13.812798,-12.804765,-12.228427,-11.78404,-11.30572,-8.456395,-13.96533,...,6.291569,5.895277,5.857933,0,0,0,1,0,0.000294,2.527293
1,1,-7.637302,0.432007,-0.313051,-13.812798,-12.804765,-12.228427,-11.78404,-11.30572,-8.456395,-13.96533,...,6.345323,5.978213,6.195895,0,0,0,1,0,0.000294,2.527293
1,2,-4.840598,0.074336,-0.839569,-16.907164,-15.749613,-15.038085,-14.345418,-13.660137,-8.456395,-17.788667,...,5.659978,5.168854,5.598531,0,0,0,1,0,-8.4e-05,94.206094
1,3,-1.981737,0.301267,-0.169146,-14.844188,-13.532512,-12.712327,-12.174183,-11.426019,-6.964806,-15.411748,...,4.092286,3.713572,3.925389,0,0,0,1,0,-5.6e-05,10.851268
1,4,-6.481628,-0.418415,-1.498661,-15.025417,-13.968786,-13.37287,-12.844858,-12.217066,-6.667032,-15.815217,...,5.010635,4.60517,4.60517,0,0,0,1,0,-2.9e-05,6.587559


In [14]:
days_test = np.sort(test.loc[:,['Day']].drop_duplicates().values.reshape(-1))
test_f = dat_time_ser.unstack(level = 1).loc[days_test,:].stack(level = 1)
test_f.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,x5,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
Day,Stock,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
13,0,-6.938105,1.621854,0.029247,-11.992972,-11.529948,-11.323054,-11.287587,-11.050357,-4.118561,-9.840743,...,5.857933,5.895277,5.857933,0,1,0,0,0,4e-05,1.37447
13,1,-7.491554,0.180306,-0.545896,-13.914706,-12.9003,-12.256696,-11.822158,-11.373628,-4.798195,-13.726922,...,6.123471,5.978213,6.184312,0,1,0,0,0,4e-05,1.37447
13,2,-3.921709,0.060144,-0.522401,-17.454684,-16.377986,-15.663713,-14.95,-13.940641,-8.415853,-16.927601,...,5.568122,5.168854,5.522785,0,1,0,0,0,4e-05,1.37447
13,3,-4.122524,0.976835,-0.148811,-15.169608,-13.944755,-13.213639,-12.607145,-12.265567,-7.273354,-15.201424,...,3.839394,3.713572,3.828641,0,1,0,0,0,4e-05,1.37447
13,4,-6.859769,-0.711489,-1.304901,-15.004117,-14.143585,-13.468039,-12.935443,-12.22238,-7.491349,-15.729048,...,4.60517,4.60517,4.60517,0,1,0,0,0,4e-05,1.37447


In [15]:
# Makes sure that the test set is indexed like the original one
ref = test.set_index(['Day','Stock'])
test_f = test_f.reindex(ref.index)
test_f.reset_index(inplace=True)
test_f.head()

Unnamed: 0,Day,Stock,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,...,x6_ma10m,x6_ma20p,x6_ma20m,dwmond,dwtues,dwwedn,dwthur,dwfrid,y,Weight
0,13,363,-7.079731,-0.828752,-0.738463,-13.386732,-12.344427,-11.771875,-11.022577,-10.328793,...,4.60517,5.030051,4.60517,0,1,0,0,0,4e-05,1.37447
1,13,1223,-3.872806,-0.452332,-0.399287,-12.166057,-11.559387,-11.058506,-10.481641,-11.433652,...,4.60517,4.60517,4.60517,0,1,0,0,0,4e-05,1.37447
2,13,1372,-2.679753,-0.736282,-0.730193,-16.728055,-15.567931,-14.765626,-13.975601,-13.072062,...,5.051965,5.167502,5.0548,0,1,0,0,0,4e-05,1.37447
3,13,1942,-4.423987,-0.769594,-1.221117,-18.322452,-17.152169,-16.435498,-15.807505,-14.969091,...,5.297894,5.277082,5.251191,0,1,0,0,0,4e-05,1.37447
4,13,2712,-8.716556,-0.597873,-1.043181,-14.615203,-13.977629,-13.614328,-13.173071,-12.417213,...,4.992603,4.78046,5.028333,0,1,0,0,0,4e-05,1.37447


In [16]:
# Drops the Weight and y columns, which contain Nan's anyway
test_f = test_f.drop(['Weight', 'y'], axis = 1)

In [17]:
# Makes sure that the responses are the original ones
train_or = pd.read_csv('train.csv', index_col = 0)
yw_or = train_or.loc[:,['Day', 'y','Weight', 'Stock']].set_index(['Day','Stock'])
yw_or_train = yw_or.reindex(train_f.set_index(['Day','Stock']).index)
yw_or_valid = yw_or.reindex(valid_f.set_index(['Day','Stock']).index)
train_f = train_f.set_index(['Day','Stock'])
train_f.loc[:,['y','Weight']] = yw_or_train
train_f = train_f.reset_index()
valid_f = valid_f.set_index(['Day','Stock'])
valid_f.loc[:,['y','Weight']] = yw_or_valid
valid_f = valid_f.reset_index()

In [18]:
# Checks for nan's
print train_f.isnull().any().any()
print valid_f.isnull().any().any()
print test_f.isnull().any().any()

False
False
False


In [19]:
print np.isinf(train_f).sum().sum()
print np.isinf(valid_f).sum().sum()
print np.isinf(test_f).sum().sum()

0
0
0


## Saves to CSV

In [20]:
datasets = {"train_f4": train_f, "valid_f4": valid_f, "test_f4": test_f}

In [21]:
for name, dataset in datasets.iteritems():
    filename = name + ".csv"
    dataset.to_csv(filename)

In [2]:
"""# Import the data
train_f = pd.read_csv('train_f.csv', index_col = 0)
valid_f = pd.read_csv('valid_f.csv', index_col = 0)
test_f = pd.read_csv('test_f.csv', index_col = 0)"""