# Introduction: Results of Time Feature Engineering

In this notebook, we'll look at the results of the time feature engineering. The purpose is to determine (if possible) the best way for encoding time and date features in a time-series problem.

In [28]:
# Standard Data Science Helpers
import numpy as np
import pandas as pd
import scipy

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.set_config_file(world_readable=True, theme="pearl")
cf.go_offline(connected=True)

# Extra options
pd.options.display.max_rows = 10
pd.options.display.max_columns = 25
# Show all code cells outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


In [30]:
results = pd.read_csv('results/complete_results.csv', index_col=0)
results = results[(results['score'] > 0) & (np.isfinite(results['score']))]
results.shape
results.head()

(5689, 9)

Unnamed: 0,method,score,std,test_points,freq,dpoints,building_id,model,splits
0,baseline,94.049361,3.670228,1460,60,8760,476,LinearRegression,5
1,baseline_cyc,94.171948,1.065232,1460,60,8760,476,LinearRegression,5
2,frac,94.007481,3.697429,1460,60,8760,476,LinearRegression,5
3,frac_cyc,93.552384,4.123808,1460,60,8760,476,LinearRegression,5
4,domain,93.820686,4.287432,1460,60,8760,476,LinearRegression,5


In [31]:
building_counts = results.groupby('building_id')['method'].count()
results = results.loc[~results['building_id'].isin(building_counts.index[building_counts != 10])]

results.shape

(5300, 9)

In [32]:
lr = results.query('model == "LinearRegression"')
rr = results.query('model == "RandomForestRegressor"')

In [33]:
lr.groupby('building_id').apply(lambda x: x.loc[x['score'].idxmax(), 'method']).value_counts()

baseline_cyc    211
frac_cyc        150
domain           74
baseline         55
frac             40
dtype: int64

In [34]:
rr.groupby('building_id').apply(lambda x: x.loc[x['score'].idxmax(), 'method']).value_counts()

baseline        181
domain          135
frac             97
frac_cyc         85
baseline_cyc     32
dtype: int64

In [35]:
rr.pivot_table(index='building_id', columns='method', values='score').iplot(mode='markers', size=7, title="Random Forest Results")

In [36]:
best = []
for building_id, grouped in rr.groupby('building_id'):
    best.append(grouped.loc[grouped['score'].idxmax(), 'method'])
    
    
from collections import Counter
Counter(best)

Counter({'frac': 97,
         'frac_cyc': 85,
         'baseline': 181,
         'domain': 135,
         'baseline_cyc': 32})

In [65]:
pivoted = rr.pivot_table(index='building_id', columns='method', values='score')

In [66]:
counts = np.argmax(pivoted.values, axis=1)
npc = Counter(counts)

In [67]:
{pivoted.columns[key]: value for key, value in npc.items()}

{'frac': 97,
 'frac_cyc': 85,
 'baseline': 181,
 'domain': 135,
 'baseline_cyc': 32}

In [68]:
np.isfinite(rr['score']).all()

True

In [69]:
np.isfinite(results['score']).all()

True

In [70]:
pd.isna(results['score']).sum()

0

In [71]:
pivoted

method,baseline,baseline_cyc,domain,frac,frac_cyc
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,86.389471,63.539245,86.094453,86.526924,85.848967
2,83.654306,81.712896,83.066314,83.574129,83.832611
5,83.231666,81.627402,82.382888,83.032297,82.442607
6,72.221907,48.150737,71.315065,71.017425,71.258789
7,84.343324,61.319347,74.976061,75.856946,75.459385
...,...,...,...,...,...
633,87.746902,80.399508,87.873442,88.079448,88.274594
634,90.577505,91.381420,91.721412,90.638393,91.637766
635,81.998115,75.209446,79.028993,79.382828,78.795677
636,88.000648,83.890264,87.255369,87.274773,87.668015


In [72]:
results['score']

0    94.049361
1    94.171948
2    94.007481
3    93.552384
4    93.820686
       ...    
0    74.021140
1    66.960319
2    68.084422
3    68.169135
4    68.404804
Name: score, Length: 5300, dtype: float64

In [73]:
rr.head()

Unnamed: 0,method,score,std,test_points,freq,dpoints,building_id,model,splits
0,baseline,95.720626,1.076836,1460,60,8760,476,RandomForestRegressor,5
1,baseline_cyc,95.699119,1.131124,1460,60,8760,476,RandomForestRegressor,5
2,frac,95.53434,1.067581,1460,60,8760,476,RandomForestRegressor,5
3,frac_cyc,95.498019,1.222425,1460,60,8760,476,RandomForestRegressor,5
4,domain,96.132587,1.084322,1460,60,8760,476,RandomForestRegressor,5


In [75]:
lr.pivot_table(index='building_id', columns='method', values='score').iplot(mode='markers', size=7, title='Linear Regression Results')

In [79]:
lr_maxes = lr.groupby('building_id').apply(lambda x: x.loc[x['score'].idxmax()])
rr_maxes = rr.groupby('building_id').apply(lambda x: x.loc[x['score'].idxmax()])

maxes = pd.concat([lr_maxes, rr_maxes], axis=0)

In [83]:
maxes.pivot(columns='model', values='score').iplot(mode='markers', size=7, title='Random Forest vs Linear Regression Best Scores')

In [87]:
rr_maxes['freq_name'] = rr_maxes['freq'].apply(lambda x: f'{x} min')

rr_maxes.iplot(x='building_id', y='score', categories='freq_name', title='Random Forest Best Results by Frequency')

In [90]:
rr_maxes.pivot(columns='freq_name', values='score').iplot(kind='box')

In [80]:
maxes.melt(id_vars='model', value_vars='score')

Unnamed: 0,model,variable,value
0,LinearRegression,score,47.094650
1,LinearRegression,score,77.800188
2,LinearRegression,score,73.173768
3,LinearRegression,score,34.443725
4,LinearRegression,score,48.521038
...,...,...,...
1055,RandomForestRegressor,score,88.274594
1056,RandomForestRegressor,score,91.721412
1057,RandomForestRegressor,score,81.998115
1058,RandomForestRegressor,score,88.000648


In [49]:
lr = pd.read_csv('results/linear_model.csv', index_col=0).reset_index(drop=True)
rr= pd.read_csv('results/random_forest_model.csv', index_col=0).reset_index(drop=True)

lr = lr.loc[(np.isfinite(lr['score'])) & (lr['score'] > 0)]
rr = rr.loc[(np.isfinite(rr['score'])) & (rr['score'] > 0)]

In [31]:
lr.describe()

Unnamed: 0,score,freq,dpoints,building_id
count,1050.0,1050.0,1050.0,1050.0
mean,0.594523,53.314286,17787.822857,346.567619
std,0.244286,16.012575,23293.78253,173.970817
min,0.004943,15.0,8760.0,2.0
25%,0.440051,60.0,8760.0,238.0
50%,0.643491,60.0,8760.0,375.0
75%,0.788277,60.0,8784.0,493.0
max,0.976023,60.0,139716.0,600.0


In [32]:
rr.describe()

Unnamed: 0,score,freq,dpoints,building_id
count,1877.0,1877.0,1877.0,1877.0
mean,0.761161,53.407032,17381.57805,337.102291
std,0.169312,15.917013,22380.627153,171.355788
min,0.02746,15.0,8760.0,1.0
25%,0.698803,60.0,8760.0,225.0
50%,0.802353,60.0,8760.0,358.0
75%,0.877204,60.0,8784.0,481.0
max,0.983892,60.0,139716.0,600.0


In [35]:
rr.pivot_table(index='building_id', values='score', columns='method')

method,frac,frac_cyc,normal,normal_cyc
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.712770,0.686601,0.691134,0.279083
2,0.817534,0.827222,0.819969,0.802353
5,0.806778,0.815835,0.794974,0.807480
6,0.691102,0.696016,0.674050,0.497206
7,0.472874,0.553007,0.516633,0.400233
...,...,...,...,...
596,0.757680,0.747749,0.787795,0.675157
597,0.886680,0.909640,0.880711,0.868342
598,0.886729,0.909629,0.880679,0.868175
599,0.257906,0.283836,0.291678,0.087222


In [36]:
rr.pivot_table(index='building_id', values='score', columns='method').iplot(mode='markers', size=6)

In [37]:
rr.groupby('building_id').apply(lambda x: x.loc[x['score'].idxmax(), 'method']).value_counts()

frac_cyc      247
normal        128
normal_cyc     55
frac           41
dtype: int64

In [38]:
rr.groupby('method')['score'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
frac,469.0,0.755411,0.167938,0.083785,0.691102,0.797031,0.872455,0.982494
frac_cyc,469.0,0.77467,0.165859,0.037346,0.71913,0.815217,0.886487,0.981503
normal,471.0,0.764776,0.163083,0.032756,0.71177,0.800524,0.877222,0.983892
normal_cyc,468.0,0.749748,0.179438,0.02746,0.676513,0.797247,0.872617,0.982997


# Fractional Cyclical Encoding Works Best

In [39]:
rr['score'].idxmax()

304

In [40]:
rr.loc[rr['score'].idxmax()]

method           normal
score          0.983892
freq                 60
dpoints            8760
building_id         538
Name: 304, dtype: object

In [45]:
rr.pivot_table(index='building_id', columns='method', values='score').iplot(kind='box')

In [46]:
rr.pivot_table(index='building_id', columns='method', values='score')

method,frac,frac_cyc,normal,normal_cyc
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.712770,0.686601,0.691134,0.279083
2,0.817534,0.827222,0.819969,0.802353
5,0.806778,0.815835,0.794974,0.807480
6,0.691102,0.696016,0.674050,0.497206
7,0.472874,0.553007,0.516633,0.400233
...,...,...,...,...
596,0.757680,0.747749,0.787795,0.675157
597,0.886680,0.909640,0.880711,0.868342
598,0.886729,0.909629,0.880679,0.868175
599,0.257906,0.283836,0.291678,0.087222
