In [2]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

#setup
import os, sys
current_path = os.path.abspath('')
# parent_path = os.path.dirname(current_path)
parent_path = current_path
source_path = f'{parent_path}\\src'
if not source_path in sys.path: sys.path.append(source_path)

#customs
# from data_model import *
# from data_helper import *
from config import *
setup()

Config finished!


In [5]:
df = pd.read_csv('./data-training/Exalted Orb.csv', index_col='Date', parse_dates=True)
df.head()

Unnamed: 0_level_0,Value,League,Confidence
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-09-08,64.00417,Essence,0
2016-09-09,64.51451,Essence,0
2016-09-10,65.0028,Essence,0
2016-09-11,65.05764,Essence,0
2016-09-12,65.98811,Essence,0


# Engineer

In [13]:
def make_lags(ts, lags, lead_time=1):
    return pd.concat(
        {
            f'y_lag_{i}': ts.shift(i)
            for i in range(lead_time, lags + lead_time)
        },
        axis=1)
def make_multi_target(ts, steps, lead_time=1):
    return pd.concat(
        {
            f'y_step_{i}': ts.shift(-i)
            for i in range(lead_time, steps + lead_time)
        },
        axis=1)

class FeatureEngineer:
    def target(self, df, steps = 5):
        return pd.concat([
            make_multi_target(df['Value'], steps).fillna(0),
            df[['League']]
        ], axis = 1)
    
    def time_features(self, df):
        df_return = pd.DataFrame()
        df_return['LeagueDay'] = (df.groupby('League').cumcount() + 1).astype(np.float64)
        time_index = df.index


        df_return['day_of_week'] = time_index.dayofweek
        df_return['month'] = time_index.month

        return df_return

    def lag_features(self, df, lags = 4):
        return make_lags(df['Value'], lags).fillna(0)

    def rolling_features(self, df, local_window_size = 5, general_window_size=15):
        local_rolling = df['Value'].rolling(local_window_size, center = True)
        general_rolling = df['Value'].rolling(general_window_size, center = True)

        df_return = pd.DataFrame()
        df_return['rolling_mean'] = local_rolling.mean()
        df_return['rolling_min'] = local_rolling.min()
        df_return['rolling_max'] = local_rolling.max()
        df_return['general_rolling_mean'] = general_rolling.mean()
        df_return['general_rolling_min'] = general_rolling.min()
        df_return['general_rolling_max'] = general_rolling.max()
        return df_return        
    
    def expanded_features(self, df, min_period = 2):
        expanding = df['Value'].expanding(min_period, center = True)
        pd_return = pd.DataFrame()
        pd_return['expanding_mean'] = expanding.mean()
        pd_return['expanding_max'] = expanding.max()
        pd_return['expanding_min'] = expanding.min()
        return pd_return

    def all_features(self, df, fill0 = True):
        df_return =  pd.concat(
            [
                self.time_features(df),
                self.lag_features(df),
                self.rolling_features(df),
                self.expanded_features(df),
                df[['League']]
            ],
            
            axis = 1
        )
        return df_return.fillna(0) if fill0 else df_return

In [14]:
feature_engineer = FeatureEngineer()

# X_train = feature_engineer.all_features(df_train)
# y_train = feature_engineer.target(df_train)
# X_val = feature_engineer.all_features(df_val)
# y_val = feature_engineer.target(df_val)
X = feature_engineer.all_features(df)
y = feature_engineer.target(df)

display(X.head(1))
display(y.head(1))



Unnamed: 0_level_0,LeagueDay,day_of_week,month,y_lag_1,y_lag_2,y_lag_3,y_lag_4,rolling_mean,rolling_min,rolling_max,general_rolling_mean,general_rolling_min,general_rolling_max,expanding_mean,expanding_max,expanding_min,League
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016-09-08,1.0,3,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Essence


Unnamed: 0_level_0,y_step_1,y_step_2,y_step_3,y_step_4,y_step_5,League
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-09-08,64.51451,65.0028,65.05764,65.98811,66.01724,Essence


# Split

In [20]:
def train_val_split(df):
    is_last_league = df['League'] == Constants.LEAGUES[-1]

    df_train = df[~is_last_league].drop('League', axis = 1)
    df_val = df[is_last_league].drop('League', axis = 1)
    return df_train, df_val
def print_train_val_stats(train,val):
    print('shapes: ', train.shape, val.shape)
    display(train.head(2))
    display(val.head(2))

In [21]:
X_train, X_val = train_val_split(X)
y_train, y_val = train_val_split(y)

print_train_val_stats(X_train, X_val)
print_train_val_stats(y_train, y_val)

shapes:  (1719, 16) (88, 16)


Unnamed: 0_level_0,LeagueDay,day_of_week,month,y_lag_1,y_lag_2,y_lag_3,y_lag_4,rolling_mean,rolling_min,rolling_max,general_rolling_mean,general_rolling_min,general_rolling_max,expanding_mean,expanding_max,expanding_min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2016-09-08,1.0,3,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-09-09,2.0,4,9,64.00417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.25934,64.51451,64.00417


Unnamed: 0_level_0,LeagueDay,day_of_week,month,y_lag_1,y_lag_2,y_lag_3,y_lag_4,rolling_mean,rolling_min,rolling_max,general_rolling_mean,general_rolling_min,general_rolling_max,expanding_mean,expanding_max,expanding_min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-07-23,1.0,4,7,104.50518,105.0,105.6665,105.89479,78.817024,50.0,105.0,87.189063,50.0,106.27503,111.78367,205.0,18.01531
2021-07-24,2.0,5,7,50.0,104.50518,105.0,105.6665,73.217024,50.0,104.50518,86.312877,50.0,106.0,111.754743,205.0,18.01531


shapes:  (1719, 5) (88, 5)


Unnamed: 0_level_0,y_step_1,y_step_2,y_step_3,y_step_4,y_step_5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-09-08,64.51451,65.0028,65.05764,65.98811,66.01724
2016-09-09,65.0028,65.05764,65.98811,66.01724,69.08462


Unnamed: 0_level_0,y_step_1,y_step_2,y_step_3,y_step_4,y_step_5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-23,62.0,72.57994,77.0,75.9145,75.0
2021-07-24,72.57994,77.0,75.9145,75.0,76.0


In [22]:
out_names = ['X_train', 'X_val', 'y_train', 'y_val']
outs_dfs = [X_train, X_val, y_train, y_val]
for name, df in zip(out_names, outs_dfs):
    out_name = f'./data-training/{name}.csv'
    print(f'Saving {out_name}')
    df.reset_index().to_csv(out_name, index=False)


Saving ./data-training/X_train.csv
Saving ./data-training/X_val.csv
Saving ./data-training/y_train.csv
Saving ./data-training/y_val.csv
