In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

#setup
import os, sys
current_path = os.path.abspath('')
parent_path = os.path.dirname(current_path)
source_path = f'{parent_path}\\src'
if not source_path in sys.path: sys.path.append(source_path)

#customs
# from data_model import *
# from data_helper import *
from config import *
setup()

Config finished!


In [2]:
df = pd.read_csv('./data/V1.ExaltedOrb.csv', index_col='Date', parse_dates=True)
df.head()

Unnamed: 0_level_0,Value,League,Confidence,LeagueDay
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-09-08,64.00417,Essence,0,1.0
2016-09-09,64.51451,Essence,0,2.0
2016-09-10,65.0028,Essence,0,3.0
2016-09-11,65.05764,Essence,0,4.0
2016-09-12,65.98811,Essence,0,5.0


In [3]:
def train_val_split(df, selected_features):
    is_last_league = df['League'] == Constants.LEAGUES[-1]

    df_selected = df[selected_features]
    df_train = df_selected[~is_last_league]
    df_val = df_selected[is_last_league]
    return df_train, df_val
def print_train_val_stats(train,val):
    print('shapes: ', train.shape, val.shape)
    display(train.head())
    display(val.head())

In [4]:
df_train, df_val = train_val_split(df, ['Value','LeagueDay','Confidence'])
print_train_val_stats(df_train, df_val)

shapes:  (1719, 3) (88, 3)


Unnamed: 0_level_0,Value,LeagueDay,Confidence
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-09-08,64.00417,1.0,0
2016-09-09,64.51451,2.0,0
2016-09-10,65.0028,3.0,0
2016-09-11,65.05764,4.0,0
2016-09-12,65.98811,5.0,0


Unnamed: 0_level_0,Value,LeagueDay,Confidence
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-07-23,50.0,1.0,1
2021-07-24,62.0,2.0,0
2021-07-25,72.57994,3.0,0
2021-07-26,77.0,4.0,0
2021-07-27,75.9145,5.0,0


In [84]:
def make_lags(ts, lags, lead_time=1):
    return pd.concat(
        {
            f'y_lag_{i}': ts.shift(i)
            for i in range(lead_time, lags + lead_time)
        },
        axis=1)
def make_multi_target(ts, steps, lead_time=1):
    return pd.concat(
        {
            f'y_step_{i}': ts.shift(-i)
            for i in range(lead_time, steps + lead_time)
        },
        axis=1)

from statsmodels.tsa.deterministic import DeterministicProcess

class FeatureEngineer:
    # def trend_features(self, df, training = True):
    #     if training:
    #         self.dp = DeterministicProcess(
    #             index = df.index,
    #             order = 4,
    #             drop = True
    #         )
    #         X = self.dp.in_sample().fillna(0)
    #     else:
    #         X = self.dp.out_of_sample(df.shape[0]).fillna(0)
        
    #     return X
    # def residual_features(self, df, training = True, lags = 4):
    #     if training:
    #         self.dp = DeterministicProcess(
    #             index = df.index,
    #             order = 1,
    #             drop = True
    #         )
    #         index_features = self.dp.in_sample().fillna(0)
    #     else:
    #         index_features = self.dp.out_of_sample(df.index).fillna(0)

    #     lags = make_lags(df['Value'], lags).fillna(0)
    #     selected_features = df[['Confidence', 'LeagueDay']]
    #     return pd.concat(
    #         [index_features, lags, selected_features],
    #         axis = 1
    #     )
    def target(self, df, steps = 5):
        return make_multi_target(df['Value'], steps).fillna(0)
    
    def time_features(self, df):
        df_return = df[['LeagueDay']]
        time_index = df.index


        df_return['day_of_week'] = time_index.dayofweek
        df_return['month'] = time_index.month

        return df_return

    def lag_features(self, df, lags = 4):
        return make_lags(df['Value'], lags).fillna(0)

    def rolling_features(self, df, local_window_size = 5, general_window_size=15):
        local_rolling = df['Value'].rolling(local_window_size, center = True)
        general_rolling = df['Value'].rolling(general_window_size, center = True)

        df_return = pd.DataFrame()
        df_return['rolling_mean'] = local_rolling.mean()
        df_return['rolling_min'] = local_rolling.min()
        df_return['rolling_max'] = local_rolling.max()
        df_return['general_rolling_mean'] = general_rolling.mean()
        df_return['general_rolling_min'] = general_rolling.min()
        df_return['general_rolling_max'] = general_rolling.max()
        return df_return        
    
    def expanded_features(self, df, min_period = 2):
        expanding = df['Value'].expanding(min_period, center = True)
        pd_return = pd.DataFrame()
        pd_return['expanding_mean'] = expanding.mean()
        pd_return['expanding_max'] = expanding.max()
        pd_return['expanding_min'] = expanding.min()
        return pd_return

    def all_features(self, df, fill0 = True):
        df_return =  pd.concat(
            [
                self.time_features(df),
                self.lag_features(df),
                self.rolling_features(df),
                self.expanded_features(df),
            ],
            
            axis = 1
        )
        return df_return.fillna(0) if fill0 else df_return

In [88]:
feature_engineer = FeatureEngineer()

X_train = feature_engineer.all_features(df_train)
y_train = feature_engineer.target(df_train)
X_val = feature_engineer.all_features(df_val)
y_val = feature_engineer.target(df_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [89]:
for x in [X_train, X_val, y_train, y_val]:
    print(x.shape)
    display(x.head())

(1719, 16)


Unnamed: 0_level_0,LeagueDay,day_of_week,month,y_lag_1,y_lag_2,y_lag_3,y_lag_4,rolling_mean,rolling_min,rolling_max,general_rolling_mean,general_rolling_min,general_rolling_max,expanding_mean,expanding_max,expanding_min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2016-09-08,1.0,3,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-09-09,2.0,4,9,64.00417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.25934,64.51451,64.00417
2016-09-10,3.0,5,9,64.51451,64.00417,0.0,0.0,64.913446,64.00417,65.98811,0.0,0.0,0.0,64.50716,65.0028,64.00417
2016-09-11,4.0,6,9,65.0028,64.51451,64.00417,0.0,65.31606,64.51451,66.01724,0.0,0.0,0.0,64.64478,65.05764,64.00417
2016-09-12,5.0,0,9,65.05764,65.0028,64.51451,64.00417,66.230082,65.0028,69.08462,0.0,0.0,0.0,64.913446,65.98811,64.00417


(88, 16)


Unnamed: 0_level_0,LeagueDay,day_of_week,month,y_lag_1,y_lag_2,y_lag_3,y_lag_4,rolling_mean,rolling_min,rolling_max,general_rolling_mean,general_rolling_min,general_rolling_max,expanding_mean,expanding_max,expanding_min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-07-23,1.0,4,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-07-24,2.0,5,7,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,62.0,50.0
2021-07-25,3.0,6,7,62.0,50.0,0.0,0.0,67.498888,50.0,77.0,0.0,0.0,0.0,61.526647,72.57994,50.0
2021-07-26,4.0,0,7,72.57994,62.0,50.0,0.0,72.498888,62.0,77.0,0.0,0.0,0.0,65.394985,77.0,50.0
2021-07-27,5.0,1,7,77.0,72.57994,62.0,50.0,75.298888,72.57994,77.0,0.0,0.0,0.0,67.498888,77.0,50.0


(1719, 5)


Unnamed: 0_level_0,y_step_1,y_step_2,y_step_3,y_step_4,y_step_5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-09-08,64.51451,65.0028,65.05764,65.98811,66.01724
2016-09-09,65.0028,65.05764,65.98811,66.01724,69.08462
2016-09-10,65.05764,65.98811,66.01724,69.08462,69.83986
2016-09-11,65.98811,66.01724,69.08462,69.83986,68.44797
2016-09-12,66.01724,69.08462,69.83986,68.44797,66.80162


(88, 5)


Unnamed: 0_level_0,y_step_1,y_step_2,y_step_3,y_step_4,y_step_5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-23,62.0,72.57994,77.0,75.9145,75.0
2021-07-24,72.57994,77.0,75.9145,75.0,76.0
2021-07-25,77.0,75.9145,75.0,76.0,80.0
2021-07-26,75.9145,75.0,76.0,80.0,93.13224
2021-07-27,75.0,76.0,80.0,93.13224,93.0


In [91]:
out_names = ['X_train', 'X_val', 'y_train', 'y_val']
outs_dfs = [X_train, X_val, y_train, y_val]
for name, df in zip(out_names, outs_dfs):
    out_name = f'./data/{name}.csv'
    print(f'Saving {out_name}')
    df.reset_index().to_csv(out_name, index=False)


Saving ./data/X_train.csv
Saving ./data/X_val.csv
Saving ./data/y_train.csv
Saving ./data/y_val.csv
