In [None]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to the local `ml_tools`
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
import optuna
import cufflinks as cf
import warnings
import datetime

from ml_tools.datasets import generate_synthetic_data
from ml_tools.eval import get_mae_from_cv_time_series
from ml_tools.train_start_selector import TrainStartSelector
from ml_tools.feature_selector import FeatureSelector
from ml_tools.tuner import Tuner
from ml_tools.model_assumption_selector import ModelAssumptionSelector

# Put cufflinks into offline mode so `.iplot()` works without a Plotly account.
cf.go_offline()

# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and removed in
# later releases (renamed to 'seaborn-v0_8'). Prefer the new name when the
# installed Matplotlib provides it and fall back to the legacy one otherwise,
# so this cell runs on both old and new versions.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams['figure.figsize'] = 20, 4

# NOTE(review): the two settings below silence pandas' chained-assignment
# check and *all* Python warnings for the whole notebook. That keeps the demo
# output clean but can also hide genuine problems.
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

In [None]:
# Shared assumptions used by every section of the notebook.

# First date of the evaluation window; rows from this date onwards are used
# to size `eval_window_rows` and are passed as `cv_start` to the selectors.
cv_start = datetime.date(2021, 6, 10)

# Name of the target column in the synthetic data frames.
label = 'label'

# The notebook draws noise from NumPy's global RNG (`np.random.normal`) in
# several cells. Seeding it here makes a Restart & Run All reproduce the same
# synthetic features on every run.
random_seed = 42
np.random.seed(random_seed)

## 0) ModelAssumptionSelector - Combined Pipe Demo

In [None]:
# Synthetic daily series. The rest of this cell relies on the returned frame
# exposing the `label` column and an index that can be compared with dates.
df = generate_synthetic_data(
    freq='d',
    weekday_offset=True,
    yearly_offset=True,
    trend_stop_value=100
)

# `feature` is the label plus mild Gaussian noise (std 5). The column is
# addressed through the `label` variable rather than `df.label`, so the cell
# keeps working if the target column name is ever changed.
noise = np.random.normal(size=len(df), scale=5)
df['feature'] = df[label] + noise

# Before 2020 the feature is overwritten with a much noisier copy (std 50),
# presumably so that old history is less useful for training.
chaos_slice = df.index < '2020-01-01'
df.loc[chaos_slice, 'feature'] = (
    df.loc[chaos_slice, label] +
    # Vectorised count of True entries instead of Python-level `sum(mask)`.
    np.random.normal(size=chaos_slice.sum(), scale=50))

# Pure noise that is generated independently of the label.
df['trash_feature'] = np.random.normal(size=len(df), scale=30)

# Number of rows from `cv_start` to the end of the frame; used as the size of
# the evaluation window.
eval_window_rows = len(df[cv_start:])

# Step 1: select the train start.
train_start_step = TrainStartSelector(
    eval_window_rows=eval_window_rows,
    min_train_rows=7 * 4,  # four weeks of daily rows
    n_trials=3,
)

# Step 2: select the best features.
feature_step = FeatureSelector()

# Step 3: select the best hyperparameters. Each entry appears to be
# (parameter name, optuna suggest method, low, high) -- confirm against Tuner.
tuning_step = Tuner(
    lazy_optuna_space=[
        ('learning_rate', 'trial.suggest_float', 0.03, 0.3),
        ('n_estimators', 'trial.suggest_int', 10, 100),
    ],
    n_trials=20,
)

# Chain the three selectors in the order listed above.
mas = ModelAssumptionSelector(
    selectors=(train_start_step, feature_step, tuning_step),
)
    
# Every column except the label is offered as a candidate feature.
candidate_features = [col for col in df.columns if col != label]

best_assumptions = mas.run(
    eval_func=get_mae_from_cv_time_series,
    df=df,
    model_ref=LGBMRegressor,
    feature_list=candidate_features,
)

# Last expression of the cell, so the result is displayed.
best_assumptions

## 1) Train Start Selector

In [None]:
# Fresh synthetic daily series for the train-start demo. The rest of this
# cell relies on the returned frame exposing the `label` column and an index
# that can be compared with dates.
df = generate_synthetic_data(
    freq='d',
    weekday_offset=True,
    yearly_offset=True,
    trend_stop_value=100
)

# `feature` is the label plus mild Gaussian noise (std 5). The column is
# addressed through the `label` variable rather than `df.label`, so the cell
# keeps working if the target column name is ever changed.
noise = np.random.normal(size=len(df), scale=5)
df['feature'] = df[label] + noise

# Before 2020 the feature is overwritten with a much noisier copy (std 50),
# presumably so that starting the training window later pays off.
chaos_slice = df.index < '2020-01-01'
df.loc[chaos_slice, 'feature'] = (
    df.loc[chaos_slice, label] +
    # Vectorised count of True entries instead of Python-level `sum(mask)`.
    np.random.normal(size=chaos_slice.sum(), scale=50))

In [None]:
# Interactive cufflinks/Plotly chart of the whole frame, with a range slider
# underneath for zooming along the index.
df.iplot(rangeslider=True)

In [None]:
# Evaluation window: every row from `cv_start` to the end of the frame.
eval_rows = len(df[cv_start:])

tss = TrainStartSelector(
    eval_window_rows=eval_rows,
    min_train_rows=7 * 4,  # four weeks of daily rows
    n_trials=20,
)

# Kept as the cell's last expression so the returned value is displayed.
tss.run(
    eval_func=get_mae_from_cv_time_series,
    df=df,
    model_ref=LGBMRegressor,
    cv_start=cv_start,
)

## 2) Feature Selector

In [None]:
# Regenerate the synthetic daily series so the feature-selector demo starts
# from a frame without the columns added in the previous sections.
synthetic_params = {
    'freq': 'd',
    'weekday_offset': True,
    'yearly_offset': True,
    'trend_stop_value': 100,
}
df = generate_synthetic_data(**synthetic_params)

In [None]:
# One base noise vector (std 0.3) is shared by both engineered features and is
# scaled by a linear ramp over the rows.
# NOTE(review): because the same draw is reused, the errors of the two
# features are perfectly correlated -- confirm this is intended.
noise = np.random.normal(size=len(df), scale=0.3)

# Noise multiplier grows from 30 to 100, so this feature gets worse over time.
# The label is read via the `label` variable rather than `df.label`, so the
# cell keeps working if the target column name is ever changed.
df['signal_decay'] = df[label] + noise * np.linspace(30, 100, len(df))

# Noise multiplier shrinks from 100 to 30, so this feature gets better over time.
df['signal_improves'] = df[label] + noise * np.linspace(100, 30, len(df))

In [None]:
# Plot the label next to both engineered signals to compare how their noise
# evolves along the index.
plot_columns = [label, 'signal_decay', 'signal_improves']
df[plot_columns].iplot(rangeslider=True)

In [None]:
# Every column except the label is offered as a candidate feature.
candidate_features = [col for col in df.columns if col != label]

fs = FeatureSelector()
best_features = fs.run(
    eval_func=get_mae_from_cv_time_series,
    df=df,
    model_ref=LGBMRegressor,
    feature_list=candidate_features,
)

## 3) Tuner

In [None]:
# Search space handed to the Tuner. Each entry appears to be
# (parameter name, optuna suggest method, low, high) -- confirm against Tuner.
search_space = [
    ('learning_rate', 'trial.suggest_float', 0.03, 0.3),
    ('n_estimators', 'trial.suggest_int', 10, 100),
]
tuner = Tuner(lazy_optuna_space=search_space, n_trials=5)

# Kept as the cell's last expression so the returned value is displayed.
tuner.run(
    eval_func=get_mae_from_cv_time_series,
    df=df,
    model_ref=LGBMRegressor,
    feature_list=[col for col in df.columns if col != label],
)