# In [2]:
# Install optuna if you haven't already
# pip install optuna
#%load_ext 2
from utils.dataset import load_historical, load_live
from utils.pipeline_random_forest import main as random_forest_main
from utils.pipeline_xg_boost import main as xgboost_main
from utils.pipeline_light_gbm import main as light_gbm_main
from utils.pipeline_linear_regression import main as linear_regression_main
from utils._config import run_parameters
from utils.IO import save_to_json
from utils.evaluate import create_feature_importance_plot
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import os

import warnings
warnings.filterwarnings('ignore')

# Load the historical (training-like) dataset and split it into features / target.
master_hist = load_historical()
X_hist = master_hist.drop(columns=['target'])
y_hist = master_hist['target']

# Load the live (test-like) dataset and split it the same way.
# The features must be taken from master_live (not master_hist): predictions
# made on X_live are later written back as a column of a master_live copy, so
# the two frames have to describe the same rows.
master_live = load_live()
X_live = master_live.drop(columns=['target'])
y_live = master_live['target']

# Feature column labels of the live dataset (everything except 'target').
X_cols = X_live.columns

# In [5]:
def pull_folder_query(iteration, run_date_str, feature_set, model_name, period):
    """Build the relative results-folder path for one model run.

    The path starts with ``model_results`` and is followed by one
    ``key=value`` segment per argument, in the order iteration, run_date,
    feature_set, model, period. Segments are joined with a literal backslash
    and the result ends with a trailing backslash.

    Args:
        iteration: Value for the ``iteration=`` segment.
        run_date_str: Value for the ``run_date=`` segment.
        feature_set: Value for the ``feature_set=`` segment.
        model_name: Value for the ``model=`` segment.
        period: Value for the ``period=`` segment.

    Returns:
        The backslash-separated folder path as a string.
    """
    partitions = (
        ('iteration', iteration),
        ('run_date', run_date_str),
        ('feature_set', feature_set),
        ('model', model_name),
        ('period', period),
    )
    segments = ['model_results']
    segments.extend(f'{key}={value}' for key, value in partitions)
    # The separator is a hard-coded backslash; the trailing one lets callers
    # append a file name directly.
    return '\\'.join(segments) + '\\'


def run_pipeline(model_name, master_live, pipeline_main_function):
    """Train one model on the historical data, score the live data, save results.

    Reads the module-level ``X_hist``, ``y_hist``, ``X_cols`` and
    ``run_parameters``. Two folders are created under
    ``run_parameters['root']`` (one for ``period=train``, one for
    ``period=live``) and the artefacts are written into them.

    Args:
        model_name: Label used for the ``model=`` segment of the output path.
        master_live: DataFrame of live rows, including a ``target`` column.
            Predictions are made on its non-target columns and written to a
            copy of it; the frame passed in is not modified.
        pipeline_main_function: Callable invoked as ``f(X_hist, y_hist)`` that
            returns ``(model_validation, best_params, fitted_model)``.

    Returns:
        None.
    """
    iteration = run_parameters['development_iteration']
    root = run_parameters['root']
    feature_set = run_parameters['feature_set']

    # Compact YYYYMMDD stamp of today's date, used in the output path.
    run_date_str = pd.Timestamp.now().strftime('%Y%m%d')

    # --- Training period: fit/tune the model and persist its artefacts. ---
    model_validation, best_params, fitted_model = pipeline_main_function(X_hist, y_hist)

    train_dir = root + pull_folder_query(iteration, run_date_str, feature_set, model_name, 'train')
    os.makedirs(train_dir, exist_ok=True)
    save_to_json(best_params, train_dir + 'hyper_params.json')
    create_feature_importance_plot(fitted_model, X_cols, train_dir + 'feature_importance.png')
    model_validation.to_csv(train_dir + 'scores.csv', index=False)

    # --- Live period: score the rows that were actually passed in. ---
    # Features are derived from the ``master_live`` argument rather than a
    # module-level frame, so the predictions always match the rows (and row
    # count) of the frame they are stored in.
    live_features = master_live.drop(columns=['target'])
    write_live_results = master_live.copy()
    write_live_results['y_pred'] = fitted_model.predict(live_features)
    write_live_results['run_type'] = 'production'

    live_dir = root + pull_folder_query(iteration, run_date_str, feature_set, model_name, 'live')
    os.makedirs(live_dir, exist_ok=True)
    write_live_results.to_csv(live_dir + 'predictions.csv', index=False)


def main():
    """Run each model pipeline, one after another, on ``master_live``."""
    # (output label, pipeline entry point) pairs, in execution order.
    pipelines = (
        ('linear_regression', linear_regression_main),
        ('random_forest', random_forest_main),
        ('xgboost', xgboost_main),
        ('light_gbm', light_gbm_main),
    )
    for model_name, pipeline_main_function in pipelines:
        run_pipeline(model_name, master_live, pipeline_main_function)
  
# Execute every model pipeline as soon as this cell / script is run.
main()


