# Import

In [1]:
import numpy as np
import pandas as pd

import fastparquet
from fastparquet import write, ParquetFile

import os
import matplotlib.pyplot as plt
import seaborn as sns

import gc
import random
from tqdm.autonotebook import tqdm
import datetime 

from scipy.stats import norm

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


import lightgbm as lgb

import mlflow
from mlflow.tracking import MlflowClient

# Support warning
import warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm


# Settings

In [2]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
mlflow.set_tracking_uri("postgresql://mlflow:mlflow@localhost/mlflow")
mlflow.set_registry_uri("file:/home/podsyp/mlruns")

In [4]:
model_params = {
    'model_type': 'dev',
    'batch_size': 64, 
    'seed': 777,
    'version': 1,
    'window': 5,
    'lag_type': 'm',
    'target_type': 'regression',
    'train_years': [2015, 2016],
    'valid_years': [2017],
    'test_years': [2018],
}

In [5]:
def global_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    str_seed = str(seed)
    os.environ["PYTHONHASHSEED"] = str_seed
    #torch.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    
global_seed(model_params['seed'])

# Read Parquet File

In [6]:
tmp = []
for y in model_params['train_years']:
    pf = ParquetFile('model_data/X_' + str(y) + '.parq')
    tmp.append(pf.to_pandas())
X_train = pd.concat(tmp)

In [7]:
tmp = []
for y in model_params['valid_years']:
    pf = ParquetFile('model_data/X_' + str(y) + '.parq')
    tmp.append(pf.to_pandas())
X_val = pd.concat(tmp)

In [8]:
tmp = []
for y in model_params['test_years']:
    pf = ParquetFile('model_data/X_' + str(y) + '.parq')
    tmp.append(pf.to_pandas())
X_test = pd.concat(tmp)

In [9]:
X_train.shape, X_val.shape, X_test.shape

((17472, 104), (8760, 104), (2952, 104))

In [10]:
X_train.head()

Unnamed: 0_level_0,date_time,T_data_1_1,T_data_1_2,T_data_1_3,T_data_2_1,T_data_2_2,T_data_2_3,T_data_3_1,T_data_3_2,T_data_3_3,T_data_4_1,T_data_4_2,T_data_4_3,T_data_5_1,T_data_5_2,T_data_5_3,H_data,AH_data,T_data_1_1_1,T_data_1_2_1,T_data_1_3_1,T_data_2_1_1,T_data_2_2_1,T_data_2_3_1,T_data_3_1_1,T_data_3_2_1,T_data_3_3_1,T_data_4_1_1,T_data_4_2_1,T_data_4_3_1,T_data_5_1_1,T_data_5_2_1,T_data_5_3_1,H_data_1,AH_data_1,T_data_1_1_2,T_data_1_2_2,T_data_1_3_2,T_data_2_1_2,T_data_2_2_2,T_data_2_3_2,T_data_3_1_2,T_data_3_2_2,T_data_3_3_2,T_data_4_1_2,T_data_4_2_2,T_data_4_3_2,T_data_5_1_2,T_data_5_2_2,T_data_5_3_2,H_data_2,AH_data_2,T_data_1_1_3,T_data_1_2_3,T_data_1_3_3,T_data_2_1_3,T_data_2_2_3,T_data_2_3_3,T_data_3_1_3,T_data_3_2_3,T_data_3_3_3,T_data_4_1_3,T_data_4_2_3,T_data_4_3_3,T_data_5_1_3,T_data_5_2_3,T_data_5_3_3,H_data_3,AH_data_3,T_data_1_1_4,T_data_1_2_4,T_data_1_3_4,T_data_2_1_4,T_data_2_2_4,T_data_2_3_4,T_data_3_1_4,T_data_3_2_4,T_data_3_3_4,T_data_4_1_4,T_data_4_2_4,T_data_4_3_4,T_data_5_1_4,T_data_5_2_4,T_data_5_3_4,H_data_4,AH_data_4,T_data_1_1_5,T_data_1_2_5,T_data_1_3_5,T_data_2_1_5,T_data_2_2_5,T_data_2_3_5,T_data_3_1_5,T_data_3_2_5,T_data_3_3_5,T_data_4_1_5,T_data_4_2_5,T_data_4_3_5,T_data_5_1_5,T_data_5_2_5,T_data_5_3_5,H_data_5,AH_data_5,quality
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1
0,2015-01-04 00:05:00,277,326,273,322,335,352,505,501,670,326,379,337,231,236,242,153.77,7.9,277,327,273,322,335,352,505,501,667,326,378,337,231,236,242,155.45,7.9,276,328,273,322,335,351,505,501,664,326,378,337,231,236,242,154.12,7.9,276,328,272,322,335,351,505,501,661,325,377,337,231,236,243,158.29,7.9,276,329,272,322,335,351,505,501,658,325,377,337,231,236,243,152.62,7.9,276,330,272,323,335,351,505,501,655,325,377,337,231,236,243,157.74,7.9,392
1,2015-01-04 01:05:00,277,253,272,320,333,355,500,501,687,337,396,335,234,242,230,158.27,6.96,277,254,272,320,333,356,500,501,690,337,396,335,234,242,230,156.0,6.96,277,256,272,320,333,356,500,501,693,336,396,336,234,241,230,159.17,6.96,277,257,272,320,333,356,500,501,695,336,396,336,234,241,230,156.39,6.96,277,259,272,320,333,356,500,501,697,336,396,336,233,241,230,156.31,6.96,277,260,272,320,333,356,500,502,700,336,395,336,233,241,230,158.7,6.96,384
2,2015-01-04 02:05:00,262,218,260,326,336,330,505,499,443,347,399,332,243,251,240,153.36,7.29,262,218,260,326,336,330,505,499,443,347,399,332,243,251,240,154.15,7.29,263,218,260,326,336,331,505,499,442,347,399,333,243,251,239,156.51,7.29,264,218,261,326,336,332,505,499,442,347,399,333,242,251,239,156.11,7.29,264,218,261,325,336,332,505,499,442,347,399,333,242,250,239,155.23,7.29,265,218,261,325,336,333,504,499,443,347,399,333,242,250,238,155.58,7.29,393
3,2015-01-04 03:05:00,243,238,252,327,329,308,520,498,540,342,387,334,257,258,246,153.21,7.11,243,238,252,327,329,308,520,498,540,342,387,334,256,258,246,154.79,7.11,243,237,252,327,329,308,520,498,539,342,387,334,256,258,246,154.73,7.11,243,237,252,327,329,308,520,498,538,342,388,334,256,258,246,161.53,7.11,243,237,252,327,330,308,519,498,538,342,388,333,256,258,246,156.34,7.11,243,237,252,327,330,308,519,498,537,342,388,333,256,258,247,155.93,7.11,399
4,2015-01-04 04:05:00,236,238,245,323,320,318,522,501,524,343,371,344,264,263,265,195.71,7.97,236,238,245,323,320,318,522,501,525,343,371,344,264,263,264,195.6,7.97,236,238,245,323,321,318,523,501,525,343,371,344,264,263,263,191.37,7.97,237,238,246,324,321,318,523,501,526,343,371,343,264,263,262,196.53,7.97,237,238,246,324,321,318,523,501,526,343,372,343,264,263,261,194.83,7.97,237,239,246,324,321,317,523,501,527,342,372,343,263,263,261,191.98,7.97,400


# Prepare Data

In [11]:
y_train, y_val, y_test = X_train['quality'], X_val['quality'], X_test['quality']
X_train.drop(['quality'], axis=1, inplace=True)
X_val.drop(['quality'], axis=1, inplace=True)
X_test.drop(['quality'], axis=1, inplace=True)
X_train.drop(['date_time'], axis=1, inplace=True)
X_val.drop(['date_time'], axis=1, inplace=True)
X_test.drop(['date_time'], axis=1, inplace=True)

In [12]:
X_train.head()

Unnamed: 0_level_0,T_data_1_1,T_data_1_2,T_data_1_3,T_data_2_1,T_data_2_2,T_data_2_3,T_data_3_1,T_data_3_2,T_data_3_3,T_data_4_1,T_data_4_2,T_data_4_3,T_data_5_1,T_data_5_2,T_data_5_3,H_data,AH_data,T_data_1_1_1,T_data_1_2_1,T_data_1_3_1,T_data_2_1_1,T_data_2_2_1,T_data_2_3_1,T_data_3_1_1,T_data_3_2_1,T_data_3_3_1,T_data_4_1_1,T_data_4_2_1,T_data_4_3_1,T_data_5_1_1,T_data_5_2_1,T_data_5_3_1,H_data_1,AH_data_1,T_data_1_1_2,T_data_1_2_2,T_data_1_3_2,T_data_2_1_2,T_data_2_2_2,T_data_2_3_2,T_data_3_1_2,T_data_3_2_2,T_data_3_3_2,T_data_4_1_2,T_data_4_2_2,T_data_4_3_2,T_data_5_1_2,T_data_5_2_2,T_data_5_3_2,H_data_2,AH_data_2,T_data_1_1_3,T_data_1_2_3,T_data_1_3_3,T_data_2_1_3,T_data_2_2_3,T_data_2_3_3,T_data_3_1_3,T_data_3_2_3,T_data_3_3_3,T_data_4_1_3,T_data_4_2_3,T_data_4_3_3,T_data_5_1_3,T_data_5_2_3,T_data_5_3_3,H_data_3,AH_data_3,T_data_1_1_4,T_data_1_2_4,T_data_1_3_4,T_data_2_1_4,T_data_2_2_4,T_data_2_3_4,T_data_3_1_4,T_data_3_2_4,T_data_3_3_4,T_data_4_1_4,T_data_4_2_4,T_data_4_3_4,T_data_5_1_4,T_data_5_2_4,T_data_5_3_4,H_data_4,AH_data_4,T_data_1_1_5,T_data_1_2_5,T_data_1_3_5,T_data_2_1_5,T_data_2_2_5,T_data_2_3_5,T_data_3_1_5,T_data_3_2_5,T_data_3_3_5,T_data_4_1_5,T_data_4_2_5,T_data_4_3_5,T_data_5_1_5,T_data_5_2_5,T_data_5_3_5,H_data_5,AH_data_5
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1
0,277,326,273,322,335,352,505,501,670,326,379,337,231,236,242,153.77,7.9,277,327,273,322,335,352,505,501,667,326,378,337,231,236,242,155.45,7.9,276,328,273,322,335,351,505,501,664,326,378,337,231,236,242,154.12,7.9,276,328,272,322,335,351,505,501,661,325,377,337,231,236,243,158.29,7.9,276,329,272,322,335,351,505,501,658,325,377,337,231,236,243,152.62,7.9,276,330,272,323,335,351,505,501,655,325,377,337,231,236,243,157.74,7.9
1,277,253,272,320,333,355,500,501,687,337,396,335,234,242,230,158.27,6.96,277,254,272,320,333,356,500,501,690,337,396,335,234,242,230,156.0,6.96,277,256,272,320,333,356,500,501,693,336,396,336,234,241,230,159.17,6.96,277,257,272,320,333,356,500,501,695,336,396,336,234,241,230,156.39,6.96,277,259,272,320,333,356,500,501,697,336,396,336,233,241,230,156.31,6.96,277,260,272,320,333,356,500,502,700,336,395,336,233,241,230,158.7,6.96
2,262,218,260,326,336,330,505,499,443,347,399,332,243,251,240,153.36,7.29,262,218,260,326,336,330,505,499,443,347,399,332,243,251,240,154.15,7.29,263,218,260,326,336,331,505,499,442,347,399,333,243,251,239,156.51,7.29,264,218,261,326,336,332,505,499,442,347,399,333,242,251,239,156.11,7.29,264,218,261,325,336,332,505,499,442,347,399,333,242,250,239,155.23,7.29,265,218,261,325,336,333,504,499,443,347,399,333,242,250,238,155.58,7.29
3,243,238,252,327,329,308,520,498,540,342,387,334,257,258,246,153.21,7.11,243,238,252,327,329,308,520,498,540,342,387,334,256,258,246,154.79,7.11,243,237,252,327,329,308,520,498,539,342,387,334,256,258,246,154.73,7.11,243,237,252,327,329,308,520,498,538,342,388,334,256,258,246,161.53,7.11,243,237,252,327,330,308,519,498,538,342,388,333,256,258,246,156.34,7.11,243,237,252,327,330,308,519,498,537,342,388,333,256,258,247,155.93,7.11
4,236,238,245,323,320,318,522,501,524,343,371,344,264,263,265,195.71,7.97,236,238,245,323,320,318,522,501,525,343,371,344,264,263,264,195.6,7.97,236,238,245,323,321,318,523,501,525,343,371,344,264,263,263,191.37,7.97,237,238,246,324,321,318,523,501,526,343,371,343,264,263,262,196.53,7.97,237,238,246,324,321,318,523,501,526,343,372,343,264,263,261,194.83,7.97,237,239,246,324,321,317,523,501,527,342,372,343,263,263,261,191.98,7.97


# Baseline Model

In [11]:
def get_metrics(y_true, y_pred, set_='train', model_='baseline'):
    r_2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    medae = metrics.median_absolute_error(y_true, y_pred)
    msle = metrics.mean_squared_log_error(y_true, y_pred)
    print('Metrics for model: ' + model_ + ' on ' + set_ + ' dataset. ')
    print('R^2: ', r_2)
    print('Mean absolute error: ', mae)
    print('Mean squared error: ', mse)
    print('Median absolute error: ', medae)
    print('Mean squared log error: ', msle)
    return (model_, set_, r_2, mae, mse, medae, msle)

In [12]:
average = y_train.mean()
average

402.31570512820514

In [13]:
metrics_avg_train = get_metrics(y_train, [average for l in range(len(y_train))], set_='train', model_='baseline average')
metrics_avg_val = get_metrics(y_val, [average for l in range(len(y_val))], set_='val', model_='baseline average')
metrics_avg_test = get_metrics(y_test, [average for l in range(len(y_test))], set_='test', model_='baseline average')

Metrics for model: baseline average on train dataset. 
R^2:  0.0
Mean absolute error:  37.57635752001737
Mean squared error:  2119.0626471218416
Median absolute error:  33.68429487179486
Mean squared log error:  0.014301898722136372
Metrics for model: baseline average on val dataset. 
R^2:  -0.0023344406773935233
Mean absolute error:  37.84591236389182
Mean squared error:  2139.9343341408066
Median absolute error:  33.68429487179486
Mean squared log error:  0.014215843335538818
Metrics for model: baseline average on test dataset. 
R^2:  -0.0014718126690811406
Mean absolute error:  39.35503635084428
Mean squared error:  2277.6954598485368
Median absolute error:  36.31570512820514
Mean squared log error:  0.01565256300626607


In [14]:
client = MlflowClient(tracking_uri="postgresql://mlflow:mlflow@localhost/mlflow", registry_uri="file:/home/podsyp/mlruns")
exp_id_cl = mlflow.create_experiment('production_quality')

In [15]:
run_cl = client.create_run(experiment_id=exp_id_cl, start_time=None, tags=None)

In [16]:
exp_id_cl

'23'

In [17]:
tags = dict()
tags['model'] = 'Baseline Average'
tags['model_type'] = model_params['model_type']
tags['target_type'] = model_params['target_type']
tags['seed'] = model_params['seed']
tags['version'] = model_params['version']
tags['window'] = model_params['window']
tags['lag_type'] = model_params['lag_type']
tags['domain'] = 'production'
tags['predict'] = 'quality'

In [18]:
tags

{'model': 'Baseline Average',
 'model_type': 'dev',
 'target_type': 'regression',
 'seed': 777,
 'version': 1,
 'window': 5,
 'lag_type': 'm',
 'domain': 'production',
 'predict': 'quality'}

In [19]:
with mlflow.start_run(run_id=run_cl.info.run_id, experiment_id=exp_id_cl, run_name='Baseline Average'):
    mlflow.set_tags(tags)
    mlflow.log_param('value', average)
    
    for m in [metrics_avg_train, metrics_avg_val, metrics_avg_test]:
        mlflow.log_metric(m[1]+'_R2', round(m[2], 4))
        mlflow.log_metric(m[1]+'_MAE', round(m[3], 4))
        mlflow.log_metric(m[1]+'_MSE', round(m[4], 4))
        mlflow.log_metric(m[1]+'_MedAE', round(m[5], 4))
        mlflow.log_metric(m[1]+'_MSLE', round(m[6], 4))

# Random Forest Model

In [20]:
rf = RandomForestRegressor(random_state=model_params['seed'])
rf.fit(X_train, y_train)
y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)
y_test_pred = rf.predict(X_test)

In [21]:
metrics_rf_train = get_metrics(y_train, y_train_pred, set_='train', model_=' RandomForestRegressor')
metrics_rf_val = get_metrics(y_val, y_val_pred, set_='val', model_=' RandomForestRegressor')
metrics_rf_test = get_metrics(y_test, y_test_pred, set_='test', model_=' RandomForestRegressor')

Metrics for model:  RandomForestRegressor on train dataset. 
R^2:  0.9884145236955612
Mean absolute error:  3.644838026556777
Mean squared error:  24.550350085851647
Median absolute error:  2.75
Mean squared log error:  0.00017881318183469205
Metrics for model:  RandomForestRegressor on val dataset. 
R^2:  0.9178201793922934
Mean absolute error:  9.866874429223746
Mean squared error:  175.44984244292237
Median absolute error:  7.6299999999999955
Mean squared log error:  0.0012425420661512818
Metrics for model:  RandomForestRegressor on test dataset. 
R^2:  0.9099533846951381
Mean absolute error:  10.478573848238483
Mean squared error:  204.7973435298103
Median absolute error:  8.0
Mean squared log error:  0.001531656613861809


In [22]:
kf = KFold(n_splits=3, random_state=17)

In [23]:
params = {
    'n_estimators':range(75, 250, 5),
    'max_depth': range(2, 7, 1),
    #'min_samples_leaf':range(1, 1000, 100)
}

random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=model_params['seed']), param_distributions=params, 
    cv=kf, verbose=1, n_jobs=-1, n_iter=100, scoring='r2'
)
random_search.fit(y_train_pred.reshape(-1, 1), y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   41.6s finished


RandomizedSearchCV(cv=KFold(n_splits=3, random_state=17, shuffle=False),
                   estimator=RandomForestRegressor(random_state=777),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': range(2, 7),
                                        'n_estimators': range(75, 250, 5)},
                   scoring='r2', verbose=1)

In [24]:
random_search.best_params_

{'n_estimators': 145, 'max_depth': 6}

In [25]:
rf = RandomForestRegressor(random_state=model_params['seed'], **random_search.best_params_)
rf.fit(X_train, y_train)
y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)
y_test_pred = rf.predict(X_test)

In [26]:
metrics_rf_train = get_metrics(y_train, y_train_pred, set_='train', model_=' RandomForestRegressor')
metrics_rf_val = get_metrics(y_val, y_val_pred, set_='val', model_=' RandomForestRegressor')
metrics_rf_test = get_metrics(y_test, y_test_pred, set_='test', model_=' RandomForestRegressor')

Metrics for model:  RandomForestRegressor on train dataset. 
R^2:  0.8799111344416433
Mean absolute error:  12.264063011256933
Mean squared error:  254.47582933995048
Median absolute error:  9.84402941047594
Mean squared log error:  0.001820437741046928
Metrics for model:  RandomForestRegressor on val dataset. 
R^2:  0.8707392927854453
Mean absolute error:  12.790802361553403
Mean squared error:  275.9652010329117
Median absolute error:  10.250552011072983
Mean squared log error:  0.0019227486924722043
Metrics for model:  RandomForestRegressor on test dataset. 
R^2:  0.8608424307048848
Mean absolute error:  13.675101691464587
Mean squared error:  316.49274575416894
Median absolute error:  11.056245159761573
Mean squared log error:  0.0023571129192894204


In [27]:
run_cl = client.create_run(experiment_id=exp_id_cl, start_time=None, tags=None, )

In [28]:
tags = dict()
tags['model'] = 'Random Forest Regressor'
tags['model_type'] = model_params['model_type']
tags['target_type'] = model_params['target_type']
tags['seed'] = model_params['seed']
tags['version'] = model_params['version']
tags['window'] = model_params['window']
tags['lag_type'] = model_params['lag_type']
tags['domain'] = 'production'
tags['predict'] = 'quality'

In [29]:
with mlflow.start_run(run_id=run_cl.info.run_id, experiment_id=exp_id_cl, run_name='RFR'):
    mlflow.set_tags(tags)
    mlflow.log_params(rf.get_params())
    
    for m in [metrics_rf_train, metrics_rf_val, metrics_rf_test]:
        mlflow.log_metric(m[1]+'_R2', round(m[2], 4))
        mlflow.log_metric(m[1]+'_MAE', round(m[3], 4))
        mlflow.log_metric(m[1]+'_MSE', round(m[4], 4))
        mlflow.log_metric(m[1]+'_MedAE', round(m[5], 4))
        mlflow.log_metric(m[1]+'_MSLE', round(m[6], 4))
    
    mlflow.sklearn.log_model(rf, "sk_rf")

# Gradient Boosting Model

In [30]:
gb = GradientBoostingRegressor(random_state=model_params['seed'])
gb.fit(X_train, y_train)
y_train_pred = gb.predict(X_train)
y_val_pred = gb.predict(X_val)
y_test_pred = gb.predict(X_test)

In [31]:
metrics_gb_train = get_metrics(y_train, y_train_pred, set_='train', model_='GradientBoostingRegressor')
metrics_gb_val = get_metrics(y_val, y_val_pred, set_='val', model_='GradientBoostingRegressor')
metrics_gb_test = get_metrics(y_test, y_test_pred, set_='test', model_='GradientBoostingRegressor')

Metrics for model: GradientBoostingRegressor on train dataset. 
R^2:  0.9173565222065103
Mean absolute error:  9.896935866829123
Mean squared error:  175.12670682042727
Median absolute error:  7.544363446355447
Mean squared log error:  0.0012382936057020442
Metrics for model: GradientBoostingRegressor on val dataset. 
R^2:  0.9037206218681868
Mean absolute error:  10.559273538125044
Mean squared error:  205.55169868726964
Median absolute error:  7.938206553764616
Mean squared log error:  0.0014406028889645832
Metrics for model: GradientBoostingRegressor on test dataset. 
R^2:  0.8998507627545236
Mean absolute error:  11.179455468666914
Mean squared error:  227.7742219956921
Median absolute error:  8.611114537091794
Mean squared log error:  0.001677645642826402


In [32]:
params = {
    'n_estimators':range(75, 250, 5),
    'max_depth': range(2, 7, 1),
    #'min_samples_leaf':range(1, 1000, 100)
}

random_search = RandomizedSearchCV(
    GradientBoostingRegressor(random_state=model_params['seed']), param_distributions=params, 
    cv=kf, verbose=1, n_jobs=-1, n_iter=100, scoring='r2'
)
random_search.fit(y_train_pred.reshape(-1, 1), y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.0min finished


RandomizedSearchCV(cv=KFold(n_splits=3, random_state=17, shuffle=False),
                   estimator=GradientBoostingRegressor(random_state=777),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': range(2, 7),
                                        'n_estimators': range(75, 250, 5)},
                   scoring='r2', verbose=1)

In [33]:
random_search.best_params_

{'n_estimators': 75, 'max_depth': 3}

In [34]:
gb = GradientBoostingRegressor(random_state=model_params['seed'], **random_search.best_params_)
gb.fit(X_train, y_train)
y_train_pred = gb.predict(X_train)
y_val_pred = gb.predict(X_val)
y_test_pred = gb.predict(X_test)

In [35]:
metrics_gb_train = get_metrics(y_train, y_train_pred, set_='train', model_='GradientBoostingRegressor')
metrics_gb_val = get_metrics(y_val, y_val_pred, set_='val', model_='GradientBoostingRegressor')
metrics_gb_test = get_metrics(y_test, y_test_pred, set_='test', model_='GradientBoostingRegressor')

Metrics for model: GradientBoostingRegressor on train dataset. 
R^2:  0.911841896635885
Mean absolute error:  10.23741028057228
Mean squared error:  186.81254388000258
Median absolute error:  7.796444752264506
Mean squared log error:  0.0013266994589074274
Metrics for model: GradientBoostingRegressor on val dataset. 
R^2:  0.8994424452144043
Mean absolute error:  10.843474980646679
Mean squared error:  214.685393726983
Median absolute error:  8.243655698611974
Mean squared log error:  0.0015069243188427965
Metrics for model: GradientBoostingRegressor on test dataset. 
R^2:  0.8954850426445785
Mean absolute error:  11.45247296456547
Mean squared error:  237.70338899530006
Median absolute error:  8.860884947509618
Mean squared log error:  0.0017621317409739815


In [36]:
run_cl = client.create_run(experiment_id=exp_id_cl, start_time=None, tags=None, )

In [37]:
tags = dict()
tags['model'] = 'Gradient Boosting Regressor'
tags['model_type'] = model_params['model_type']
tags['target_type'] = model_params['target_type']
tags['seed'] = model_params['seed']
tags['version'] = model_params['version']
tags['window'] = model_params['window']
tags['lag_type'] = model_params['lag_type']
tags['domain'] = 'production'
tags['predict'] = 'quality'

In [38]:
with mlflow.start_run(run_id=run_cl.info.run_id, experiment_id=exp_id_cl, run_name='GBR'):
    mlflow.set_tags(tags)
    mlflow.log_params(gb.get_params())
    
    for m in [metrics_gb_train, metrics_gb_val, metrics_gb_test]:
        mlflow.log_metric(m[1]+'_R2', round(m[2], 4))
        mlflow.log_metric(m[1]+'_MAE', round(m[3], 4))
        mlflow.log_metric(m[1]+'_MSE', round(m[4], 4))
        mlflow.log_metric(m[1]+'_MedAE', round(m[5], 4))
        mlflow.log_metric(m[1]+'_MSLE', round(m[6], 4))
    
    mlflow.sklearn.log_model(rf, "sk_gb")

# LightGBM Model

In [39]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

In [40]:
lgb_params = [
    {
     'boosting_type': ['gbdt'],
     'num_leaves': range(20, 60, 5),
     'reg_alpha': [0.01, 0.1, 1, 10],
     'learning_rate': [0.01, 0.025, 0.05, 0.75, 0.1], 
     'n_estimators': range(100, 5000, 50),
    } 
]

lgbm = lgb.LGBMRegressor()
lgb_grid = RandomizedSearchCV(lgbm, lgb_params, cv=kf, verbose=1, n_jobs=-1, n_iter=100, scoring='r2')
lgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 45.1min finished


RandomizedSearchCV(cv=KFold(n_splits=3, random_state=17, shuffle=False),
                   estimator=LGBMRegressor(), n_iter=100, n_jobs=-1,
                   param_distributions=[{'boosting_type': ['gbdt'],
                                         'learning_rate': [0.01, 0.025, 0.05,
                                                           0.75, 0.1],
                                         'n_estimators': range(100, 5000, 50),
                                         'num_leaves': range(20, 60, 5),
                                         'reg_alpha': [0.01, 0.1, 1, 10]}],
                   scoring='r2', verbose=1)

In [41]:
lgb_grid.best_params_

{'reg_alpha': 0.01,
 'num_leaves': 50,
 'n_estimators': 1900,
 'learning_rate': 0.01,
 'boosting_type': 'gbdt'}

In [42]:
lgb_grid.best_score_

0.9131434982476158

In [43]:
lgbm_regr = lgb.LGBMRegressor(**lgb_grid.best_params_)
lgbm_regr.fit(X_train, y_train)
y_train_pred = lgbm_regr.predict(X_train)
y_val_pred = lgbm_regr.predict(X_val)
y_test_pred = lgbm_regr.predict(X_test)

In [None]:
lgbm_regr = lgb.LGBMRegressor(**lgb_grid.best_params_)
lgbm_regr.fit(X_train, y_train)
y_train_pred = lgbm_regr.predict(X_train)
y_val_pred = lgbm_regr.predict(X_val)
y_test_pred = lgbm_regr.predict(X_test)

In [45]:
metrics_lgbm_train = get_metrics(y_train, y_train_pred, set_='train', model_='LightGBM')
metrics_lgbm_val = get_metrics(y_val, y_val_pred, set_='val', model_='LightGBM')
metrics_lgbm_test = get_metrics(y_test, y_test_pred, set_='test', model_='LightGBM')

Metrics for model: LightGBM on train dataset. 
R^2:  0.970158520098653
Mean absolute error:  6.024470395658478
Mean squared error:  63.235965393781576
Median absolute error:  4.683212316208369
Mean squared log error:  0.00042965754462126764
Metrics for model: LightGBM on val dataset. 
R^2:  0.9200682207464059
Mean absolute error:  9.64368156201436
Mean squared error:  170.65038561194444
Median absolute error:  7.342063539585041
Mean squared log error:  0.0012002887993689262
Metrics for model: LightGBM on test dataset. 
R^2:  0.9147001222935613
Mean absolute error:  10.15145948054156
Mean squared error:  194.00161014994995
Median absolute error:  7.66177657528209
Mean squared log error:  0.0014421038955664735


In [58]:
run_cl = client.create_run(experiment_id=exp_id_cl, start_time=None, tags=None, )

In [59]:
tags = dict()
tags['model'] = 'LightGBM'
tags['model_type'] = model_params['model_type']
tags['target_type'] = model_params['target_type']
tags['seed'] = model_params['seed']
tags['version'] = model_params['version']
tags['window'] = model_params['window']
tags['lag_type'] = model_params['lag_type']
tags['domain'] = 'production'
tags['predict'] = 'quality'

In [60]:
with mlflow.start_run(run_id=run_cl.info.run_id, experiment_id=exp_id_cl, run_name='GBR'):
    mlflow.set_tags(tags)
    mlflow.log_params(lgbm_regr.get_params())
    
    for m in [metrics_lgbm_train, metrics_lgbm_val, metrics_lgbm_test]:
        mlflow.log_metric(m[1]+'_R2', round(m[2], 4))
        mlflow.log_metric(m[1]+'_MAE', round(m[3], 4))
        mlflow.log_metric(m[1]+'_MSE', round(m[4], 4))
        mlflow.log_metric(m[1]+'_MedAE', round(m[5], 4))
        mlflow.log_metric(m[1]+'_MSLE', round(m[6], 4))
    
    mlflow.lightgbm.save_model(lgbm_regr.booster_, "LightGBM")