In [1]:
# Import all required packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
from scipy.stats import randint,uniform
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, PrecisionRecallDisplay, average_precision_score,precision_recall_curve,auc
import seaborn as sn
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree  import DecisionTreeClassifier
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures,StandardScaler,MinMaxScaler
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import pipeline,set_config
from sklearn.utils import class_weight
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, PrecisionRecallDisplay, precision_score, \
    average_precision_score, auc, precision_recall_curve, make_scorer,SCORERS
from skopt import BayesSearchCV
import datetime, re, sys, holidays
from Functions import *
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

set_config(display="diagram")
np.random.seed (0)
Parallel(n_jobs=8, max_nbytes=50000)

warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
categorical_cols = ['Vessel Type']
numeric_cols = ['Waiting Time (Days)']
vessel_cols = ['Total Vessels (Number)', 'Capesize',
                'Capesize Laden', 'Capesize Unladen', 'Panamax', 'Panamax Laden',
                'Panamax Unladen', 'Handymax', 'Handymax Laden', 'Handymax Unladen',
                'Handysize', 'Handysize Laden', 'Handysize Unladen']
# date_cols = ['Berth or Port Entry', 'Date']
date_cols = ['Date']
dwt_cols = ['Total Vessels (DWT)','Capesize (DWT)',
            'Capesize Laden (DWT)', 'Capesize Unladen (DWT)', 'Panamax (DWT)',
            'Panamax Laden (DWT)', 'Panamax Unladen (DWT)', 'Handymax (DWT)',
            'Handymax Laden (DWT)', 'Handymax Unladen (DWT)', 'Handysize (DWT)',
            'Handysize Laden (DWT)', 'Handysize Unladen (DWT)']
weather_cols = ['tempt', 'prcp', 'wspd']

predict_col = 'Wait Levels'

In [3]:
# Get data
data = pd.read_csv('clean_data.csv', index_col=0, parse_dates=date_cols)
weather_df = pd.read_csv('sao_paulo_weather.csv', parse_dates=['date'])
weather_df = weather_df[['date','tavg','prcp','wspd']].rename(columns={'date': 'Date',
                   'tavg': 'tempt',
                   'prcp': 'prcp',
                   'wspd': 'wspd'}, inplace=False)
weather_df = weather_df.fillna(0)
data = data.merge(weather_df, how='inner',left_on=['Date'], right_on=['Date'])

# Drop cols with only 1 value
for col in data.columns:
    if len(data[col].unique()) == 1:
        data.drop(col,axis=1, inplace=True)

data.drop('Berth or Port Entry',axis=1, inplace=True)

#convert DWT cols from obj to int
for col in dwt_cols:
    data[col] = data[col].str.replace(",","").astype(int)
#convert vessel_cols from float to int
for col in vessel_cols:
    data[col] = data[col].astype(int)

#print cleaned data
data.head()

Unnamed: 0,IMO,Waiting Time (Days),Vessel Type,Date,Total Vessels (Number),Total Vessels (DWT),Capesize,Capesize Laden,Capesize Unladen,Panamax,...,Handymax (DWT),Handymax Laden (DWT),Handymax Unladen (DWT),Handysize (DWT),Handysize Laden (DWT),Handysize Unladen (DWT),Wait Levels,tempt,prcp,wspd
0,9056399,0.2,3,2015-01-02,16,878272,0,0,0,5,...,432704,121785,310919,52823,30060,22763,0,27.0,0.0,15.1
1,9471252,3.4,2,2015-01-02,16,878272,0,0,0,5,...,432704,121785,310919,52823,30060,22763,1,27.0,0.0,15.1
2,9233399,5.6,3,2015-01-02,16,878272,0,0,0,5,...,432704,121785,310919,52823,30060,22763,2,27.0,0.0,15.1
3,8309141,0.5,1,2015-01-03,17,908718,0,0,0,5,...,474533,107174,367359,70297,36036,34261,0,23.3,3.0,13.8
4,9625970,3.9,2,2015-01-03,17,908718,0,0,0,5,...,474533,107174,367359,70297,36036,34261,1,23.3,3.0,13.8


In [4]:
data.to_csv('data')

In [5]:
# split to train & Test
X_train, X_test, y_train, y_test = train_test_split(data.drop(predict_col,axis=1, inplace=False), data[predict_col], test_size=0.9, random_state=0)

Create Pipeline

In [17]:
date_linear_processor = pipeline.Pipeline([('ProcessDates',ProcessDates()),('ProcessHolidays',ProcessHolidays()),('DropDate',DropCol(cols=date_cols))])
numeric_linear_processor = pipeline.Pipeline([('Polynomial',PolynomialFeatures(degree=2,interaction_only =True,include_bias=True)),('Scaler',StandardScaler())])
cat_linear_processor = OneHotEncoder()
vessel_linear_processor = pipeline.Pipeline([('CreateDummyCol',CreateDummyCol()),('ProcessVessel',ProcessVessel(vessel_cols=vessel_cols,dwt_cols=dwt_cols))])
weather_linear_processor = pipeline.Pipeline([('CreateDummyCol',CreateDummyCol()),('ProcessWeather',ProcessWeather()),('ProcessWSpd',ProcessWSpd())])

data_preprocessor = ColumnTransformer([
    ('numeric_linear_processor',numeric_linear_processor, numeric_cols),
    ('cat_linear_processor',cat_linear_processor, categorical_cols),
    ('date_linear_processor',date_linear_processor, date_cols),
    ('vessel_linear_processor',vessel_linear_processor, vessel_cols+dwt_cols),
    ('weather_linear_processor',weather_linear_processor, weather_cols)
])
data_preprocessor


In [18]:
finalPipe = pipeline.make_pipeline(data_preprocessor,DropZeroCol())
# dp = finalPipe()
finalPipe.set_params(**{
    'columntransformer__weather_linear_processor__ProcessWSpd__action': 'drop',
    'columntransformer__weather_linear_processor__ProcessWeather__action': 'drop'
})
X = finalPipe.fit_transform(data)
X = pd.DataFrame(X)
finalPipe.fit_transform(data)

array([[0.0, -0.8960064549921882, 0.0, ..., 52823, 30060, 22763],
       [0.0, -0.5250929535052529, 0.0, ..., 52823, 30060, 22763],
       [0.0, -0.270089921232985, 0.0, ..., 52823, 30060, 22763],
       ...,
       [0.0, -0.5830481881125865, 0.0, ..., 465851, 308646, 157205],
       [0.0, -0.8496422673063214, 0.0, ..., 458868, 341163, 117705],
       [0.0, -0.7221407511701873, 0.0, ..., 479429, 301875, 177554]],
      dtype=object)

In [15]:
X['dummy'] = 0
X

Unnamed: 0,0,1,2,dummy
0,27.0,0.0,15.1,0
1,27.0,0.0,15.1,0
2,27.0,0.0,15.1,0
3,23.3,3.0,13.8,0
4,23.3,3.0,13.8,0
...,...,...,...,...
9942,17.6,0.0,17.3,0
9943,21.6,0.0,9.8,0
9944,21.6,0.0,9.8,0
9945,20.3,0.0,10.6,0


Regressor model

In [8]:
param_grid = {
    ## Preprocessing parameter options
    'columntransformer__date_linear_processor__ProcessDates': [ProcessDates(),None],
    'columntransformer__date_linear_processor__ProcessHolidays': [ProcessHolidays(),None],
    'columntransformer__weather_linear_processor__ProcessWeather__action': ['actual','drop'],
    'columntransformer__weather_linear_processor__ProcessWSpd__action': ['actual','cat','drop'],
    'columntransformer__vessel_linear_processor__ProcessWSpd__action': ['actual','cat','drop'],
    'columntransformer__vessel_linear_processor__ProcessVessel__usage': ['both','dwt','vessel'],


    ## ElasticNet hyper parameters
    'ElasticNet__regressor__alpha': np.linspace(0, 1, num=10),
    'ElasticNet__regressor__l1_ratio': np.linspace(0, 1, num=10),

    ## XGBoost hyper parameters
    'XGBoost__regressor__reg_lambda': [0,0.1,0.5,1,5],
    'XGBoost__regressor__n_estimators': [200,400,600,800,1200],
    'XGBoost__regressor__learning_rate': np.linspace(0.05, 0.5, num=5),
    'XGBoost__regressor__max_depth': [-1,5,10,20,30,50,100],

    ## RandomForest hyper parameters
    'RandomForest__regressor__min_impurity_decrease': np.linspace(0.1, 0.5, num=4),
    'RandomForest__regressor__min_samples_leaf': [2,5,10,20,50],
    'RandomForest__regressor__min_samples_split': [2,5,10,20],
    'RandomForest__regressor__n_estimators': range(100,400,100),

    ## LGBM hyper parameters
    'LGBM__regressor__boosting_type': ['dart','goss','gbdt'],
    'LGBM__regressor__learning_rate': np.linspace(0.05, 0.5, num=5),
    'LGBM__regressor__max_depth': [-1,5,10,20,30,50,100],
    'LGBM__regressor____n_estimators': [200,400,600,800,1200],
}

In [9]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm  import LGBMRegressor
from sklearn.ensemble import StackingRegressor
estimators = [
    ("ElasticNet", ElasticNet( random_state=0)),
    ("XGBoost", XGBRegressor(objective='reg:squarederror',eval_metric='rmse',random_state=0,n_jobs=-1,use_label_encoder=False,  tree_method = 'gpu_hist')),
    ("RandomForest", RandomForestRegressor(random_state=0,n_jobs=-1)),
    ("LGBM", LGBMRegressor(objective='regression',random_state=0,n_jobs=-1,device = 'gpu')),
]
finalestimators = []
for estimator, model in estimators:
    finalestimators.append((estimator,pipeline.Pipeline([('columntransformer',data_preprocessor),('DropZeroCol',DropZeroCol()),('regressor',model)])))
final_estimator=LGBMRegressor(objective='regression',random_state=0,n_jobs=-1,device = 'gpu')
stacking_Regressor = StackingRegressor(estimators=finalestimators, final_estimator=final_estimator,n_jobs=-1)
# finalPipe = pipeline.make_pipeline(data_preprocessor,stacking_Regressor)
stacking_Regressor

In [10]:
stacking_Regressor.get_params()

{'cv': None,
 'estimators': [('LinearRegression',
   Pipeline(steps=[('columntransformer',
                    ColumnTransformer(transformers=[('cat_linear_processor',
                                                     OneHotEncoder(),
                                                     ['Vessel Type']),
                                                    ('date_linear_processor',
                                                     Pipeline(steps=[('ProcessDates',
                                                                      ProcessDates()),
                                                                     ('ProcessHolidays',
                                                                      ProcessHolidays()),
                                                                     ('DropDate',
                                                                      DropDate(date_cols=['Date']))]),
                                                     ['Date']),
            

In [45]:
#tune hyperparameter
limitTuner = 'ElasticNet'
preprocessingName = 'columntransformer__'
estimatorsParam = {}
finalParam = {}
for estimator in estimators:
    if limitTuner== '' or estimator[0] ==limitTuner:
        modelParam = {}
        for param, grid in param_grid.items():
            if param.startswith(estimator[0]):
                modelParam[param] = grid
            elif param.startswith(preprocessingName) :
                modelParam[estimator[0]+'__'+param] = grid
        print('Tuning Model ',estimator[0], "with params:\n",  modelParam, '\n')
        gs = GridSearchCV(estimator=estimator[1],param_grid=modelParam,scoring = 'rmse',n_jobs = -1,cv = 5,verbose = -1)
        gs.fit(X_train,X_test)
        print('Tuned Model ',estimator[0], " (RMSE = ","{:.4f}".format(gs.best_score_),")")
        # print("Best params:\n",  gs.best_params_, '\n')
        estimatorsParam ={**estimatorsParam, **gs.best_params_}

print('\n Params to use\n\n',estimatorsParam)

Tuning Model  LinearRegression with params:
 {'LinearRegression__columntransformer__numeric_linear_processor__Polynomial__degree': [1, 2, 3], 'LinearRegression__columntransformer__numeric_linear_processor__Polynomial__include_bias': [True, False], 'LinearRegression__columntransformer__numeric_linear_processor__Polynomial__interaction_only': [True, False]} 

Tuned Model  LinearRegression  (RMSE =  0.1245 )

 Params to use

 {'LinearRegression__max_depth': 4, 'LinearRegression__n_estimators': 60, 'columntransformer__numeric_linear_processor__Polynomial__degree': 3, 'columntransformer__numeric_linear_processor__Polynomial__include_bias': True, 'columntransformer__numeric_linear_processor__Polynomial__interaction_only': True}


In [29]:
# replace individual best model here post tuning.
elasticnetParams = {}
xgboostParams = {}
randomforestParams = {}
lgbmParams = {}


In [47]:
# prepare data for final tuning
estimatorsParam = {**elasticnetParams,**xgboostParams,**randomforestParams,**lgbmParams}
finalPipe.set_params(**estimatorsParam)
finalTrain  = finalPipe.fit_transform()
finalGrid = {}

# generate final model params
for param, grid in param_grid.items():
    if param.startswith("final_estimator"):
        finalGrid[param[len("final_estimator__"):]] = grid

#tune final model
gs = GridSearchCV(estimator=final_estimator,param_grid=finalGrid,scoring = 'rmse',n_jobs = -1,cv = 5,verbose = -1)
gs.fit(finalTrain)
modelParam = {}
for param, grid in gs.best_params_:
    modelParam["final_estimator__"+param] = grid
finalParam = {**estimatorsParam, **modelParam}

NameError: name 'finalTrain' is not defined

Old classifier model

estimators = [
    ("LightGBM", LGBMClassifier(objective='multiclass',random_state=0,n_jobs=-1,device = 'gpu',)),
    ("XGBoost", XGBClassifier(objective='multi:softmax',random_state=0,n_jobs=-1,use_label_encoder=False,  tree_method = 'gpu_hist')),
    ("RandomForest", RandomForestClassifier(random_state=0)),
    ("DecisionTree", DecisionTreeClassifier()),
]
final_estimator=LGBMClassifier(objective='multiclass',random_state=0,n_jobs=-1,device = 'gpu')
stacking_classifier = StackingClassifier(estimators=estimators, final_estimator=final_estimator,stack_method='predict_proba',n_jobs=-1)
finalPipe = pipeline.make_pipeline(data_preprocessor,stacking_classifier)
finalPipe

In [None]:
finalPipe.get_params()

In [None]:
param_grid = {
    'columntransformer__numeric_linear_processor__Polynomial__degree': [1,2,3],
    'columntransformer__numeric_linear_processor__Polynomial__include_bias': [True,False],
    'columntransformer__numeric_linear_processor__Polynomial__interaction_only': [True,False],

    'stackingclassifier__LightGBM__num_leaves': range(100,1500,300),
    'stackingclassifier__LightGBM__n_estimators': range(100,1500,300),
    'stackingclassifier__LightGBM__reg_lambda': [ 0.11, 1, 10, 50, 100, 150],

    'stackingclassifier__XGBoost__reg_lambda': [0, 1e-1, 1, 10, 50],
    'stackingclassifier__XGBoost__n_estimators': range(100,1500,300),
    'stackingclassifier__XGBoost__eta': [0.1,0.5,1.0],

    'stackingclassifier__RandomForest__max_depth': [10, 30, 50],
    'stackingclassifier__RandomForest__max_features': [5,10,15],
    'stackingclassifier__RandomForest__min_samples_leaf': [3, 5, 7],

    'stackingclassifier__DecisionTree__max_depth': [10, 30, 50],
    'stackingclassifier__DecisionTree__max_features': [5,10,15],
    'stackingclassifier__DecisionTree__min_samples_leaf': [3, 5, 7],

    # 'stackingclassifier__final_estimator__num_leaves': range(100,1500,300),
    # 'stackingclassifier__final_estimator__n_estimators': range(100,1500,300),
    # 'stackingclassifier__final_estimator__reg_lambda': [ 0.11, 1, 10, 50, 100, 150],
    # 'stackingclassifier__final_estimator__learning_rate': [0.1,0.5,1.0]

}
iter =50
rs = RandomizedSearchCV(
    estimator=finalPipe, param_distributions=param_grid,
    n_iter=iter,
    cv=5,
    refit=True,
    random_state=314,
    verbose=10,
    n_jobs=8,
    pre_dispatch=15)

rs.fit(X_train, y_train)

rs.best_estimator_

In [None]:
rs.best_params_

In [None]:
predict_test = rs.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
accuracy_test

In [None]:
predict_test

In [None]:
predict_prob_test = rs.predict_proba(X_test)
roc_auc_score(y_score=predict_prob_test, y_true=y_test, multi_class="ovr")