In [None]:
! pip install catboost
! pip install -U imbalanced-learn

[0mCollecting catboost
  Downloading catboost-1.0.4-cp39-none-manylinux1_x86_64.whl (76.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.2/76.2 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting graphviz
  Downloading graphviz-0.19.1-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 KB[0m [31m118.9 MB/s[0m eta [36m0:00:00[0m
Collecting plotly
  Downloading plotly-5.6.0-py2.py3-none-any.whl (27.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.7/27.7 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, graphviz, plotly, catboost
Successfully installed catboost-1.0.4 graphviz-0.19.1 plotly-5.6.0 tenacity-8.0.1
[0mCollecting imbalanced-learn
  Downloading imbalanced_learn-0.9.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━

# Environment Setup

In [None]:
# Essential modules for data manipulation
import pandas as pd
import numpy as np

# Custom modules to assist with common data exploration and preparation tasks
import src.data.sets as datasets

# Custom module to produce performance metrics
import src.models.performance as perf

# Classifiers
from catboost import CatBoostClassifier

# Time related modules
from datetime import datetime
import pytz

# Timezone object for Australia/Sydney; passed to datetime.now() further down
# so that the start/end timestamps printed around training are in Sydney local time.
tz_SYD = pytz.timezone('Australia/Sydney')

# 3. Modelling

## 3.1 Collect processed data

In [None]:
# Load the processed data via the project helper and unpack it into
# feature/target pairs for the train, validation and test splits.
# NOTE(review): where load_sets() reads from is not visible here — confirm the
# default location matches where the data-preparation notebook saved the sets.
X_train, y_train, X_val, y_val, X_test, y_test = datasets.load_sets()

In [None]:
# Sanity check: report the (rows, columns) shape of each feature split so it is
# easy to confirm that the expected data has been loaded
for caption, features in (
    ("Train Dataframe (rows, columns): ", X_train),
    ("Validation Dataframe (rows, columns): ", X_val),
    ("Test Dataframe (rows, columns): ", X_test),
):
    print(caption, features.shape)

Train Dataframe (rows, columns):  (467746, 6)
Validation Dataframe (rows, columns):  (155916, 6)
Test Dataframe (rows, columns):  (155916, 6)


## 3.2 CatBoost Model

### 3.2.1 Training with default hyperparameters

In [None]:
# Timestamp: start of training (Sydney local time)
print(datetime.now(tz_SYD))

# Baseline CatBoost model: only the device, the multiclass loss and the seed
# are set explicitly; every other hyperparameter is left at its default
clf_cb1 = CatBoostClassifier(
    task_type='GPU',
    loss_function='MultiClass',
    random_state=8,
)

# Train on the training split
clf_cb1.fit(X_train, y_train)

# Timestamp: training finished, scoring starts
print(datetime.now(tz_SYD))

# Report performance metrics on the train / validation / test splits
perf.score_models(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    None, False, "multiclass", clf_cb1,
)

# Timestamp: scoring finished
print(datetime.now(tz_SYD))

2022-03-20 20:21:55.940057+11:00
Learning rate set to 0.227492
0:	learn: 4.0485691	total: 699ms	remaining: 11m 38s
1:	learn: 3.8428087	total: 1.4s	remaining: 11m 38s
2:	learn: 3.7358270	total: 2.1s	remaining: 11m 39s
3:	learn: 3.6563477	total: 2.81s	remaining: 11m 38s
4:	learn: 3.5970866	total: 3.51s	remaining: 11m 38s
5:	learn: 3.5550515	total: 4.2s	remaining: 11m 36s
6:	learn: 3.5153662	total: 4.89s	remaining: 11m 33s
7:	learn: 3.4797566	total: 5.58s	remaining: 11m 31s
8:	learn: 3.4548762	total: 6.28s	remaining: 11m 31s
9:	learn: 3.4291001	total: 6.97s	remaining: 11m 30s
10:	learn: 3.4036205	total: 7.66s	remaining: 11m 29s
11:	learn: 3.3897954	total: 8.36s	remaining: 11m 28s
12:	learn: 3.3716253	total: 9.05s	remaining: 11m 27s
13:	learn: 3.3591052	total: 9.73s	remaining: 11m 25s
14:	learn: 3.3453464	total: 10.4s	remaining: 11m 24s
15:	learn: 3.3366293	total: 11.1s	remaining: 11m 23s
16:	learn: 3.3203574	total: 11.8s	remaining: 11m 22s
17:	learn: 3.3078367	total: 12.5s	remaining: 11m 

Unnamed: 0,Set Name,ACC,PREC,RECALL,F1
0,Train,0.523468,0.567029,0.501248,0.524383
1,Validate,0.497492,0.523172,0.459632,0.481543
2,Test,0.493407,0.522144,0.458979,0.481213


2022-03-20 20:33:42.544732+11:00


### 3.2.2 Training with Auto-Class-Weights=Balanced

In [None]:
# Timestamp: start of training (Sydney local time)
print(datetime.now(tz_SYD))

# Same configuration as the baseline model, plus auto_class_weights='Balanced'
# NOTE(review): in the recorded run the training loss rises instead of falling
# and accuracy drops to ~0.11 (vs ~0.50 for the baseline) — worth investigating,
# e.g. whether the auto-selected learning rate is too high once weights are applied.
clf_cb2 = CatBoostClassifier(
    task_type='GPU',
    loss_function='MultiClass',
    auto_class_weights='Balanced',
    random_state=8,
)

# Train on the training split
clf_cb2.fit(X_train, y_train)

# Timestamp: training finished, scoring starts
print(datetime.now(tz_SYD))

# Report performance metrics on the train / validation / test splits
perf.score_models(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    None, False, "multiclass", clf_cb2,
)

# Timestamp: scoring finished
print(datetime.now(tz_SYD))

2022-03-20 20:37:53.179260+11:00
Learning rate set to 0.227492


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 4.2911195	total: 693ms	remaining: 11m 32s
1:	learn: 4.3200194	total: 1.39s	remaining: 11m 31s
2:	learn: 4.4298911	total: 2.08s	remaining: 11m 32s
3:	learn: 4.3541905	total: 2.78s	remaining: 11m 32s
4:	learn: 5.9743874	total: 3.49s	remaining: 11m 34s
5:	learn: 6.1912310	total: 4.18s	remaining: 11m 31s
6:	learn: 8.2493704	total: 4.87s	remaining: 11m 30s
7:	learn: 7.6074150	total: 5.57s	remaining: 11m 30s
8:	learn: 7.2655282	total: 6.27s	remaining: 11m 30s
9:	learn: 7.1926770	total: 6.97s	remaining: 11m 30s
10:	learn: 6.8469298	total: 7.69s	remaining: 11m 31s
11:	learn: 6.8226488	total: 8.37s	remaining: 11m 29s
12:	learn: 5.7791583	total: 9.06s	remaining: 11m 27s
13:	learn: 6.7638469	total: 9.75s	remaining: 11m 26s
14:	learn: 5.6387485	total: 10.5s	remaining: 11m 27s
15:	learn: 5.6209231	total: 11.2s	remaining: 11m 26s
16:	learn: 5.9556285	total: 11.9s	remaining: 11m 25s
17:	learn: 6.0569196	total: 12.6s	remaining: 11m 24s
18:	learn: 6.0275768	total: 13.2s	remaining: 11m 24s
19:

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Set Name,ACC,PREC,RECALL,F1
0,Train,0.111883,0.135127,0.187853,0.106959
1,Validate,0.113048,0.133376,0.18975,0.107472
2,Test,0.110989,0.133605,0.186424,0.105874


2022-03-20 20:49:51.088332+11:00
