# <div style="font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:200%; text-align:center;padding:3.0px; background: #6A1B9A; border-bottom: 8px solid #9C27B0">S3E26 - BOOSTING (VOTING)</div>
Notebook edited from [dreygaen/ps3e25-cirrhosis-multi-class-solution](https://www.kaggle.com/code/dreygaen/ps3e25-cirrhosis-multi-class-solution)
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:150%; text-align:left;padding:3.0px; background: #6A1B9A; border-bottom: 8px solid #9C27B0" >TABLE OF CONTENTS<br></div>
* [IMPORTS](#1)
* [LOAD DATA](#2)
* [FEATURE ENGINEERING](#4)
* [MODELS](#5)
* [SUBMISSION](#6)

<a id="1"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #6A1B9A; border-bottom: 8px solid #9C27B0" > IMPORTS<br></div>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import time
import zipfile

from category_encoders import OneHotEncoder, MEstimateEncoder, GLMMEncoder, OrdinalEncoder, CatBoostEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, GridSearchCV, KFold, cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import QuantileRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import median_absolute_error, roc_auc_score, roc_curve, log_loss
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler, MinMaxScaler, PowerTransformer, LabelEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.calibration import CalibratedClassifierCV
from scipy import stats
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier, early_stopping
from catboost import CatBoostRegressor, CatBoostClassifier
from sklego.linear_model import LADRegression
import optuna
from itertools import combinations, permutations

# Only emit Optuna log records at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

import warnings
# NOTE(review): this suppresses every Python warning raised after this point,
# including deprecation and convergence warnings from the modelling libraries.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()

# Matplotlib rc overrides handed to seaborn below: figure/axes colours,
# tick and label colours, grid appearance and the font family.
rc = {
    'axes.facecolor': '#FFEDED',
    'figure.facecolor': '#FFEDED',
    'axes.edgecolor': '#000000',
    'grid.color': '#EBEBE7',
    'font.family': 'serif',
    'axes.labelcolor': '#000000',
    'xtick.color': '#000000',
    'ytick.color': '#000000',
    'grid.alpha': 0.4
}

# Font properties dictionary; it is not referenced in the visible part of this
# notebook (presumably meant for plot titles/labels — confirm before removing).
font = {'family': 'serif',
        'color':  'black',
        'weight': 'bold',
        'size': 16,
        }

# Apply the rc overrides above through seaborn.
sns.set(rc=rc)

from colorama import Style, Fore
# Colorama style/colour prefixes for console output, plus the reset sequence.
# None of these names is referenced in the visible part of this notebook.
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
mgt = Style.BRIGHT + Fore.MAGENTA
gld = Style.BRIGHT + Fore.YELLOW
res = Style.RESET_ALL
bold_start = Style.BRIGHT
bold_end = Style.NORMAL

# Name of the label column: it is separated from the features to form `y`
# and is skipped when the fitted encoders are applied to the test frame.
target_col = 'Status'

<a id="2"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #6A1B9A; border-bottom: 8px solid #9C27B0" > LOAD DATA<br></div>

In [2]:
def _read_zipped_csv(archive, member, **read_csv_kwargs):
    """Read one CSV member of an already opened ``zipfile.ZipFile`` into a DataFrame."""
    with archive.open(member) as handle:
        return pd.read_csv(handle, **read_csv_kwargs)


# Competition archive: train and test are indexed by `id`; the sample
# submission keeps pandas' default index.
with zipfile.ZipFile('../../../res/data/playground-series-s3e26.zip', 'r') as competition_zip:
    train = _read_zipped_csv(competition_zip, 'train.csv', index_col='id')
    test = _read_zipped_csv(competition_zip, 'test.csv', index_col='id')
    submission = _read_zipped_csv(competition_zip, 'sample_submission.csv')

# Second archive holding `cirrhosis.csv`, indexed by `ID`.
with zipfile.ZipFile('../../../res/data/cirrhosis-patient-survival-prediction.zip', 'r') as original_zip:
    orig_data = _read_zipped_csv(original_zip, 'cirrhosis.csv', index_col='ID')

In [3]:
# Stack the rows of `orig_data` under the competition training rows and rebuild
# a fresh 0..n-1 index. NOTE: the cell reassigns `train`, so running it twice
# appends `orig_data` twice.
frames_to_stack = (train, orig_data)
train = pd.concat(frames_to_stack, ignore_index=True)

<a id="4"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #6A1B9A; border-bottom: 8px solid #9C27B0" > FEATURE ENGINEERING<br></div>

In [4]:
# original notebook applied fit_transform on training data.
# Each object-typed column gets its own encoder, fitted on `train` only; the
# same fitted mapping is then reused for `test`, so both frames share one
# integer code per category. The target column is absent from `test` handling.
train_cat_cols = train.select_dtypes(include=['object']).columns
for col in train_cat_cols:
    label_encoder = LabelEncoder().fit(train[col])
    train[col] = label_encoder.transform(train[col])
    if col == target_col:
        continue
    test[col] = label_encoder.transform(test[col])

In [5]:
# Feature matrix and label vector are taken from the full training frame.
# (A variant that first dropped incomplete rows with `train.dropna()` was
# tried earlier and is intentionally not used here.)
y = train[target_col]
X = train.drop(columns=[target_col])

<a id="5"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #6A1B9A; border-bottom: 8px solid #9C27B0" > MODELS<br></div>

In [None]:

# Fixed XGBoost settings passed to XGBClassifier alongside the per-trial
# hyper-parameters.
xgb_params = {
    # 'multi_logloss' is not an XGBoost objective name. 'multi:softprob' is the
    # multiclass objective that produces per-class probabilities, matching the
    # 'mlogloss' evaluation metric below.
    'objective': 'multi:softprob',
    # Earlier fixed values, kept for reference:
    # 'max_depth': 6,
    # 'learning_rate': 0.010009541152584345,
    # 'n_estimators': 1878,
    # 'min_child_weight': 9,
    'colsample_bytree': 0.3292032860985591,
    'reg_alpha': 0.10626128775335533,
    'reg_lambda': 0.624196407787772,
    'random_state': 42,
    'tree_method': 'hist',
    'eval_metric': 'mlogloss',
    'subsample': 0.47524425009347593
}

# Fixed CatBoost settings passed to CatBoostClassifier alongside the per-trial
# hyper-parameters.
# Earlier fixed values, kept for reference:
#   iterations=470, depth=20, learning_rate=0.138112945166, max_bin=238
cat_params = dict(
    l2_leaf_reg=4.0368544113430485,
    random_strength=0.1279482215776108,
    od_wait=49,
    one_hot_max_size=39,
    grow_policy='Lossguide',
    bootstrap_type='Bernoulli',
    od_type='Iter',
    min_data_in_leaf=11,
)

# Fixed LightGBM settings passed to LGBMClassifier alongside the per-trial
# hyper-parameters ('multi_logloss' was tried as the objective name before
# 'multiclass').
# Earlier fixed values, kept for reference:
#   max_depth=9, min_child_samples=14,
#   learning_rate=0.034869481921747415, n_estimators=274
lgbm_params = dict(
    objective='multiclass',
    num_class=3,
    min_child_weight=9,
    colsample_bytree=0.1702910221565107,
    reg_alpha=0.10626128775335533,
    reg_lambda=0.624196407787772,
    random_state=42,
)


def objective(trial):
    """Optuna objective: cross-validated log loss of an XGB/CatBoost/LGBM voting ensemble.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the per-model hyper-parameters. Parameter names
        are unchanged so trials remain comparable with those already stored
        in the study.

    Returns
    -------
    float
        Mean of the 5-fold ``neg_log_loss`` scores on the module-level ``X``
        and ``y`` (higher is better, so the study must maximise it).
    """
    lgbm_max_depth = trial.suggest_int("lgbm_max_depth_uni", 5, 15, log=False)

    estimators = [
        ("XGBClassifier", XGBClassifier(
                max_depth=trial.suggest_int("xgb_max_depth_uni", 5, 25, log=False),
                min_child_weight=trial.suggest_int("xgb_min_child_weight_uni", 1, 20, log=False),
                learning_rate=trial.suggest_float("xgb_learning_rate_uni", 0.005, 0.015, log=False),
                n_estimators=trial.suggest_int("xgb_n_estimators_uni", 100, 1000, log=False),
                **xgb_params
            )),
        ("CatBoostClassifier", CatBoostClassifier(
                depth=trial.suggest_int("cat_depth_uni", 15, 25, log=False),
                max_bin=trial.suggest_int("cat_max_bin_uni", 50, 500, log=False),
                learning_rate=trial.suggest_float("cat_learning_rate_uni", 0.1, 0.5, log=False),
                iterations=trial.suggest_int("cat_iterations_uni", 100, 1000, log=False),
                **cat_params
            )),
        ("LGBMClassifier", LGBMClassifier(
                max_depth=lgbm_max_depth,
                # A tree of depth d has at most 2**d leaves; clamp to the
                # same 131072 ceiling the notebook used before.
                num_leaves=min(2**lgbm_max_depth - 1, 131072),
                min_child_samples=trial.suggest_int("lgbm_min_child_samples_uni", 10, 30, log=False),
                min_gain_to_split=trial.suggest_float("lgbm_min_gain_to_split2", 0.0, 0.1, step=0.01),
                learning_rate=trial.suggest_float("lgbm_learning_rate_uni", 0.02, 0.04, log=False),
                n_estimators=trial.suggest_int("lgbm_n_estimators_uni", 100, 1000, log=False),
                **lgbm_params
            )),
    ]

    # Voting is fixed to 'soft': the 'neg_log_loss' scorer needs predict_proba,
    # which VotingClassifier only provides for soft voting. Sampling 'hard'
    # produced NaN scores, i.e. wasted trials.
    classifier = VotingClassifier(estimators=estimators, voting='soft')
    fold_scores = cross_val_score(classifier, X, y, n_jobs=-1, cv=5, scoring='neg_log_loss')
    return fold_scores.mean()

# The study is persisted in a local SQLite file; with `load_if_exists=True`
# re-running this cell resumes the stored study instead of failing on the
# duplicate name. The objective returns a negated loss, hence 'maximize'.
study = optuna.create_study(
    study_name="voting-boosting-1",
    storage="sqlite:///db.sqlite3",
    load_if_exists=True,
    direction='maximize',
)

# Run ten more trials on top of whatever the study already contains.
study.optimize(objective, n_trials=10)

print(study.best_trial)


In [26]:
best_params = study.best_params

if best_params.get('base_classifiers') == 'LGBMClassifier':
    # The study is loaded from storage, so the best trial may have been recorded
    # by a different objective that sampled 'base_classifiers' (presumably an
    # earlier single-model version — confirm). Keep that path working as before.
    max_depth = best_params['lgbm_max_depth_uni']

    model = LGBMClassifier(
        max_depth=max_depth,
        num_leaves=min(2**max_depth - 1, 131072),
        min_child_samples=best_params["lgbm_min_child_samples_uni"],
        min_gain_to_split=best_params["lgbm_min_gain_to_split2"],
        **lgbm_params
    )
else:
    # Trials from the current `objective` never contain 'base_classifiers';
    # indexing that key raised KeyError and left `model` undefined. Rebuild the
    # tuned three-model ensemble from the parameters the objective samples.
    max_depth = best_params['lgbm_max_depth_uni']

    model = VotingClassifier(
        estimators=[
            ("XGBClassifier", XGBClassifier(
                max_depth=best_params["xgb_max_depth_uni"],
                min_child_weight=best_params["xgb_min_child_weight_uni"],
                learning_rate=best_params["xgb_learning_rate_uni"],
                n_estimators=best_params["xgb_n_estimators_uni"],
                **xgb_params
            )),
            ("CatBoostClassifier", CatBoostClassifier(
                depth=best_params["cat_depth_uni"],
                max_bin=best_params["cat_max_bin_uni"],
                learning_rate=best_params["cat_learning_rate_uni"],
                iterations=best_params["cat_iterations_uni"],
                **cat_params
            )),
            ("LGBMClassifier", LGBMClassifier(
                max_depth=max_depth,
                num_leaves=min(2**max_depth - 1, 131072),
                min_child_samples=best_params["lgbm_min_child_samples_uni"],
                min_gain_to_split=best_params["lgbm_min_gain_to_split2"],
                learning_rate=best_params["lgbm_learning_rate_uni"],
                n_estimators=best_params["lgbm_n_estimators_uni"],
                **lgbm_params
            )),
        ],
        # Soft voting is required downstream: the submission needs
        # predict_proba, which hard voting does not provide.
        voting='soft',
    )

# Fit on the full training data regardless of which branch built the model.
model.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000786 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1962
[LightGBM] [Info] Number of data points in the train set: 8323, number of used features: 18
[LightGBM] [Info] Start training from score -0.470941
[LightGBM] [Info] Start training from score -3.322996
[LightGBM] [Info] Start training from score -1.080160


<a id="6"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:120%; text-align:left;padding:3.0px; background: #6A1B9A; border-bottom: 8px solid #9C27B0" > SUBMISSION<br></div>

In [38]:
# Predict once and reuse the matrix; the previous version ran the full model
# three times on the same input.
# Column order assumes the target was label-encoded in sorted order
# (C=0, CL=1, D=2) — verify against the fitted model's `classes_`.
test_proba = model.predict_proba(test)
submission['Status_C'] = test_proba[:, 0]
submission['Status_D'] = test_proba[:, 2]
submission['Status_CL'] = test_proba[:, 1]



In [36]:
# Print dtype / non-null summaries for the test features, then the training
# features, to compare their columns and dtypes.
for frame in (test, X):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5271 entries, 7905 to 13175
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   N_Days         5271 non-null   int64  
 1   Drug           5271 non-null   object 
 2   Age            5271 non-null   int64  
 3   Sex            5271 non-null   object 
 4   Ascites        5271 non-null   object 
 5   Hepatomegaly   5271 non-null   object 
 6   Spiders        5271 non-null   object 
 7   Edema          5271 non-null   object 
 8   Bilirubin      5271 non-null   float64
 9   Cholesterol    5271 non-null   float64
 10  Albumin        5271 non-null   float64
 11  Copper         5271 non-null   float64
 12  Alk_Phos       5271 non-null   float64
 13  SGOT           5271 non-null   float64
 14  Tryglicerides  5271 non-null   float64
 15  Platelets      5271 non-null   float64
 16  Prothrombin    5271 non-null   float64
 17  Stage          5271 non-null   float64
dtypes: float6

In [39]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)