# Multiclass Prediction Cirrhosis Outcomes - Third Attempt

### EDA Checklist

* What question(s) are you trying to solve (or prove wrong)?
    - What is our target and what do we want to accomplish?
* What kind of data do you have and how do you treat different types?
    - Numerical vs. categorical (what is the best way to encode this data?)
* What’s missing from the data and how do you deal with it?
    - Imputing
* Where are the outliers and why should you care about them?
    - Visualization, distribution
* How can you add, change or remove features to get more out of your data?
    - Feature engineering

## Imports

In [1]:
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np 
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, MaxAbsScaler, LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier
import optuna

# model imports
#xgboost
from xgboost import XGBClassifier

# lightGBM
from lightgbm import LGBMClassifier

# catboost
from catboost import CatBoostClassifier

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")
# Single confirmation message (the original printed two equivalent lines).
print('imports finished')

Matplotlib is building the font cache using fc-list. This may take a moment.


ImportError: cannot import name 'TypeAliasType' from 'typing_extensions' (/Users/manriqs/anaconda3/envs/multiclassPredictionCirrhosisOutcomes/lib/python3.7/site-packages/typing_extensions.py)

## Load Data

In [None]:
# Read the train and test CSV files located next to the notebook.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)
print('data loaded')

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_dataset, val_dataset = train_test_split(
    train_data,
    random_state=42,
    test_size=0.3,
)
print('data splitted')

In [None]:
# Detach the target column ('Status') from both feature frames.
# `pop` removes the column from the frame in place and returns it.
train_labels, val_labels = (
    frame.pop('Status') for frame in (train_dataset, val_dataset)
)
print('labels extracted')

# One shape per line: training labels first, validation labels second.
print(train_labels.shape, val_labels.shape, sep='\n')

In [None]:
# The labels are categorical text, so map each class to an integer code.
# Fit the encoder on the training labels only and reuse that mapping for the
# validation labels: re-fitting on validation could assign different codes
# if its set of classes differs from the training set.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)

print('train labels encoded', f"{train_labels_encoded}")
print('val labels encoded', f"{val_labels_encoded}")

## EDA

### Basic Statistics

In [None]:
# general info: shapes of both splits, then the per-column summary
print(train_dataset.shape)
print(val_dataset.shape)
# `info` is a method: without the call, only the bound method's repr is
# printed. It writes its report to stdout itself, so no print() wrapper.
train_dataset.info()

In [None]:
# Preview the first 15 rows of the training split.
train_dataset.head(15)

In [None]:
# Summary statistics of the training split as computed by DataFrame.describe().
train_dataset.describe()

In [None]:
# Per-column overview of the training split: the distinct values, the number
# of distinct values (with and without NaN), and how many / what percentage
# of entries are missing.
# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame with pd.concat on every iteration (which re-copies all
# previous rows each time and concatenates onto an empty frame).
stat_columns = ['column', 'values', 'values_count_incna', 'values_count_nona', 'num_miss', 'pct_miss']
stat_rows = []
for c in train_dataset.columns:
    series = train_dataset[c]
    distinct = series.unique()  # includes NaN when present
    num_miss = series.isnull().sum()
    stat_rows.append({
        'column': c,
        'values': distinct,
        'values_count_incna': len(distinct),
        'values_count_nona': int(series.nunique()),
        'num_miss': num_miss,
        'pct_miss': (num_miss / len(train_dataset)).round(3) * 100,
    })
ds_cat_stats = pd.DataFrame(stat_rows, columns=stat_columns)
ds_cat_stats

In [None]:
# Number of missing (NaN) entries per column in the training split.
train_dataset.isna().sum()

### Types of Data

#### Numerical Data

In [None]:
# Numerical data: every column whose dtype is not `object`.
non_object_frame = train_dataset.select_dtypes(exclude='object')
num_cols = non_object_frame.columns.to_list()
# Slice of the training split restricted to those columns (displayed below).
num_data_df = train_dataset[num_cols]
num_data_df

In [None]:
# plot histograms (with a KDE overlay) for every numerical column
fig = plt.figure(figsize=(20, 40))
ax = []
for i, val in enumerate(num_cols):
    # One cell of the 11x4 grid per column; draw explicitly on that axes.
    subplot_ax = fig.add_subplot(11, 4, i + 1)
    sns.histplot(train_dataset[val], kde=True, ax=subplot_ax)
    ax.append(subplot_ax)
# tight_layout has to run before show(): once the figure has been rendered,
# adjusting the layout no longer changes the displayed output.
fig.tight_layout()
plt.show()

In [None]:
# plot scatterplots of every column against the row id
fig = plt.figure(figsize=(20, 40))
ax = []
for i, val in enumerate(train_dataset.columns):
    # One cell of the 11x4 grid per column; draw explicitly on that axes.
    subplot_ax = fig.add_subplot(11, 4, i + 1)
    sns.scatterplot(x=val, y='id', data=train_dataset, ax=subplot_ax)
    ax.append(subplot_ax)
# tight_layout has to run before show(): once the figure has been rendered,
# adjusting the layout no longer changes the displayed output.
fig.tight_layout()
plt.show()

In [None]:
# visualize scatterplots of every numerical column against the target
train_labels_df = train_labels.to_frame(name='Status')
# Join features and target column-wise (axis=1) so each row carries both its
# feature values and its 'Status'. The default axis=0 stacks the two frames
# vertically, leaving 'Status' NaN on every feature row and the features NaN
# on every label row. The frame is built once, outside the loop.
num_with_target_df = pd.concat([num_data_df, train_labels_df], axis=1)
fig = plt.figure(figsize=(20, 40))
ax = []
for i, val in enumerate(num_cols):
    subplot_ax = fig.add_subplot(11, 4, i + 1)
    sns.scatterplot(x=val, y='Status', data=num_with_target_df, ax=subplot_ax)
    ax.append(subplot_ax)
# tight_layout has to run before show(): once the figure has been rendered,
# adjusting the layout no longer changes the displayed output.
fig.tight_layout()
plt.show()

In [None]:
# visualize boxplots for every numerical column
# Use the training split (not the full `train_data`) so that this plot
# describes the same rows as the other EDA cells and excludes validation rows.
boxplot_df = train_dataset[num_cols]
fig = plt.figure(figsize=(20, 40))
ax = []
for i, val in enumerate(num_cols):
    subplot_ax = fig.add_subplot(11, 4, i + 1)
    sns.boxplot(y=val, data=boxplot_df, ax=subplot_ax)
    ax.append(subplot_ax)
# tight_layout has to run before show(): once the figure has been rendered,
# adjusting the layout no longer changes the displayed output.
fig.tight_layout()
plt.show()

In [None]:
# Multicollinearity: heatmap of pairwise correlations between numerical columns
threshold = 0.25
plt.figure(figsize=(10,8))
correlation = train_dataset[num_cols].corr()
# Hide only the weak correlations. The mask compares the absolute value so
# that strong negative correlations stay visible as well.
# `linewidths` is the keyword seaborn documents for the cell border width.
sns.heatmap(correlation, mask=correlation.abs() < threshold, linecolor='black', linewidths=.5)

## Preprocessing

#### Feature Engineering

##### Removing Features

In [None]:
# Keep the test ids for the submission file before the column is removed.
test_ids = test_data['id']

# The 'id' column is dropped from the train, validation and test frames.
train_dataset = train_dataset.drop(labels='id', axis='columns')
val_dataset = val_dataset.drop(labels='id', axis='columns')
test_data = test_data.drop(labels='id', axis='columns')

##### Changing age values from days to years

In [None]:
# Convert 'Age' from days to years. The same conversion is applied to the
# validation and test frames so the feature has the same unit everywhere.
train_dataset['Age'] = train_dataset['Age'] / 365
val_dataset['Age'] = val_dataset['Age'] / 365
test_data['Age'] = test_data['Age'] / 365
train_dataset['Age']

##### Changing columns that have Y/N values to 1/0

In [None]:
# Recode the Y/N columns as 1/0 in the train, validation and test frames
# (in that order, one column at a time).
yes_no_codes = {'Y': 1, 'N': 0}
for frame in (train_dataset, val_dataset, test_data):
    for binary_col in ('Spiders', 'Ascites', 'Hepatomegaly'):
        frame[binary_col] = frame[binary_col].map(yes_no_codes)

print('values changed from N/Y to 1/0')

train_dataset

#### Categorical Encoding

In [None]:
# One-hot encoding is used for simplicity; categories not seen during fit are
# ignored at transform time instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')

# Columns still stored with dtype `object` are treated as categorical.
cat_cols = [
    cname
    for cname, col_dtype in train_dataset.dtypes.items()
    if col_dtype == "object"
]

print(cat_cols)

# Learn the categories from the training split only, then apply the fitted
# encoder unchanged to the validation and test frames.
train_data_encoded = encoder.fit_transform(train_dataset[cat_cols])
val_data_encoded = encoder.transform(val_dataset[cat_cols])
test_data_encoded = encoder.transform(test_data[cat_cols])

# Densify the encoded matrices into DataFrames labelled with the encoder's
# generated column names.
encoded_names = encoder.get_feature_names_out(cat_cols)
train_data_encoded_df = pd.DataFrame(train_data_encoded.toarray(), columns=encoded_names)
val_data_encoded_df = pd.DataFrame(val_data_encoded.toarray(), columns=encoded_names)
test_data_encoded_df = pd.DataFrame(test_data_encoded.toarray(), columns=encoded_names)

# Remove the raw categorical columns now that they have encoded counterparts.
train_dataset = train_dataset.drop(columns=cat_cols)
val_dataset = val_dataset.drop(columns=cat_cols)
test_data = test_data.drop(columns=cat_cols)

# Put the remaining columns and the encoded columns side by side. The index
# is reset first so rows line up with the freshly built encoded frames.
train_dataset_encoded = pd.concat(
    [train_dataset.reset_index(drop=True), train_data_encoded_df], axis=1
)
val_dataset_encoded = pd.concat(
    [val_dataset.reset_index(drop=True), val_data_encoded_df], axis=1
)
test_dataset_encoded = pd.concat(
    [test_data.reset_index(drop=True), test_data_encoded_df], axis=1
)
val_dataset_encoded
        

#### Normalization

In [None]:
# Unlike the earlier attempts, this one scales the features with StandardScaler.
scaler = StandardScaler()

# Fit the scaler on the training split only and reuse those statistics for the
# validation and test data. Re-fitting on validation would scale it with its
# own mean/variance and would also make the test transform below use the
# validation statistics instead of the training ones.
train_dataset_scaled = scaler.fit_transform(train_dataset_encoded)
val_dataset_scaled = scaler.transform(val_dataset_encoded)
test_dataset_scaled = scaler.transform(test_dataset_encoded)

print(np.max(train_dataset_scaled))
print(np.min(train_dataset_scaled))

## Build The Models

##### Target Encoding

In [None]:
# The labels are categorical text, so encode them as integer class codes.
# The encoder is fitted on the training labels only and that same mapping is
# applied to the validation labels; re-fitting on validation could assign
# different codes if its set of classes differs from the training set.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)

print('values encoded', f'{train_labels_encoded}, {val_labels_encoded}')

#### Models Implementation

Here we are going to use ensemble models. For each model we create an objective function that the Optuna library uses to search for the hyperparameters to pass into the ensemble.

##### Light Gradient Boosting Machine

In [None]:

# LightGBM classifier
def LGBM_objective(trial):
    """Optuna objective: fit a LightGBM classifier and score it on validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Log loss of the predicted class probabilities on the validation
        split (lower is better).
    """
    params = {
        'objective': 'multiclass',
        # 'softmax' is an alias of the multiclass objective, not a metric;
        # the evaluation metric that matches this objective is 'multi_logloss'.
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        # NOTE(review): LightGBM documents that bagging (`subsample`) only
        # takes effect when `subsample_freq` > 0 — confirm whether it should
        # be enabled here.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1)
    }

    model = LGBMClassifier(**params)

    # train the model
    model.fit(train_dataset_scaled, train_labels)

    # make predictions
    val_pred_proba = model.predict_proba(val_dataset_scaled)

    # calculate the log loss; the fitted class order is passed explicitly so
    # the probability columns are matched to the right classes even if the
    # validation split happens to miss one of them
    loss = log_loss(val_labels, val_pred_proba, labels=model.classes_)
    return loss

In [None]:
# Hyperparameter search for LightGBM: the study minimises the value returned
# by the objective over 1500 trials.
lgbm_study = optuna.create_study(direction="minimize")
lgbm_study.optimize(func=LGBM_objective, n_trials=1500)

In [None]:
# Print the best hyperparameters and the corresponding objective value.
# The study exposes the best trial as `best_trial`; the objective being
# minimised is a loss, so the value is reported as a loss, not an accuracy.
lgbm_best_trial = lgbm_study.best_trial
print(f"Best Trial - Hyperparameters: {lgbm_best_trial.params}")
print(f"Best Trial - Loss: {lgbm_best_trial.value}")

In [None]:
# Hyperparameter values taken from the Optuna search output of the previous
# cell. 'softmax' is an alias of the multiclass objective rather than a
# metric, so the metric is set to the matching 'multi_logloss'.
lgbm_model = LGBMClassifier(
    objective='multiclass',
    metric='multi_logloss',
    boosting_type='gbdt',
    num_leaves=13,
    learning_rate=0.040717487378551125,
    n_estimators=340,
    subsample=0.7621117946415148,
    colsample_bytree=0.5254951985706161,
    reg_alpha=0.9395639914591739,
    reg_lambda=0.12423847695048462,
)

##### XGBoost - Extreme Gradient Boosting

In [None]:
def XGBoost_objective(trial):
    """Optuna objective: fit an XGBoost classifier and score it on validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Log loss of the predicted class probabilities on the validation
        split (lower is better).
    """
    params = {
        'objective':'multi:softprob',
        'max_depth': trial.suggest_int('max_depth', 3, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        # The trial parameter is named like the estimator argument so that
        # `best_trial.params` can be passed straight to XGBClassifier.
        # The upper bound is 950 so the range is divisible by the step; the
        # candidate values are 50, 150, ..., 950.
        'n_estimators': trial.suggest_int('n_estimators', 50, 950, step=100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear','dart'])
    }

    model = XGBClassifier(**params)

    # train the model on the integer-encoded labels
    model.fit(train_dataset_scaled, train_labels_encoded)

    # make predictions
    val_pred_proba = model.predict_proba(val_dataset_scaled)

    # calculate the log loss; the fitted class order is passed explicitly so
    # the probability columns are matched to the right classes even if the
    # validation split happens to miss one of them
    loss = log_loss(val_labels_encoded, val_pred_proba, labels=model.classes_)
    return loss

In [None]:
# Hyperparameter search for XGBoost: the study minimises the value returned
# by the objective over 1500 trials.
xgb_study = optuna.create_study(direction="minimize")
xgb_study.optimize(func=XGBoost_objective, n_trials=1500)

In [None]:
# Print the best hyperparameters and the corresponding objective value.
# The study exposes the best trial as `best_trial`; the objective being
# minimised is a loss, so the value is reported as a loss, not an accuracy.
xgb_best_trial = xgb_study.best_trial
print(f"Best Trial - Hyperparameters: {xgb_best_trial.params}")
print(f"Best Trial - Loss: {xgb_best_trial.value}")

In [None]:
# XGBoost classifier configured with fixed hyperparameter values.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    max_depth=78,
    learning_rate=0.08038122449289681,
    n_estimators=250,
    subsample=0.6095349691489983,
    colsample_bytree=0.3967522739768332,
    gamma=0.9644640960422188,
    booster='gbtree',
)

##### Catboost - Categorical Boosting

In [None]:
def catBoost_objective(trial):
    """Optuna objective: fit a CatBoost classifier and score it on validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Log loss of the predicted class probabilities on the validation
        split (lower is better).
    """
    # `subsample` is not sampled: the bootstrap type is fixed to 'Bayesian',
    # for which the previous version always removed `subsample` again before
    # building the model, so sampling it only added a dead search dimension.
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000, step=10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        # Trial parameter named like the estimator argument it feeds.
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'bootstrap_type': 'Bayesian',
        'verbose': False
    }

    model = CatBoostClassifier(**params)

    # Train the model on the integer-encoded labels
    model.fit(train_dataset_scaled, train_labels_encoded)

    # Make predictions
    val_pred_proba = model.predict_proba(val_dataset_scaled)

    # Calculate the log loss against the validation labels (not the feature
    # matrix). The fitted class order is passed explicitly so the probability
    # columns are matched to the right classes.
    loss = log_loss(val_labels_encoded, val_pred_proba, labels=model.classes_)
    return loss

In [None]:
# Hyperparameter search for CatBoost: the study minimises the value returned
# by the objective over 150 trials.
catb_study = optuna.create_study(direction="minimize")
catb_study.optimize(func=catBoost_objective, n_trials=150)

In [None]:
# Print the best hyperparameters and the corresponding loss.
# The study exposes the best trial through its `best_trial` attribute.
catb_best_trial = catb_study.best_trial
print(f"Best Trial - Hyperparameters: {catb_best_trial.params}")
print(f"Best Trial - Loss: {catb_best_trial.value}")

#### Final Model (Voting)

### Make Predictions

## Submissions

In [None]:
# Class probabilities for the scaled test set.
# NOTE(review): `random_forest_model` is not defined anywhere in the visible
# notebook (the "Final Model (Voting)" section above is empty) — confirm which
# fitted model is meant to produce the submission.
preds = random_forest_model.predict_proba(test_dataset_scaled)

In [None]:
# Assemble the submission frame: the test ids followed by one probability
# column per class, taken from columns 0, 1 and 2 of `preds`.
# NOTE(review): assumes the model's class order is C, CL, D — confirm against
# the fitted model's classes.
submission_columns = {"id": test_ids}
for class_idx, column_name in enumerate(("Status_C", "Status_CL", "Status_D")):
    submission_columns[column_name] = preds[:, class_idx]
submission = pd.DataFrame(submission_columns)

submission.head()

In [None]:
# submission.to_csv('./submissions/random_forest_model_submission.csv', index=False, header=True)