In [None]:
pip install thop

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import collections
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import thop

import optuna
from optuna.integration import KerasPruningCallback
from optuna.visualization import plot_contour, plot_edf, plot_intermediate_values,  plot_optimization_history, plot_parallel_coordinate, plot_param_importances, plot_slice 

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torchvision

import tensorflow as tf
import keras 
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

import warnings

In [None]:
# Load the heart-failure clinical records CSV. The path is relative
# (`../input/...`), so the notebook must be started from a directory that sits
# next to `input/`. The last line previews the first rows.
data = pd.read_csv("../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")
data.head()

In [None]:
# Shape of the dataset as (rows, columns)
data.shape

In [None]:
# Number of rows that are exact duplicates of an earlier row
data.duplicated().sum()

In [None]:
# Class balance: bar chart of the number of rows per DEATH_EVENT value
sns.countplot(x=data.DEATH_EVENT, data=data)

In [None]:
# Descriptive statistics of the raw columns (before any feature engineering)
data.describe()

# Feature Engineering

In [None]:
# Increasing the number of features
def feature_creation(df):
    """Return a copy of ``df`` extended with binned and interaction features.

    Eight integer "bin" columns are derived by floor-dividing existing columns,
    then every base column is paired with every bin column to form a string
    interaction column named ``"<base>_<bin>"`` whose values are
    ``"<base value>_<bin value>"``.

    The input frame is left untouched. The interaction columns are attached in
    a single ``pd.concat`` instead of 96 separate column inserts, which avoids
    pandas' "DataFrame is highly fragmented" PerformanceWarning.

    Args:
        df: Frame containing at least the twelve base columns listed in
            ``base_columns`` below.

    Returns:
        A new DataFrame: the original columns, then the eight bin columns,
        then the 96 interaction columns (base-major order).
    """
    out = df.copy()

    # Coarse bins obtained by floor division; some are bins of bins.
    out['age2'] = out['age'] // 10
    out['creatinine_phosphokinase2'] = out['creatinine_phosphokinase'] // 20
    out['creatinine_phosphokinase3'] = out['creatinine_phosphokinase2'] // 10
    out['ejection_fraction2'] = out['ejection_fraction'] // 10
    out['platelets2'] = out['platelets'] // 100
    out['platelets3'] = out['platelets2'] // 100
    out['platelets4'] = out['platelets3'] // 10
    out['serum_sodium2'] = out['serum_sodium'] // 20

    base_columns = [
        'age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
        'ejection_fraction', 'high_blood_pressure', 'platelets',
        'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
    ]
    binned_columns = [
        'age2', 'creatinine_phosphokinase2', 'creatinine_phosphokinase3',
        'ejection_fraction2', 'platelets2', 'platelets3', 'platelets4',
        'serum_sodium2',
    ]

    # Build all interaction columns first; raw arrays avoid index alignment.
    interactions = {
        base + "_" + binned: (
            out[base].astype('str') + "_" + out[binned].astype('str')
        ).to_numpy()
        for base in base_columns
        for binned in binned_columns
    }

    # If the function is re-run on an already augmented frame, replace the old
    # interaction columns instead of producing duplicate column names.
    stale = [name for name in interactions if name in out.columns]
    if stale:
        out = out.drop(columns=stale)

    return pd.concat([out, pd.DataFrame(interactions, index=out.index)], axis=1)


In [None]:
# Rebind `data` to the frame returned by feature_creation and preview it
data = feature_creation(data)
data.head()

In [None]:
# Categorical columns: every column whose dtype is not one of the numeric
# dtypes listed below (the string interaction features end up here).
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
columns = data.columns.values.tolist()
cat_columns = [col for col in columns if data[col].dtype not in numerics]

print(cat_columns)

In [None]:
# Label-encode each categorical column in place. Values are cast to str first,
# and a fresh encoder is fitted per column.
for col in cat_columns:
    if col not in data.columns:
        continue
    le = LabelEncoder()
    data[col] = le.fit_transform(list(data[col].astype(str).values))
data.head()


In [None]:
# Report the frame's shape now that the engineered columns have been added
print(f"After feature number increased shape is :{data.shape}")

# Shuffle dataset

In [None]:
# Shuffle all rows and renumber the index. The fixed seed keeps the row order
# (and everything computed downstream) reproducible across kernel restarts.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Feature and Label data

In [None]:
# Separate the DEATH_EVENT target from the predictor columns
label = data['DEATH_EVENT']
features = data.drop(columns='DEATH_EVENT')

# Train and Valid

In [None]:
# Stratified 80/20 split. The fixed seed makes the split reproducible, so
# scores from different runs and studies are comparable.
x_train, x_valid, y_train, y_valid = train_test_split(
    features, label, test_size=.2, stratify=label, random_state=42
)


# Data Normalization and Standardization

In [None]:
# Normalization: fit the min-max scaler on the training split only, then apply
# the same fitted ranges to the validation split. Re-fitting on the validation
# data would scale the two splits differently and leak validation statistics.
scaler = MinMaxScaler()
normalized_xtrain = scaler.fit_transform(x_train)
normalized_xtest = scaler.transform(x_valid)


# Standardization: use the StandardScaler (`std`), not the min-max `scaler`,
# again fitted on the training split only.
std = StandardScaler()
standardized_xtrain = std.fit_transform(normalized_xtrain)
standardized_xtest = std.transform(normalized_xtest)

# Automatic Hyperparameter Tuning with Optuna

### XGBoost classifier

In [None]:
def objective(trial):
    """Optuna objective: train an XGBoost booster and score it on validation data.

    Reads the notebook-level ``standardized_xtrain`` / ``standardized_xtest``
    matrices and the ``y_train`` / ``y_valid`` labels.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the rounded predictions on the validation split. Note that
        the pruning callback watches "validation-auc", not this accuracy.
    """
    train_matrix = xgb.DMatrix(standardized_xtrain, label=y_train)
    valid_matrix = xgb.DMatrix(standardized_xtest, label=y_valid)

    booster = trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"])
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "booster": booster,
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    # Parameters sampled only for the two tree-based boosters.
    if booster in ("gbtree", "dart"):
        param.update(
            {
                "max_depth": trial.suggest_int("max_depth", 1, 9),
                "eta": trial.suggest_float("eta", 1e-8, 1.0, log=True),
                "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
                "grow_policy": trial.suggest_categorical(
                    "grow_policy", ["depthwise", "lossguide"]
                ),
            }
        )

    # Extra parameters sampled only for the DART booster.
    if booster == "dart":
        param.update(
            {
                "sample_type": trial.suggest_categorical(
                    "sample_type", ["uniform", "weighted"]
                ),
                "normalize_type": trial.suggest_categorical(
                    "normalize_type", ["tree", "forest"]
                ),
                "rate_drop": trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
                "skip_drop": trial.suggest_float("skip_drop", 1e-8, 1.0, log=True),
            }
        )

    # The callback name matches the "validation" eval set and the "auc" metric.
    pruner_hook = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    model = xgb.train(
        param,
        train_matrix,
        evals=[(valid_matrix, "validation")],
        callbacks=[pruner_hook],
    )

    rounded = np.rint(model.predict(valid_matrix))
    return accuracy_score(y_valid, rounded)


In [None]:
# XGBoost study: maximise the value returned by `objective` over 300 trials,
# using a MedianPruner configured with 5 warm-up steps.
# NOTE(review): no sampler seed is set, so results vary between runs.
study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
    )
study.optimize(objective, n_trials=300)

In [None]:
# Summarise the XGBoost study: trial count, best trial object, its parameters
# and its objective value (validation accuracy).
number_trial = len(study.trials)
print('number of trial :\n', number_trial, '\n')

best_trial = study.best_trial
print('best trials :\n', best_trial, '\n')

best_params = study.best_params
print('best parameters :\n', best_params, '\n')

best_value = study.best_value
print('best values :\n', best_value, '\n')


### CatBoost classifier

In [None]:
def objective(trial):
    """Optuna objective: fit a CatBoost classifier and score it on validation data.

    Reads the notebook-level ``x_train`` / ``x_valid`` frames (unscaled) and
    the ``y_train`` / ``y_valid`` labels.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the predicted labels on the validation split.
    """
    search_space = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    # Each bootstrap type has its own extra knob; "MVS" samples none here.
    bootstrap = search_space["bootstrap_type"]
    if bootstrap == "Bayesian":
        search_space["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap == "Bernoulli":
        search_space["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    classifier = cb.CatBoostClassifier(**search_space)
    classifier.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=0,
        early_stopping_rounds=100,
    )

    hard_labels = np.rint(classifier.predict(x_valid))
    return accuracy_score(y_valid, hard_labels)


In [None]:
# CatBoost study: maximise validation accuracy. `optimize` is given both a
# trial budget (300) and a timeout (600), so it ends at whichever comes first.
# This rebinds `study`, replacing the XGBoost study object.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=300, timeout=600)

In [None]:
# Summarise the CatBoost study: trial count, best value, best parameters
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

### LightGBM classifier

In [None]:
def objective(trial):
    """Optuna objective: train a LightGBM booster and score it on validation data.

    Reads the notebook-level ``x_train`` / ``x_valid`` frames (unscaled) and
    the ``y_train`` / ``y_valid`` labels.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the rounded predicted probabilities on the validation
        split. Note that the pruning callback watches "auc", not this accuracy.
    """
    dtrain = lgb.Dataset(x_train, label=y_train)
    dvalid = lgb.Dataset(x_valid, label=y_valid)

    param = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    # Add a callback for pruning; it reports the "auc" metric of the
    # validation set to the study's pruner.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")

    # `verbose_eval` is intentionally not passed: the keyword was removed from
    # lgb.train in LightGBM 4.0 and raises TypeError there.
    gbm = lgb.train(param, dtrain, valid_sets=[dvalid], callbacks=[pruning_callback])

    preds = gbm.predict(x_valid)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_valid, pred_labels)
    return accuracy


In [None]:
# LightGBM study: maximise validation accuracy over 300 trials with a
# MedianPruner configured with 10 warm-up steps. This rebinds `study`, so the
# plots that follow describe this study.
study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="maximize"
    )
study.optimize(objective, n_trials=300)
  

In [None]:
# Summarise the LightGBM study: trial count, best value, best parameters
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Optimization history of the current `study` (the LightGBM study when the
# notebook is run top to bottom). See optuna.visualization.plot_optimization_history.
plot_optimization_history(study)

In [None]:
# Learning curves (intermediate values reported for pruning) of the trials.
# See optuna.visualization.plot_intermediate_values.
plot_intermediate_values(study)

In [None]:
# Parallel-coordinate view of all sampled hyperparameters.
# See optuna.visualization.plot_parallel_coordinate.
plot_parallel_coordinate(study)

In [None]:
# Parallel-coordinate view restricted to the two bagging hyperparameters
# sampled by the LightGBM objective.
plot_parallel_coordinate(study, params=["bagging_freq", "bagging_fraction"])

In [None]:
# Contour plot of the objective over the two bagging hyperparameters
plot_contour(study, params=["bagging_freq", "bagging_fraction"])

In [None]:
# Slice plot: one panel per hyperparameter. See optuna.visualization.plot_slice.
plot_slice(study)

In [None]:
# Slice plot restricted to the two bagging hyperparameters
plot_slice(study, params=["bagging_freq", "bagging_fraction"])

In [None]:
# Hyperparameter importances for the current study.
# See optuna.visualization.plot_param_importances.
plot_param_importances(study)

In [None]:
# Empirical distribution function of the objective values.
# See optuna.visualization.plot_edf.
plot_edf(study)

# Using Optuna with Keras

In [None]:
def create_model(trial):
    """Build and compile an MLP whose architecture is sampled by Optuna.

    The trial chooses the number of hidden layers (1-3) and, per layer, the
    unit count (4-128, log scale) and the dropout rate (0.2-0.5). The output
    layer is a 2-unit softmax, and the loss is ``categorical_crossentropy``, so
    the labels must be one-hot encoded.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The compiled ``Sequential`` model. No input shape is declared, so the
        weights are created on the first call to ``fit``.
    """
    n_layers = trial.suggest_int("n_layers", 1, 3)
    model = Sequential()
    for i in range(n_layers):
        num_hidden = trial.suggest_int("n_units_l{}".format(i), 4, 128, log=True)
        model.add(Dense(num_hidden, activation="relu"))
        dropout = trial.suggest_float("dropout_l{}".format(i), 0.2, 0.5)
        model.add(Dropout(rate=dropout))
    model.add(Dense(2, activation="softmax"))

    # Compile with a sampled learning rate. `learning_rate=` is the supported
    # keyword; the old `lr=` alias is deprecated and rejected by newer Keras.
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=keras.optimizers.RMSprop(learning_rate=lr),
        metrics=["accuracy"],
    )
    return model
    

In [None]:
def create_model2(trial):
    """Build and compile a fixed two-hidden-layer MLP trained with SGD.

    Only the optimizer settings (learning rate, momentum) come from the trial.
    The hidden layers are fixed at 116 and 58 units. The loss is
    ``sparse_categorical_crossentropy``, so labels must be integer class ids,
    not one-hot vectors.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Hyperparameters to be tuned by Optuna.
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    momentum = trial.suggest_float("momentum", 0.0, 1.0)
    # NOTE(review): `units` is sampled but never used below, so it is a dead
    # search dimension. It is kept to leave the recorded trial parameters
    # unchanged. TODO: either wire it into a layer or drop the suggestion.
    units = trial.suggest_categorical("units", [32, 64, 128, 256, 512])

    # Compose the network: two hidden ReLU layers and a 2-class softmax output.
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(units=116, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(units=58, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(2, activation=tf.nn.softmax))

    # Compile model. `learning_rate=` is the supported keyword; the old `lr=`
    # alias is deprecated and rejected by newer Keras.
    model.compile(
        optimizer=tf.keras.optimizers.SGD(learning_rate=lr, momentum=momentum, nesterov=True),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [None]:
def objective(trial):
    """Optuna objective: train the sampled Keras MLP and return validation accuracy.

    Reads the notebook-level ``standardized_xtrain`` / ``standardized_xtest``
    matrices and the ``y_train`` / ``y_valid`` labels, and builds the network
    with ``create_model``.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to report
            ``val_accuracy`` for pruning.

    Returns:
        Accuracy of the trained model on the validation split.
    """
    # Clear clutter from previous session graphs.
    keras.backend.clear_session()

    # Feed the network the scaled feature matrices. Dividing the raw columns
    # by 255 is an 8-bit pixel normalisation; it does not bring these tabular
    # features onto a common scale.
    x_train_ = standardized_xtrain.astype('float32')
    x_valid_ = standardized_xtest.astype('float32')

    # One-hot labels, as required by the categorical_crossentropy loss.
    y_train_ = keras.utils.to_categorical(y_train.values, 2)
    y_valid_ = keras.utils.to_categorical(y_valid.values, 2)

    # Generate our trial model.
    model = create_model(trial)

    # Fit the model on the training data.
    # The KerasPruningCallback checks for pruning condition every epoch.
    model.fit(
        x_train_,
        y_train_,
        batch_size=32,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(patience=3),
            KerasPruningCallback(trial, "val_accuracy"),
        ],
        epochs=20,
        validation_data=(x_valid_, y_valid_),
        verbose=1,
    )

    # Evaluate the model accuracy on the validation set; score is
    # [loss, accuracy] given the metrics the model was compiled with.
    score = model.evaluate(x_valid_, y_valid_, verbose=0)
    return score[1]

In [None]:
# Keras study. The `__main__` guard is true in a notebook kernel, so this
# always runs here. It first emits a compatibility warning about the
# standalone `keras` package, then maximises validation accuracy over 100
# trials with a default MedianPruner. This rebinds `study`.
if __name__ == "__main__":
    warnings.warn(
        "Recent Keras release (2.4.0) simply redirects all APIs "
        "in the standalone keras package to point to tf.keras. "
        "There is now only one Keras: tf.keras. "
        "There may be some breaking changes for some workflows by upgrading to keras 2.4.0. "
        "Test before upgrading. "
        "REF:https://github.com/keras-team/keras/releases/tag/2.4.0"
    )
    study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=100)

In [None]:
# Split the Keras study's trials by final state so that pruned and completed
# counts can be reported separately.
pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

# Best trial: its objective value (validation accuracy) and sampled parameters
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Optimization history of the current `study` (the Keras study when the
# notebook is run top to bottom)
plot_optimization_history(study)

In [None]:
# Learning curves of the Keras trials (the per-epoch val_accuracy values
# reported by the pruning callback). See optuna.visualization.plot_intermediate_values.
plot_intermediate_values(study)

In [None]:
# Parallel-coordinate view of all sampled hyperparameters.
# See optuna.visualization.plot_parallel_coordinate.
plot_parallel_coordinate(study)

In [None]:
# Slice plot: one panel per hyperparameter. See optuna.visualization.plot_slice.
plot_slice(study)

In [None]:
# Hyperparameter importances for the current study.
# See optuna.visualization.plot_param_importances.
plot_param_importances(study)

In [None]:
# Empirical distribution function of the objective values.
# See optuna.visualization.plot_edf.
plot_edf(study)