In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path_str in (os.path.join(root, name) for name in names):
        print(path_str)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#!pip install optuna

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix
import random
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import time

# environment: {'kaggle','colab','jupyter'}
env = 'kaggle'

# parameter tuning: when True the GridSearchCV sweeps and Optuna studies are run,
# otherwise the stored parameter sets are used
param_tuning = False

# random seed
RANDOM_SEED = 17

# Seed the global generators as well: the tuning helpers draw their subsamples
# with random.sample, and scikit-learn falls back to NumPy's global generator
# whenever an estimator or splitter is given no random_state.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# models to run
models = ['gb','hgb','xgb','ada']
# alternative for a quick run: models = ['xgb']

# start time (used at the end of the notebook to report the total run time)
start = time.time()

# Read data

In [None]:
# Resolve the input directory for the configured environment
if env == 'kaggle':
  path = '/kaggle/input/Kannada-MNIST/'

elif env == 'colab':
  # Colab-only dependency, so it is imported only in this branch
  from google.colab import drive
  drive.mount('/content/drive')

  path = '/content/drive/MyDrive/01_Tanulmanyok/ELTE AI&ML/02_Spring/Data Mining/Homeworks/Homework3/input/'

elif env == 'jupyter':
  path = './input/'

else:
  # Fail here with a clear message instead of a NameError on `path` below
  raise ValueError(f"Unknown env {env!r}; expected 'kaggle', 'colab' or 'jupyter'")

train_data = pd.read_csv(path + 'train.csv')
test_data = pd.read_csv(path + 'test.csv')

print(train_data.shape)
train_data.head()

## Train-test split

In [None]:
# Features are every column except 'label'; the target is the 'label' column
X, y = (train_data.drop(['label'], axis=1), train_data.label)

# Fixed seed -> the same 80/20 split on every run; stratify keeps the class
# proportions identical in the training and validation parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y)

X_train.shape

# Auxiliary functions

In [None]:
# function to create plots about the model results
def get_figures(model, X_train, X_test, y_train, y_test):
  """Print train/validation accuracy and plot a confusion matrix and ROC curve(s).

  Works for binary and multi-class classifiers. For two classes a single ROC
  curve of the second class in ``model.classes_`` is drawn; for more classes one
  one-vs-rest curve per class is drawn and the reported AUC is the
  one-vs-rest AUC computed by ``roc_auc_score(..., multi_class='ovr')``.

  Parameters
  ----------
  model : fitted classifier exposing ``predict``, ``predict_proba`` and ``classes_``
  X_train, y_train : data used for the printed train accuracy
  X_test, y_test : data used for the printed validation accuracy and both plots

  Returns
  -------
  None. The plots are drawn on a newly created matplotlib figure.
  """
  y_pred = model.predict(X_test)
  tr_pred = model.predict(X_train)

  print(f'Train accuracy: {accuracy_score(y_train, tr_pred)}')
  print(f'Val accuracy: {accuracy_score(y_test, y_pred)}')

  classes = list(model.classes_)
  is_binary = len(classes) == 2
  y_true = np.asarray(y_test)
  y_prob = model.predict_proba(X_test)  # one column per entry of model.classes_

  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
  plt.subplots_adjust(wspace=0.4)

  # confusion_matrix (rows/columns follow the order of model.classes_)
  C = confusion_matrix(y_true, y_pred, labels=classes)
  tick_labels = ['False', 'True'] if is_binary else classes
  sns.heatmap(C, ax=ax1, annot=True, fmt='g', cmap='icefire_r',
              xticklabels=tick_labels, yticklabels=tick_labels)
  ax1.set_ylabel('Actual')
  ax1.set_xlabel('Predicted')
  ax1.set_title('Confusion matrix')
  plt.setp(ax1.get_yticklabels(), va="center")

  # ROC curve(s); every line carries its own label so the legend cannot be mixed up
  ax2.plot([0, 1], [0, 1], 'k--', label='Baseline')
  if is_binary:
    pos_prob = y_prob[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true, pos_prob, pos_label=classes[1])
    ax2.plot(fpr, tpr, alpha=0.6, label=type(model).__name__)
    auc = roc_auc_score(y_true, pos_prob)
    ax2.set_title('ROC curve')
  else:
    for col, cls in enumerate(classes):
      # one-vs-rest: the current class is the positive one, all others negative
      fpr, tpr, thresholds = roc_curve((y_true == cls).astype(int), y_prob[:, col])
      ax2.plot(fpr, tpr, alpha=0.6, label=f'class {cls}')
    auc = roc_auc_score(y_true, y_prob, multi_class='ovr', labels=classes)
    ax2.set_title(f'ROC curves (one-vs-rest) - {type(model).__name__}')
  ax2.set_xlabel('False positive rate')
  ax2.set_ylabel('True positive rate')
  ax2.legend()
  num = round(auc*100,2)
  ax2.text(0.65, 0.25, 'AUC='+str(num)+'%', fontsize=15)

In [None]:
def get_accuracy_by_params(model, param, param_vals, n_sample=10000):
  """Grid-search a single hyperparameter on a random subsample and plot the result.

  Draws up to ``n_sample`` rows from the notebook-level ``X_train``/``y_train``,
  runs a 3-fold ``GridSearchCV`` (accuracy scoring) over ``param_vals`` for the
  hyperparameter ``param`` of ``model``, prints the mean test score per value
  and plots those scores against the tried values.

  Parameters
  ----------
  model : unfitted estimator to tune
  param : name of the hyperparameter to vary
  param_vals : list of values to try for ``param``
  n_sample : number of training rows to use; capped at the size of ``X_train``

  Returns
  -------
  None.
  """
  # random.sample raises ValueError when asked for more rows than are available
  n_rows = min(n_sample, len(X_train))
  idxs = random.sample(list(X_train.index), n_rows)
  grid = GridSearchCV(estimator=model, param_grid={param:param_vals}, cv=3, n_jobs=-1, scoring='accuracy')
  grid.fit(X_train.loc[idxs], y_train.loc[idxs])

  mean_scores = grid.cv_results_['mean_test_score']
  print(mean_scores)

  fig, ax = plt.subplots(1,1,figsize=(20,6))
  ax.plot(mean_scores, marker='o')
  ax.set_xticks(range(len(param_vals)))
  ax.set_xticklabels(param_vals)
  ax.set_xlabel(param)
  ax.set_ylabel('Mean CV accuracy')
  ax.set_title(f'{type(model).__name__}: accuracy by {param}')

# Modelling

## Gradient Boosting

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

### Initial hyperparameter tuning with GridSearchCV

In [None]:
# Gradient Boosting: sweep max_depth on a 1000-row subsample (tuning runs only)
if 'gb' in models and param_tuning:
  get_accuracy_by_params(GradientBoostingClassifier(), param='max_depth', param_vals=[4, 5, 6, 7, 8], n_sample=1000)

In [None]:
# Gradient Boosting: sweep n_estimators on a 1000-row subsample (tuning runs only)
if 'gb' in models and param_tuning:
  get_accuracy_by_params(GradientBoostingClassifier(), param='n_estimators', param_vals=[100, 150, 200], n_sample=1000)

In [None]:
# Gradient Boosting: sweep learning_rate on a 1000-row subsample (tuning runs only)
if 'gb' in models and param_tuning:
  get_accuracy_by_params(GradientBoostingClassifier(), param='learning_rate', param_vals=[0.05, 0.1, 0.5, 1], n_sample=1000)

### Final hyperparameter tuning with Optuna

In [None]:
def objective_gb(trial):
  """Optuna objective for GradientBoostingClassifier.

  Asks the trial for n_estimators, max_depth and learning_rate, cross-validates
  (3 folds) the resulting model on 1000 randomly drawn training rows and
  returns the negated mean score, so a lower objective means a better model.
  """
  search_point = {
      'n_estimators': trial.suggest_int('n_estimators', 150, 200),
      'max_depth': trial.suggest_int('max_depth', 4, 5),
      'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.5),
  }
  estimator = GradientBoostingClassifier(random_state=RANDOM_SEED, **search_point)

  # a fresh random subsample of the notebook-level training data for every trial
  rows = random.sample(list(X_train.index), 1000)
  fold_scores = cross_val_score(estimator, X_train.loc[rows], y_train.loc[rows], cv=3)
  return -np.mean(fold_scores)

In [None]:
# Pick the Gradient Boosting parameters: stored values by default,
# a fresh 10-trial Optuna study when tuning is switched on.
if not (param_tuning and 'gb' in models):
  # value noted next to these parameters: 0.984666 (presumably the score they reached)
  params = {'learning_rate': 0.1333, 'max_depth': 5, 'n_estimators': 161}
else:
  gb_study = optuna.create_study()
  gb_study.optimize(objective_gb, n_trials=10, timeout=2000)
  params = gb_study.best_params
print(params)

In [None]:
# Fit the final Gradient Boosting model on the full training split
if 'gb' in models:
  # seeded like the estimator in objective_gb, so the final fit is reproducible
  gb = GradientBoostingClassifier(random_state=RANDOM_SEED)
  gb.set_params(**params)
  gb.fit(X_train, y_train)

### Evaluation

In [None]:
# Validation accuracy of Gradient Boosting; the predictions stay in y_pred_gb
if 'gb' in models:
  y_pred_gb = gb.predict(X_test)
  print(accuracy_score(y_true=y_test, y_pred=y_pred_gb))

## Hist-Gradient Boosting

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html

### Initial hyperparameter tuning with GridSearchCV

In [None]:
# Hist-Gradient Boosting: sweep learning_rate on a 10000-row subsample (tuning runs only)
if 'hgb' in models and param_tuning:
  get_accuracy_by_params(HistGradientBoostingClassifier(), param='learning_rate', param_vals=[0.05, 0.1, 0.5, 1], n_sample=10000)

In [None]:
# Hist-Gradient Boosting: sweep max_depth with learning_rate fixed at 0.1 (tuning runs only)
if 'hgb' in models and param_tuning:
  hgb_base = HistGradientBoostingClassifier(learning_rate=0.1)
  get_accuracy_by_params(hgb_base, param='max_depth', param_vals=[5, 7, 10, 15, 25, 50], n_sample=10000)

In [None]:
# Hist-Gradient Boosting: sweep max_leaf_nodes with learning_rate=0.1 and max_depth=10 (tuning runs only)
if 'hgb' in models and param_tuning:
  hgb_base = HistGradientBoostingClassifier(learning_rate=0.1, max_depth=10)
  get_accuracy_by_params(hgb_base, param='max_leaf_nodes', param_vals=[30, 50, 75, 100], n_sample=10000)

### Final hyperparameter tuning with Optuna

In [None]:
def objective_hgb(trial):
  """Optuna objective for HistGradientBoostingClassifier.

  Asks the trial for max_leaf_nodes, max_depth and learning_rate, cross-validates
  (3 folds) the resulting model on 1000 randomly drawn training rows and
  returns the negated mean score, so a lower objective means a better model.
  """
  search_point = {
      'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 70, 80),
      'max_depth': trial.suggest_int('max_depth', 8, 12),
      'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
  }
  estimator = HistGradientBoostingClassifier(random_state=RANDOM_SEED, **search_point)

  # a fresh random subsample of the notebook-level training data for every trial
  rows = random.sample(list(X_train.index), 1000)
  fold_scores = cross_val_score(estimator, X_train.loc[rows], y_train.loc[rows], cv=3)
  return -np.mean(fold_scores)

In [None]:
# Pick the Hist-Gradient Boosting parameters: stored values by default,
# a fresh 10-trial Optuna study when tuning is switched on.
if not (param_tuning and 'hgb' in models):
  # value noted next to these parameters: 0.9846666 (presumably the score they reached)
  params = {'learning_rate': 0.136, 'max_depth': 9, 'max_leaf_nodes': 78}
else:
  hgb_study = optuna.create_study()
  hgb_study.optimize(objective_hgb, n_trials=10, timeout=2000)
  params = hgb_study.best_params
print(params)

In [None]:
# Fit the final Hist-Gradient Boosting model on the full training split
if 'hgb' in models:
  # seeded like the estimator in objective_hgb, so the final fit is reproducible
  hgb = HistGradientBoostingClassifier(random_state=RANDOM_SEED)
  hgb.set_params(**params)
  hgb.fit(X_train, y_train)

### Evaluation

In [None]:
# Validation accuracy of Hist-Gradient Boosting; the predictions stay in y_pred_hgb
if 'hgb' in models:
  # must be the hgb model: predicting with gb here would report (and later
  # vote with) the Gradient Boosting predictions a second time
  y_pred_hgb = hgb.predict(X_test)
  print(accuracy_score(y_test, y_pred_hgb))

## XGB

https://xgboost.readthedocs.io/en/stable/python/python_api.html

### Initial hyperparameter tuning with GridSearchCV

In [None]:
# XGBoost: sweep max_depth on a 1000-row subsample (tuning runs only)
if 'xgb' in models and param_tuning:
  get_accuracy_by_params(XGBClassifier(), param='max_depth', param_vals=[4, 5, 6, 7, 8], n_sample=1000)

In [None]:
# XGBoost: sweep n_estimators with max_depth fixed at 6 (tuning runs only)
if 'xgb' in models and param_tuning:
  xgb_base = XGBClassifier(max_depth=6)
  get_accuracy_by_params(xgb_base, param='n_estimators', param_vals=[50, 100, 150, 200], n_sample=1000)

In [None]:
# XGBoost: sweep learning_rate on a 1000-row subsample (tuning runs only)
if 'xgb' in models and param_tuning:
  get_accuracy_by_params(XGBClassifier(), param='learning_rate', param_vals=[0.05, 0.1, 0.5, 1], n_sample=1000)

In [None]:
# XGBoost: sweep colsample_bytree on a 1000-row subsample (tuning runs only)
if 'xgb' in models and param_tuning:
  get_accuracy_by_params(XGBClassifier(), param='colsample_bytree', param_vals=[0.1, 0.2, 0.3, 0.4], n_sample=1000)

In [None]:
# XGBoost: sweep subsample on a 1000-row subsample of the training data (tuning runs only)
if 'xgb' in models and param_tuning:
  get_accuracy_by_params(XGBClassifier(), param='subsample', param_vals=[0.2, 0.3, 0.4, 0.5], n_sample=1000)

### Final hyperparameter tuning with Optuna

In [None]:
def objective_xgb(trial):
  """Optuna objective for XGBClassifier.

  Asks the trial for n_estimators, max_depth, learning_rate, colsample_bytree
  and subsample, cross-validates (3 folds) the resulting model on 1000 randomly
  drawn training rows and returns the negated mean score, so a lower objective
  means a better model.
  """
  search_point = {
      'n_estimators': trial.suggest_int('n_estimators', 140, 180),
      'max_depth': trial.suggest_int('max_depth', 5, 6),
      'learning_rate': trial.suggest_float('learning_rate', 0.3, 0.7),
      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.05, 0.15),
      'subsample': trial.suggest_float('subsample', 0.5, 0.6),
  }
  estimator = XGBClassifier(random_state=RANDOM_SEED, **search_point)

  # a fresh random subsample of the notebook-level training data for every trial
  rows = random.sample(list(X_train.index), 1000)
  fold_scores = cross_val_score(estimator, X_train.loc[rows], y_train.loc[rows], cv=3)
  return -np.mean(fold_scores)

In [None]:
# Pick the XGBoost parameters: a fresh 10-trial Optuna study when tuning is
# switched on, the stored values otherwise.
if param_tuning and 'xgb' in models:
  xgb_study = optuna.create_study()
  # the XGBoost study must optimise the XGBoost objective (not objective_gb)
  xgb_study.optimize(objective_xgb, n_trials=10, timeout=2000)
  params = xgb_study.best_params
else:
  # NOTE(review): these stored values hold only the three hyperparameters of
  # objective_gb and lie inside its search ranges, not those of objective_xgb,
  # so they were presumably found with the wrong objective. Re-run the tuning
  # and refresh them. Value noted next to them: 0.98475.
  params = {'learning_rate': 0.1784, 'max_depth': 4, 'n_estimators': 184}
print(params)

In [None]:
# Fit the final XGBoost model on the full training split
if 'xgb' in models:
  # seeded like the estimator in objective_xgb, so the final fit is reproducible
  xgb = XGBClassifier(random_state=RANDOM_SEED)
  xgb.set_params(**params)
  xgb.fit(X_train, y_train)

### Evaluation

In [None]:
# Validation accuracy of XGBoost; the predictions stay in y_pred_xgb
if 'xgb' in models:
  y_pred_xgb = xgb.predict(X_test)
  print(accuracy_score(y_true=y_test, y_pred=y_pred_xgb))

## AdaBoost

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

### Initial hyperparameter tuning with GridSearchCV

In [None]:
# AdaBoost: sweep learning_rate on a 1000-row subsample (tuning runs only)
if 'ada' in models and param_tuning:
  get_accuracy_by_params(AdaBoostClassifier(), param='learning_rate', param_vals=[0.001, 0.05, 0.1, 0.5, 1], n_sample=1000)

In [None]:
# AdaBoost: sweep n_estimators on a 1000-row subsample (tuning runs only)
if 'ada' in models and param_tuning:
  get_accuracy_by_params(AdaBoostClassifier(), param='n_estimators', param_vals=[10, 50, 100, 150, 200, 250], n_sample=1000)

In [None]:
# AdaBoost: compare the two boosting algorithms on a 1000-row subsample (tuning runs only)
# NOTE(review): 'SAMME.R' may not be accepted by newer scikit-learn releases - confirm
# against the installed version before enabling tuning.
if 'ada' in models and param_tuning:
  get_accuracy_by_params(AdaBoostClassifier(), param='algorithm', param_vals=['SAMME', 'SAMME.R'], n_sample=1000)

### Final hyperparameter tuning with Optuna

In [None]:
def objective_ada(trial):
  """Optuna objective for AdaBoostClassifier.

  Asks the trial for n_estimators, algorithm (only 'SAMME' is offered) and
  learning_rate, cross-validates (3 folds) the resulting model on 1000 randomly
  drawn training rows and returns the negated mean score, so a lower objective
  means a better model.
  """
  search_point = {
      'n_estimators': trial.suggest_int('n_estimators', 230, 270),
      'algorithm': trial.suggest_categorical('algorithm', ['SAMME']),
      'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.07),
  }
  estimator = AdaBoostClassifier(random_state=RANDOM_SEED, **search_point)

  # a fresh random subsample of the notebook-level training data for every trial
  rows = random.sample(list(X_train.index), 1000)
  fold_scores = cross_val_score(estimator, X_train.loc[rows], y_train.loc[rows], cv=3)
  return -np.mean(fold_scores)

In [None]:
# Pick the AdaBoost parameters: stored values by default,
# a fresh 10-trial Optuna study when tuning is switched on.
if not (param_tuning and 'ada' in models):
  # value noted next to these parameters: 0.54658 (presumably the score they reached)
  params = {'learning_rate': 0.063588, 'n_estimators': 232, 'algorithm': 'SAMME'}
else:
  ada_study = optuna.create_study()
  ada_study.optimize(objective_ada, n_trials=10, timeout=2000)
  params = ada_study.best_params
print(params)

In [None]:
# Fit the final AdaBoost model on the full training split
if 'ada' in models:
  # seeded like the estimator in objective_ada, so the final fit is reproducible
  ada = AdaBoostClassifier(random_state=RANDOM_SEED)
  ada.set_params(**params)
  ada.fit(X_train, y_train)

### Evaluation

In [None]:
# Validation accuracy of AdaBoost; the predictions stay in y_pred_ada
if 'ada' in models:
  y_pred_ada = ada.predict(X_test)
  print(accuracy_score(y_true=y_test, y_pred=y_pred_ada))

## Eval of mixed prediction

In [None]:
def get_most_common_vals(list_of_values):
  """Majority vote across several equally long prediction sequences.

  Parameters
  ----------
  list_of_values : list of 1-D sequences/arrays of equal length, one per
      predictor, where position i of every sequence refers to the same sample.

  Returns
  -------
  list with one entry per sample: the value predicted most often for it.
  Ties are resolved in favour of the predictor listed first, so the result
  does not depend on set iteration order.
  """
  winners = []
  for sample_votes in np.vstack(list_of_values).T:
    votes = list(sample_votes)
    # max() returns the first maximal element, i.e. on a tie the vote of the
    # earliest-listed predictor among the tied values
    winners.append(max(votes, key=votes.count))
  return winners

In [None]:
# Majority vote over the validation predictions of the models that were run.
# Collecting them conditionally avoids a NameError when `models` leaves out
# one of the three voters (order kept: xgb, gb, hgb).
val_preds = []
if 'xgb' in models:
  val_preds.append(y_pred_xgb)
if 'gb' in models:
  val_preds.append(y_pred_gb)
if 'hgb' in models:
  val_preds.append(y_pred_hgb)

if val_preds:
  y_pred = get_most_common_vals(val_preds)
  print(accuracy_score(y_test, y_pred))
else:
  print('No ensemble member (xgb/gb/hgb) was run; skipping the mixed prediction.')

# Prediction

In [None]:
# Split the test file into the id column and the feature columns
ids, test_set = test_data.id, test_data.drop(['id'], axis=1)

# Predict only with the models that were actually fitted, so that a reduced
# `models` list does not end in a NameError (order kept: xgb, gb, hgb).
test_preds = []
if 'xgb' in models:
  xgb_preds = xgb.predict(test_set)
  test_preds.append(xgb_preds)
if 'gb' in models:
  gb_preds = gb.predict(test_set)
  test_preds.append(gb_preds)
if 'hgb' in models:
  hgb_preds = hgb.predict(test_set)
  test_preds.append(hgb_preds)

if not test_preds:
  raise RuntimeError('No ensemble member (xgb/gb/hgb) was run; nothing to predict with.')

# final label per test row = majority vote of the available models
final_preds = get_most_common_vals(test_preds)

In [None]:
# /kaggle/working/ exists only on Kaggle; in the other environments the
# submission is written to the current working directory instead.
submission_dir = '/kaggle/working/' if env == 'kaggle' else './'
pd.Series(final_preds, index=ids, name='label').to_csv(submission_dir + 'submission.csv')

In [None]:
end = time.time()
# Format by hand: strftime('%H') on time.gmtime() would wrap around after 24 hours
elapsed_seconds = int(end - start)
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
print('Time of execution:', f'{hours:02d}:{minutes:02d}:{seconds:02d}')