In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading

In [None]:
# Read the competition training set and the Dig-MNIST set (used below for validation).
train_csv_path = '/kaggle/input/Kannada-MNIST/train.csv'
valid_csv_path = '/kaggle/input/Kannada-MNIST/Dig-MNIST.csv'
train_data = pd.read_csv(train_csv_path)
valid_data = pd.read_csv(valid_csv_path)

In [None]:
import random
# Seed Python's and NumPy's global RNGs. The two shuffles below carry their own
# explicit random_state, so they do not depend on these seeds.
random.seed(32)
np.random.seed(42)

# frac=1 keeps every row, i.e. this is a pure shuffle; the index is then renumbered from 0.
train_data = train_data.sample(frac=1, random_state=52).reset_index(drop=True)
valid_data = valid_data.sample(frac=1, random_state=62).reset_index(drop=True)

# Separate the feature columns from the 'label' target column.
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']
X_valid = valid_data.drop(columns=['label'])
y_valid = valid_data['label']
#X_train = X_train/255. # We don't have to normalize our data
#X_valid = X_valid/255. # because https://datascience.stackexchange.com/questions/60950/is-it-necessary-to-normalize-data-for-xgboost 

In [None]:
len(X_train), len(X_valid)

In [None]:
import time

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import accuracy_score
import random

# Hyperparameter tuning

I used this guide as a reference: https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663

My idea is to run a random search in several phases, narrowing the parameter ranges after each phase.

1. Train a lot of models on a small training set (a random subset of the original train.csv) ---> evaluate them on the validation set (Dig-MNIST.csv) ---> keep the best ones.
2. The best models define a narrower range for each parameter. We'll train a lot of models (but fewer than in the 1st step) with parameters drawn from that range on a medium-size dataset ---> evaluate them on the validation set (Dig-MNIST.csv) ---> keep the best ones.
3. These again define a narrower range of parameters. We'll train some models with parameters drawn randomly from this range and choose the best model for the competition.


We'll tune the following parameters:

* `max_depth`
* `colsample_bytree`
* `n_estimators`
* `learning_rate`
* `subsample`
* `reg_lambda`

In [None]:
def train(train_set_size,max_depth,colsample_bytree,n_estimators,learning_rate,subsample,reg_lambda,random_state=None):
    """
    Fit an XGBoost classifier on all, or a random fraction, of the global ``train_data``.

    Parameters
    ----------
    train_set_size : float
        Fraction of the rows of ``train_data`` to fit on. A value below 1.0
        draws a random subset of that fraction; 1.0 or more uses every row.
    max_depth, colsample_bytree, n_estimators, learning_rate, subsample, reg_lambda
        Forwarded unchanged to ``XGBClassifier``.
    random_state : int, optional
        Seed for the row subsample and for ``XGBClassifier``. With the default
        ``None`` nothing extra is passed to the classifier and the subsample is
        drawn exactly as before (no explicit seed).

    Returns
    -------
    XGBClassifier
        The fitted classifier.

    Raises
    ------
    ValueError
        If ``train_set_size`` is not a positive number.
    """
    # Reject zero, negative and NaN fractions up front; they would otherwise
    # yield an empty training frame and fail later with a less helpful error.
    if not train_set_size > 0:
        raise ValueError(f"train_set_size must be positive, got {train_set_size!r}")

    if train_set_size<1.0:
        train_data_sampled = train_data.sample(frac=train_set_size, random_state=random_state).reset_index(drop=True)
    else:
        # No copy needed: drop() below returns a new frame and nothing in this
        # function writes to train_data.
        train_data_sampled = train_data
    X_train_sampled = train_data_sampled.drop(['label'], axis=1)
    y_train_sampled = train_data_sampled.label

    # Only forward a seed when the caller supplied one, so default calls build
    # the classifier with exactly the same arguments as before.
    seed_kwargs = {} if random_state is None else {"random_state": random_state}

    # NOTE(review): use_label_encoder is deprecated in newer XGBoost releases —
    # confirm against the installed version before removing it.
    clf = XGBClassifier(use_label_encoder = False,
                        eval_metric = 'mlogloss',
                        num_class = 10,
                        max_depth = max_depth,
                        colsample_bytree = colsample_bytree,
                        n_estimators = n_estimators,
                        learning_rate = learning_rate,
                        subsample = subsample,
                        reg_lambda = reg_lambda,
                        **seed_kwargs
                       )
    clf.fit(X_train_sampled,y_train_sampled)
    return clf
    

In [None]:
# Sanity run: one small model on the full training set, timed end to end.
baseline_params = dict(max_depth=5,
                       colsample_bytree=1.0,
                       n_estimators=10,
                       learning_rate=0.2,
                       subsample=1.0,
                       reg_lambda=100)

start = time.time()
model = train(train_set_size=1.0, **baseline_params)
end = time.time()
print(f"Time: {end-start:.2f}s")

In [None]:
# Time one prediction pass over the whole validation set with the current `model`.
start = time.time()
y_pred = model.predict(X_valid)
end = time.time()
print(f"Time: {end-start:.2f}s")

# Fraction of validation rows whose predicted label matches the true label.
print(accuracy_score(y_valid, y_pred))

## 1st round

In [None]:
MAX_PARAM_NUM_1 = 80       # number of random candidates drawn in round 1
MAX_PARAM_NUM_1_BEST = 5   # number of top candidates that define the next round's range
np.random.seed(112)

# Collect one dict per candidate and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row re-copies it every step.
candidates_1 = []
for _draw in range(MAX_PARAM_NUM_1):
    # Dict values are evaluated top to bottom, so the RNG is consumed in the
    # same order as before and the seed above still yields the same candidates.
    candidates_1.append({
        "max_depth": np.random.randint(3,21),                  # integer in [3, 20]
        "n_estimators": np.random.randint(20,200),             # integer in [20, 199]
        "learning_rate": np.random.rand()*(0.4-0.01)+0.01,     # float in [0.01, 0.4)
        "colsample_bytree": np.random.rand()*(1.0-0.1)+0.1,    # float in [0.1, 1.0)
        "subsample": np.random.rand()*(1.0-0.1)+0.1,           # float in [0.1, 1.0)
        "reg_lambda": np.random.rand()*100.0,                  # float in [0, 100)
        "valid_acc": np.nan,                                   # unknown until the candidate is trained
    })
parameters_1 = pd.DataFrame(candidates_1).astype({"max_depth": int, "n_estimators": int})

In [None]:
# Round 1: fit every candidate on a random 5% of the training data and score it
# on the validation set.
valid_accs = []
# Iterate over the rows that were actually generated, so this cell stays
# consistent even if the count constant is edited without regenerating the frame.
n_candidates = len(parameters_1)
start = time.time()
for i in range(n_candidates):
    # iloc[i] returns the row as a single Series, so the integer columns may
    # come back as floats; cast them back before handing them to the model.
    params = parameters_1.iloc[i]
    model = train(train_set_size = 0.05,
                  max_depth = int(params["max_depth"]),
                  colsample_bytree = params["colsample_bytree"],
                  n_estimators = int(params["n_estimators"]),
                  learning_rate = params["learning_rate"],
                  subsample = params["subsample"],
                  reg_lambda = params["reg_lambda"])
    y_pred = model.predict(X_valid)
    # Score once and reuse the value for both the results list and the log line.
    valid_acc = accuracy_score(y_valid, y_pred)
    valid_accs.append(valid_acc)
    end = time.time()  # ET below is the cumulative time since the loop started
    print(f"[{i+1}/{n_candidates}] VA: {valid_acc:.4f} | ET: {end-start:.2f}s")
parameters_1["valid_acc"]=valid_accs

In [None]:
parameters_1

In [None]:
# Column-wise extremes over the MAX_PARAM_NUM_1_BEST rows with the highest validation accuracy.
best_1 = parameters_1.sort_values("valid_acc", ascending=False).head(MAX_PARAM_NUM_1_BEST)
min_parameters_1 = best_1.min()
max_parameters_1 = best_1.max()


In [None]:
min_parameters_1

In [None]:
max_parameters_1

## 2nd round

In [None]:
MAX_PARAM_NUM_2 = 20       # number of random candidates drawn in round 2
MAX_PARAM_NUM_2_BEST = 5   # number of top candidates that define the next round's range

# The search range is the span covered by the best round-1 candidates. It is the
# same for every draw, so it is read once here rather than on every iteration.
max_depth_MIN, max_depth_MAX = int(min_parameters_1["max_depth"]), int(max_parameters_1["max_depth"])
n_estimators_MIN, n_estimators_MAX = int(min_parameters_1["n_estimators"]), int(max_parameters_1["n_estimators"])
learning_rate_MIN, learning_rate_MAX = min_parameters_1["learning_rate"], max_parameters_1["learning_rate"]
colsample_bytree_MIN, colsample_bytree_MAX = min_parameters_1["colsample_bytree"], max_parameters_1["colsample_bytree"]
subsample_MIN, subsample_MAX = min_parameters_1["subsample"], max_parameters_1["subsample"]
reg_lambda_MIN, reg_lambda_MAX = min_parameters_1["reg_lambda"], max_parameters_1["reg_lambda"]

# Collect one dict per candidate and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row re-copies it every step.
candidates_2 = []
for _draw in range(MAX_PARAM_NUM_2):
    # Dict values are evaluated top to bottom, so the RNG is consumed in the
    # same order as before (two integer draws, then four float draws).
    candidates_2.append({
        # randint's upper bound is exclusive, hence +1 to keep the maximum reachable
        "max_depth": np.random.randint(max_depth_MIN,max_depth_MAX+1),
        "n_estimators": np.random.randint(n_estimators_MIN,n_estimators_MAX+1),
        "learning_rate": np.random.rand()*(learning_rate_MAX - learning_rate_MIN)+learning_rate_MIN,
        "colsample_bytree": np.random.rand()*(colsample_bytree_MAX - colsample_bytree_MIN)+colsample_bytree_MIN,
        "subsample": np.random.rand()*(subsample_MAX - subsample_MIN)+subsample_MIN,
        "reg_lambda": np.random.rand()*(reg_lambda_MAX - reg_lambda_MIN)+reg_lambda_MIN,
        "valid_acc": np.nan,  # unknown until the candidate is trained
    })
parameters_2 = pd.DataFrame(candidates_2).astype({"max_depth": int, "n_estimators": int})

In [None]:
parameters_2

In [None]:
# Round 2: fit every candidate on a random 15% of the training data and score it
# on the validation set.
valid_accs = []
# Iterate over the rows that were actually generated, so this cell stays
# consistent even if the count constant is edited without regenerating the frame.
n_candidates = len(parameters_2)
start = time.time()
for i in range(n_candidates):
    # iloc[i] returns the row as a single Series, so the integer columns may
    # come back as floats; cast them back before handing them to the model.
    params = parameters_2.iloc[i]
    model = train(train_set_size = 0.15,
                  max_depth = int(params["max_depth"]),
                  colsample_bytree = params["colsample_bytree"],
                  n_estimators = int(params["n_estimators"]),
                  learning_rate = params["learning_rate"],
                  subsample = params["subsample"],
                  reg_lambda = params["reg_lambda"])
    y_pred = model.predict(X_valid)
    # Score once and reuse the value for both the results list and the log line.
    valid_acc = accuracy_score(y_valid, y_pred)
    valid_accs.append(valid_acc)
    end = time.time()  # ET below is the cumulative time since the loop started
    print(f"[{i+1}/{n_candidates}] VA: {valid_acc:.4f} | ET: {end-start:.2f}s")
parameters_2["valid_acc"]=valid_accs

In [None]:
# Column-wise extremes over the MAX_PARAM_NUM_2_BEST rows with the highest validation accuracy.
best_2 = parameters_2.sort_values("valid_acc", ascending=False).head(MAX_PARAM_NUM_2_BEST)
min_parameters_2 = best_2.min()
max_parameters_2 = best_2.max()

In [None]:
min_parameters_2

In [None]:
max_parameters_2

## 3rd round

In [None]:
MAX_PARAM_NUM_3 = 16       # number of random candidates drawn in round 3
# NOTE(review): MAX_PARAM_NUM_3_BEST is not referenced anywhere else in the
# visible notebook; kept so existing references, if any, keep working.
MAX_PARAM_NUM_3_BEST = 16

# The search range is the span covered by the best round-2 candidates. It is the
# same for every draw, so it is read once here rather than on every iteration.
max_depth_MIN, max_depth_MAX = int(min_parameters_2["max_depth"]), int(max_parameters_2["max_depth"])
n_estimators_MIN, n_estimators_MAX = int(min_parameters_2["n_estimators"]), int(max_parameters_2["n_estimators"])
learning_rate_MIN, learning_rate_MAX = min_parameters_2["learning_rate"], max_parameters_2["learning_rate"]
colsample_bytree_MIN, colsample_bytree_MAX = min_parameters_2["colsample_bytree"], max_parameters_2["colsample_bytree"]
subsample_MIN, subsample_MAX = min_parameters_2["subsample"], max_parameters_2["subsample"]
reg_lambda_MIN, reg_lambda_MAX = min_parameters_2["reg_lambda"], max_parameters_2["reg_lambda"]

# Collect one dict per candidate and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row re-copies it every step.
candidates_3 = []
for _draw in range(MAX_PARAM_NUM_3):
    # Dict values are evaluated top to bottom, so the RNG is consumed in the
    # same order as before (two integer draws, then four float draws).
    candidates_3.append({
        # randint's upper bound is exclusive, hence +1 to keep the maximum reachable
        "max_depth": np.random.randint(max_depth_MIN,max_depth_MAX+1),
        "n_estimators": np.random.randint(n_estimators_MIN,n_estimators_MAX+1),
        "learning_rate": np.random.rand()*(learning_rate_MAX - learning_rate_MIN)+learning_rate_MIN,
        "colsample_bytree": np.random.rand()*(colsample_bytree_MAX - colsample_bytree_MIN)+colsample_bytree_MIN,
        "subsample": np.random.rand()*(subsample_MAX - subsample_MIN)+subsample_MIN,
        "reg_lambda": np.random.rand()*(reg_lambda_MAX - reg_lambda_MIN)+reg_lambda_MIN,
        "valid_acc": np.nan,  # unknown until the candidate is trained
    })
parameters_3 = pd.DataFrame(candidates_3).astype({"max_depth": int, "n_estimators": int})

In [None]:
parameters_3

In [None]:
# Round 3: fit every candidate on the full training data, keep each fitted
# model, and score it on the validation set.
valid_accs = []
models = []  # models[k] is the classifier fitted with row k of parameters_3
# Iterate over the rows that were actually generated, so this cell stays
# consistent even if the count constant is edited without regenerating the frame.
n_candidates = len(parameters_3)
start = time.time()
for i in range(n_candidates):
    # iloc[i] returns the row as a single Series, so the integer columns may
    # come back as floats; cast them back before handing them to the model.
    params = parameters_3.iloc[i]
    model = train(train_set_size = 1.0,
                  max_depth = int(params["max_depth"]),
                  colsample_bytree = params["colsample_bytree"],
                  n_estimators = int(params["n_estimators"]),
                  learning_rate = params["learning_rate"],
                  subsample = params["subsample"],
                  reg_lambda = params["reg_lambda"])
    models.append(model)
    y_pred = model.predict(X_valid)
    # Score once and reuse the value for both the results list and the log line.
    valid_acc = accuracy_score(y_valid, y_pred)
    valid_accs.append(valid_acc)
    end = time.time()  # ET below is the cumulative time since the loop started
    print(f"[{i+1}/{n_candidates}] VA: {valid_acc:.4f} | ET: {end-start:.2f}s")
parameters_3["valid_acc"]=valid_accs

In [None]:
parameters_3.sort_values("valid_acc",ascending=False)

In [None]:
parameters_3

In [None]:
# Position of the round-3 candidate with the highest validation accuracy; it is
# used to pick the matching entry from `models`.
max_ind = parameters_3["valid_acc"].argmax()

In [None]:
clf = models[max_ind]

In [None]:
y_pred = clf.predict(X_valid)

In [None]:
accuracy_score(y_valid, y_pred)

# Submission

In [None]:
# Load the competition test set that the final predictions are made for.
test_data = pd.read_csv('/kaggle/input/Kannada-MNIST/test.csv')

In [None]:
ids, test_set = test_data.id, test_data.drop(['id'], axis=1)

In [None]:
final_preds = clf.predict(test_set)

In [None]:
# Write the predictions to the submission file, indexed by the test ids and
# with the value column named 'label'.
pd.Series(final_preds, index=ids, name='label').to_csv('/kaggle/working/submission.csv')