In [7]:
# Libraries
import numpy as np
import pandas as pd
import pickle

from lightgbm import LGBMClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

import sys
import warnings

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Data
train = pd.read_csv('./input/train.csv')
test  = pd.read_csv('./input/test.csv')

target   = train.columns[0]
features = train.columns[1:]

In [4]:
# Preprocessing
train[features] = train[features].astype('float64')
test[features]  = test[features].astype('float64')

train[features] = train[features]/255
test[features]  = test[features]/255

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: float64(784), int64(1)
memory usage: 251.5 MB


In [21]:
def objective(trial):
    hyper_params = {
        'n_estimators' : 300,
        'num_leaves': trial.suggest_int('num_leaves', 31, 127),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 100),
        'subsample': trial.suggest_float('subsample', 0, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1.0),
        'reg_alpha' : trial.suggest_float('reg_alpha', 1E-12, 20, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1E-12, 20, log=True),
        'max_depth' : trial.suggest_int('max_depth', 5, 30),
        'max_bin'   : trial.suggest_int('max_bin', 6, 127),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 10, 100),
        'verbosity':-1,
    }

    # Evaluation
    scores = []
    kf = KFold(n_splits=5)
    for i, (train_idx, test_idx) in enumerate(kf.split(train)):

        X_train = train.iloc[train_idx][features]
        y_train = train.iloc[train_idx][target]
        X_test  = train.iloc[test_idx][features]
        y_test  = train.iloc[test_idx][target]        

        
        estimator = LGBMClassifier(**hyper_params)

        estimator.fit(X_train, 
                      y_train, 
                      eval_set=(X_test, y_test), 
                      eval_metric='multi_logloss',
                      verbose=1000)

        y_pred = estimator.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        scores.append(acc)

    return np.mean(scores)

In [22]:
# Optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=3600*0.3)

In [23]:
# Best score
study.best_value

0.9778333333333332

In [24]:
len(study.trials)

4

In [25]:
# Historic
plot_optimization_history(study)

In [26]:
# Importance
plot_param_importances(study)

In [27]:
study.best_params

{'num_leaves': 57,
 'min_child_samples': 57,
 'subsample_freq': 25,
 'subsample': 0.9446405903470416,
 'colsample_bytree': 0.8754116728171276,
 'reg_alpha': 7.178647452970982e-08,
 'reg_lambda': 3.883586850962717e-07,
 'max_depth': 18,
 'max_bin': 127,
 'min_data_per_group': 29}

In [28]:
# Best params
best_params = study.best_params
best_params

{'num_leaves': 57,
 'min_child_samples': 57,
 'subsample_freq': 25,
 'subsample': 0.9446405903470416,
 'colsample_bytree': 0.8754116728171276,
 'reg_alpha': 7.178647452970982e-08,
 'reg_lambda': 3.883586850962717e-07,
 'max_depth': 18,
 'max_bin': 127,
 'min_data_per_group': 29}

In [29]:
with open("study_no_step.p", "wb") as f:
    pickle.dump(study, f)

-----------

# Hard Voting from OOF

In [34]:
# Evaluation
k = 5
test[target] = 0

scores = []

#kf = KFold(k)
kf = KFold(n_splits=k, random_state=56, shuffle=True)
for i, (train_idx, test_idx) in enumerate(kf.split(train)):
    
    print("="*50)
    print("\t{0} KFold".format(i+1))
    print("="*50)
    
    X_train = train.iloc[train_idx][features]
    y_train = train.iloc[train_idx][target]
    X_test  = train.iloc[test_idx][features]
    y_test  = train.iloc[test_idx][target]    

    
    best_params['learning_rate'] = 0.005
    best_params['n_estimators'] = 300
    
    estimator = LGBMClassifier(**best_params)

    estimator.fit(X_train, 
                  y_train, 
                  eval_set=(X_test, y_test), 
                  eval_metric='multi_logloss',
                  early_stopping_rounds=100,
                  verbose=100)

    y_pred = estimator.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    scores.append(acc)

    test[target] += estimator.predict(test[features]) / k

test[target].to_csv('intermediate_results.csv')

	1 KFold
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.938944
[200]	valid_0's multi_logloss: 0.523885
[300]	valid_0's multi_logloss: 0.333357
Did not meet early stopping. Best iteration is:
[300]	valid_0's multi_logloss: 0.333357
	2 KFold
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.938282
[200]	valid_0's multi_logloss: 0.523507
[300]	valid_0's multi_logloss: 0.33498
Did not meet early stopping. Best iteration is:
[300]	valid_0's multi_logloss: 0.33498
	3 KFold
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.943216
[200]	valid_0's multi_logloss: 0.52889
[300]	valid_0's multi_logloss: 0.339564
Did not meet early stopping. Best iteration is:
[300]	valid_0's multi_logloss: 0.339564
	4 KFold
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.932736
[200]	valid_0's multi_logloss: 0.518869
[300]	valid_0's

In [35]:
print(f"Expected score: {np.mean(scores)}")

Expected score: 0.9499047619047619


In [36]:
print(f"Expected score: {np.std(scores)}")

Expected score: 0.002236574964917573


-----

# Soft Voting from OOF

In [None]:
# Evaluation
k = 10
test[target] = 0

scores = []

#kf = KFold(k)
kf = KFold(n_splits=k, random_state=56, shuffle=True)
for i, (train_idx, test_idx) in enumerate(kf.split(train)):
    
    print("="*50)
    print("\t{0} KFold".format(i+1))
    print("="*50)
    
    
    X_train = train.iloc[train_idx][features]
    y_train = train.iloc[train_idx][target]
    X_test  = train.iloc[test_idx][features]
    y_test  = train.iloc[test_idx][target]    

    
    best_params['learning_rate'] = 0.005
    best_params['n_estimators'] = 10000
    
    estimator = LGBMClassifier(**best_params)

    estimator.fit(X_train, 
                  y_train, 
                  eval_set=(X_test, y_test), 
                  eval_metric='multi_logloss',
                  early_stopping_rounds=1000,
                  verbose=1000)

    y_pred = estimator.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    scores.append(acc)

    test[target] += estimator.predict_proba(test[features]) / k

test[target].to_csv('intermediate_results.csv')

In [None]:
print(f"Expected score: {np.mean(scores)}")

In [None]:
print(f"Expected score: {np.std(scores)}")