In [1]:
!pip install optgbm



In [2]:
import time
import datetime

import numpy as np
import pandas as pd
import optgbm as opt

from sklearn import preprocessing
from sklearn.metrics import log_loss

import gc
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import initializers

from keras.models import Model

In [3]:
# Read the train and test tables from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Every training column except the row identifier; used later as the
# de-duplication key.
non_id_columns = train_df.columns[~train_df.columns.isin(['id'])]

In [4]:
from sklearn.preprocessing import LabelEncoder
features_target = ['target']

# Replace each listed column with integer codes from a fresh LabelEncoder.
for feature in features_target:
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature])

print(train_df['target'].head())

0    5
1    5
2    1
3    7
4    1
Name: target, dtype: int64


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Extra feature arrays stored as .npy files next to the notebook.
train_knn = np.load("add_feat_train.npy")
test_knn = np.load("add_feat_test.npy")

# Fit the scaling on the training array only, then apply that same scaling to
# both arrays so the test features are mapped with the training min/max.
scaler = MinMaxScaler().fit(train_knn)
train_knn = scaler.transform(train_knn)
test_knn = scaler.transform(test_knn)

In [6]:
# Column labels knn_1 .. knn_9 for the scaled extra-feature arrays.
knn_columns = [f'knn_{k}' for k in range(1, 10)]

# Append the extra features column-wise to both frames.
train_df = pd.concat([train_df, pd.DataFrame(train_knn, columns=knn_columns)], axis=1)
test_df = pd.concat([test_df, pd.DataFrame(test_knn, columns=knn_columns)], axis=1)

In [7]:
# Row count before de-duplication.
print(train_df.shape[0])

# Keep the first of any rows that match on every column in non_id_columns.
train_df = train_df.drop_duplicates(subset=non_id_columns, keep='first')

# Row count after de-duplication.
print(train_df.shape[0])

200000
199894


In [8]:
def do_run(i, train_df, test_df):
    """Fit one OGBMClassifier on the training frame and score the test frame.

    Prints the run label, wall-clock timestamps and the fit duration in
    minutes as progress output.

    Parameters
    ----------
    i
        Run label; only used in the "starting run" progress message.
    train_df : pandas.DataFrame
        Training data containing an 'id' column, a 'target' column and the
        feature columns.
    test_df : pandas.DataFrame
        Test data containing an 'id' column and the feature columns.

    Returns
    -------
    The result of ``predict_proba`` on the test features (presumably an
    array of shape (n_test_rows, n_classes) -- confirm against OptGBM docs).
    """
    print('starting run', i)
    print(datetime.datetime.now())
    start = time.time()

    # Use the `columns=` keyword: passing the axis positionally
    # (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises a
    # TypeError in pandas 2.x.
    X_train = train_df.drop(columns=['id', 'target'])
    y_train = train_df['target']

    opt_obgm = opt.OGBMClassifier()
    opt_obgm.fit(X_train, y_train)

    # Elapsed fit time, reported in minutes.
    minutes = (time.time() - start) / 60
    print(round(minutes, 2))
    print(datetime.datetime.now())

    X_test = test_df.drop(columns='id')
    test_preds = opt_obgm.predict_proba(X_test)
    return test_preds

In [9]:
n_runs = 5

# Sum the per-run predictions in run order, then divide by the number of runs
# to obtain the averaged ensemble prediction.
test_result = None
for run_id in range(1, n_runs + 1):
    run_preds = do_run(run_id, train_df, test_df)
    test_result = run_preds if test_result is None else test_result + run_preds

test_result = test_result / n_runs

[32m[I 2021-06-30 02:54:32,739][0m A new study created in memory with name: no-name-cee4b835-80bc-4fc4-ae45-f2e04e386798[0m
Searching the best hyperparameters...


starting run 1
2021-06-30 02:54:32.695141


[32m[I 2021-06-30 02:55:38,338][0m Trial 0 finished with value: 1.7513021276673264 and parameters: {'feature_fraction': 0.4, 'max_depth': 3, 'num_leaves': 8, 'min_data_in_leaf': 12904, 'lambda_l1': 0.07020973712676813, 'lambda_l2': 0.012657687970392292, 'bagging_fraction': 0.55, 'bagging_freq': 3}. Best is trial 0 with value: 1.7513021276673264.[0m
[32m[I 2021-06-30 02:56:40,811][0m Trial 1 finished with value: 1.7518556227177657 and parameters: {'feature_fraction': 0.8, 'max_depth': 7, 'num_leaves': 122, 'min_data_in_leaf': 803, 'lambda_l1': 7.032334200360548e-07, 'lambda_l2': 0.0008873230778094086, 'bagging_fraction': 0.8, 'bagging_freq': 9}. Best is trial 0 with value: 1.7513021276673264.[0m
[32m[I 2021-06-30 02:58:18,159][0m Trial 2 finished with value: 1.7564587851661713 and parameters: {'feature_fraction': 0.35, 'max_depth': 2, 'num_leaves': 3, 'min_data_in_leaf': 29085, 'lambda_l1': 0.008314788968686737, 'lambda_l2': 6.387930101696766e-07, 'bagging_fraction': 0.9, 'baggi

[32m[I 2021-06-30 03:23:14,470][0m Trial 23 finished with value: 1.748074106455297 and parameters: {'feature_fraction': 0.30000000000000004, 'max_depth': 5, 'num_leaves': 19, 'min_data_in_leaf': 6502, 'lambda_l1': 0.007083089616871473, 'lambda_l2': 4.306918746061075e-09, 'bagging_fraction': 0.7, 'bagging_freq': 5}. Best is trial 11 with value: 1.747840315589913.[0m
[32m[I 2021-06-30 03:24:25,668][0m Trial 24 finished with value: 1.7490699849303168 and parameters: {'feature_fraction': 0.15000000000000002, 'max_depth': 7, 'num_leaves': 78, 'min_data_in_leaf': 1205, 'lambda_l1': 3.601572665461864e-06, 'lambda_l2': 4.4075044229816723e-07, 'bagging_fraction': 0.6, 'bagging_freq': 6}. Best is trial 11 with value: 1.747840315589913.[0m
[32m[I 2021-06-30 03:26:07,225][0m Trial 25 finished with value: 1.748233307382487 and parameters: {'feature_fraction': 0.35, 'max_depth': 6, 'num_leaves': 44, 'min_data_in_leaf': 3451, 'lambda_l1': 0.0001676762975841428, 'lambda_l2': 9.35372711495996e-

52.4
2021-06-30 03:46:56.919648


[32m[I 2021-06-30 03:46:59,250][0m A new study created in memory with name: no-name-a8be48aa-8a7d-44d5-a495-0f6ac2f168c7[0m
Searching the best hyperparameters...


starting run 2
2021-06-30 03:46:59.197295


[32m[I 2021-06-30 03:48:24,065][0m Trial 0 finished with value: 1.7523459416299865 and parameters: {'feature_fraction': 0.5, 'max_depth': 7, 'num_leaves': 111, 'min_data_in_leaf': 286, 'lambda_l1': 0.015089286325753086, 'lambda_l2': 7.5388539644814665e-06, 'bagging_fraction': 0.9, 'bagging_freq': 1}. Best is trial 0 with value: 1.7523459416299865.[0m
[32m[I 2021-06-30 03:48:50,394][0m Trial 1 finished with value: 1.7629631508999182 and parameters: {'feature_fraction': 1.0, 'max_depth': 4, 'num_leaves': 2, 'min_data_in_leaf': 21000, 'lambda_l1': 0.5236096395387936, 'lambda_l2': 0.005960513909612552, 'bagging_fraction': 0.65, 'bagging_freq': 7}. Best is trial 0 with value: 1.7523459416299865.[0m
[32m[I 2021-06-30 03:48:51,885][0m Trial 2 finished with value: 2.075293967415169 and parameters: {'feature_fraction': 0.35, 'max_depth': 2, 'num_leaves': 2, 'min_data_in_leaf': 98070, 'lambda_l1': 4.758476909725089e-06, 'lambda_l2': 0.012494621187485119, 'bagging_fraction': 0.65, 'baggin

[32m[I 2021-06-30 04:17:19,825][0m Trial 23 finished with value: 1.7497910651602595 and parameters: {'feature_fraction': 0.45000000000000007, 'max_depth': 3, 'num_leaves': 6, 'min_data_in_leaf': 12909, 'lambda_l1': 0.0001545449719172644, 'lambda_l2': 0.0016491339656742319, 'bagging_fraction': 0.9, 'bagging_freq': 4}. Best is trial 22 with value: 1.7483747421485365.[0m
[32m[I 2021-06-30 04:19:35,747][0m Trial 24 finished with value: 1.7484625346845941 and parameters: {'feature_fraction': 0.55, 'max_depth': 5, 'num_leaves': 32, 'min_data_in_leaf': 6245, 'lambda_l1': 0.13184116838068607, 'lambda_l2': 0.05195185128298043, 'bagging_fraction': 0.9, 'bagging_freq': 4}. Best is trial 22 with value: 1.7483747421485365.[0m
[32m[I 2021-06-30 04:21:19,554][0m Trial 25 finished with value: 1.749338248461355 and parameters: {'feature_fraction': 0.30000000000000004, 'max_depth': 3, 'num_leaves': 7, 'min_data_in_leaf': 11014, 'lambda_l1': 0.0011520317023110084, 'lambda_l2': 0.14991374645215133

59.59
2021-06-30 04:46:34.367831


[32m[I 2021-06-30 04:46:36,089][0m A new study created in memory with name: no-name-caa5f73f-c02b-482e-8c56-6687d232e318[0m
Searching the best hyperparameters...


starting run 3
2021-06-30 04:46:36.039749


[32m[I 2021-06-30 04:47:58,919][0m Trial 0 finished with value: 1.7483565778529306 and parameters: {'feature_fraction': 0.25, 'max_depth': 6, 'num_leaves': 52, 'min_data_in_leaf': 2781, 'lambda_l1': 6.051859505479103e-07, 'lambda_l2': 6.312113694775676e-08, 'bagging_fraction': 0.55, 'bagging_freq': 3}. Best is trial 0 with value: 1.7483565778529306.[0m
[32m[I 2021-06-30 04:48:39,396][0m Trial 1 finished with value: 1.811762130043709 and parameters: {'feature_fraction': 0.1, 'max_depth': 1, 'num_leaves': 2, 'min_data_in_leaf': 53259, 'lambda_l1': 1.5620776061109168e-07, 'lambda_l2': 1.1566260451565558, 'bagging_fraction': 0.75, 'bagging_freq': 2}. Best is trial 0 with value: 1.7483565778529306.[0m
[32m[I 2021-06-30 04:50:06,807][0m Trial 2 finished with value: 1.747762003375446 and parameters: {'feature_fraction': 0.15000000000000002, 'max_depth': 5, 'num_leaves': 24, 'min_data_in_leaf': 7703, 'lambda_l1': 0.014972339497572172, 'lambda_l2': 1.419508504767615, 'bagging_fraction':

[32m[I 2021-06-30 05:18:00,266][0m Trial 23 finished with value: 1.7471538478559157 and parameters: {'feature_fraction': 0.15000000000000002, 'max_depth': 5, 'num_leaves': 32, 'min_data_in_leaf': 1482, 'lambda_l1': 4.585150508548567, 'lambda_l2': 0.0019541971974646, 'bagging_fraction': 0.95, 'bagging_freq': 2}. Best is trial 23 with value: 1.7471538478559157.[0m
[32m[I 2021-06-30 05:19:13,413][0m Trial 24 finished with value: 1.7505473352506482 and parameters: {'feature_fraction': 0.1, 'max_depth': 4, 'num_leaves': 16, 'min_data_in_leaf': 8500, 'lambda_l1': 1.6367080197258856, 'lambda_l2': 0.14658358393070334, 'bagging_fraction': 0.95, 'bagging_freq': 2}. Best is trial 23 with value: 1.7471538478559157.[0m
[32m[I 2021-06-30 05:21:14,718][0m Trial 25 finished with value: 1.7484307478545436 and parameters: {'feature_fraction': 0.35, 'max_depth': 5, 'num_leaves': 29, 'min_data_in_leaf': 1923, 'lambda_l1': 0.07326215608917855, 'lambda_l2': 0.003747194278282382, 'bagging_fraction': 

54.22
2021-06-30 05:40:49.339245


[32m[I 2021-06-30 05:40:51,303][0m A new study created in memory with name: no-name-74655212-67f4-4acf-8363-d1d71188fe39[0m
Searching the best hyperparameters...


starting run 4
2021-06-30 05:40:51.251574


[32m[I 2021-06-30 05:40:53,204][0m Trial 0 finished with value: 2.075293967415169 and parameters: {'feature_fraction': 0.9, 'max_depth': 1, 'num_leaves': 2, 'min_data_in_leaf': 88587, 'lambda_l1': 2.0486561277391055e-09, 'lambda_l2': 9.097709410448779e-06, 'bagging_fraction': 0.55, 'bagging_freq': 4}. Best is trial 0 with value: 2.075293967415169.[0m
[32m[I 2021-06-30 05:42:15,209][0m Trial 1 finished with value: 1.7495362678018627 and parameters: {'feature_fraction': 0.6, 'max_depth': 7, 'num_leaves': 57, 'min_data_in_leaf': 2987, 'lambda_l1': 0.005995587284258459, 'lambda_l2': 6.551894293288643, 'bagging_fraction': 0.65, 'bagging_freq': 10}. Best is trial 1 with value: 1.7495362678018627.[0m
[32m[I 2021-06-30 05:43:15,740][0m Trial 2 finished with value: 1.748926649968816 and parameters: {'feature_fraction': 0.9, 'max_depth': 4, 'num_leaves': 9, 'min_data_in_leaf': 2731, 'lambda_l1': 3.6170962888300247e-07, 'lambda_l2': 9.043378110036641e-07, 'bagging_fraction': 0.8, 'bagging

[32m[I 2021-06-30 06:10:22,262][0m Trial 23 finished with value: 1.7477764040170514 and parameters: {'feature_fraction': 0.2, 'max_depth': 7, 'num_leaves': 38, 'min_data_in_leaf': 4401, 'lambda_l1': 5.0567904817796885e-05, 'lambda_l2': 4.766586028691597e-08, 'bagging_fraction': 0.9, 'bagging_freq': 6}. Best is trial 17 with value: 1.7476247123438928.[0m
[32m[I 2021-06-30 06:12:15,156][0m Trial 24 finished with value: 1.7474638474057897 and parameters: {'feature_fraction': 0.15000000000000002, 'max_depth': 7, 'num_leaves': 34, 'min_data_in_leaf': 5144, 'lambda_l1': 0.0006955098226612623, 'lambda_l2': 5.576028803148887e-08, 'bagging_fraction': 0.9, 'bagging_freq': 5}. Best is trial 24 with value: 1.7474638474057897.[0m
[32m[I 2021-06-30 06:13:39,161][0m Trial 25 finished with value: 1.750339377558013 and parameters: {'feature_fraction': 0.1, 'max_depth': 7, 'num_leaves': 69, 'min_data_in_leaf': 2737, 'lambda_l1': 0.0012792884812850946, 'lambda_l2': 4.153054474532345e-08, 'bagging

53.24
2021-06-30 06:34:05.504941


[32m[I 2021-06-30 06:34:07,678][0m A new study created in memory with name: no-name-9ba7c46e-3132-4b92-b03b-d4861c4a0078[0m
Searching the best hyperparameters...


starting run 5
2021-06-30 06:34:07.609394


[32m[I 2021-06-30 06:35:18,334][0m Trial 0 finished with value: 1.7491329643644096 and parameters: {'feature_fraction': 0.9, 'max_depth': 7, 'num_leaves': 39, 'min_data_in_leaf': 5107, 'lambda_l1': 0.14780729071539553, 'lambda_l2': 2.9307285864920956, 'bagging_fraction': 0.7, 'bagging_freq': 7}. Best is trial 0 with value: 1.7491329643644096.[0m
[32m[I 2021-06-30 06:36:34,840][0m Trial 1 finished with value: 1.7567538747447144 and parameters: {'feature_fraction': 0.25, 'max_depth': 3, 'num_leaves': 5, 'min_data_in_leaf': 28187, 'lambda_l1': 1.6525974317150166e-08, 'lambda_l2': 5.453302061476086e-05, 'bagging_fraction': 0.8, 'bagging_freq': 6}. Best is trial 0 with value: 1.7491329643644096.[0m
[32m[I 2021-06-30 06:37:05,894][0m Trial 2 finished with value: 1.7628293376093196 and parameters: {'feature_fraction': 0.9, 'max_depth': 3, 'num_leaves': 2, 'min_data_in_leaf': 23948, 'lambda_l1': 2.4424278966580415, 'lambda_l2': 9.411997737278678e-06, 'bagging_fraction': 0.8, 'bagging_f

[32m[I 2021-06-30 07:01:31,410][0m Trial 23 finished with value: 1.7487649981434696 and parameters: {'feature_fraction': 0.8, 'max_depth': 5, 'num_leaves': 21, 'min_data_in_leaf': 9316, 'lambda_l1': 9.854054867555123e-08, 'lambda_l2': 0.0014218460849587844, 'bagging_fraction': 0.9, 'bagging_freq': 9}. Best is trial 11 with value: 1.748572892194921.[0m
[32m[I 2021-06-30 07:02:51,865][0m Trial 24 finished with value: 1.7486618880761235 and parameters: {'feature_fraction': 0.85, 'max_depth': 6, 'num_leaves': 34, 'min_data_in_leaf': 5867, 'lambda_l1': 3.8013671481870947e-06, 'lambda_l2': 0.023308185144919363, 'bagging_fraction': 0.95, 'bagging_freq': 8}. Best is trial 11 with value: 1.748572892194921.[0m
[32m[I 2021-06-30 07:04:08,606][0m Trial 25 finished with value: 1.7490759185915863 and parameters: {'feature_fraction': 1.0, 'max_depth': 6, 'num_leaves': 34, 'min_data_in_leaf': 5765, 'lambda_l1': 4.621686937753445e-06, 'lambda_l2': 0.02423231884550369, 'bagging_fraction': 0.95, 

48.94
2021-06-30 07:23:04.062426


In [11]:
# One column per class, Class_1 .. Class_9, holding the averaged predictions.
class_columns = [f'Class_{k}' for k in range(1, 10)]
submission = pd.DataFrame(test_result, columns=class_columns)

# Attach the test row identifiers (added as the last column).
submission['id'] = test_df['id']

submission.to_csv("submission_opt_obgm_knn_ensemble.csv", index=False)