In [9]:
import time
import datetime

import numpy as np
import pandas as pd
import optgbm as opt
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

In [10]:
# Read the train and test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Feature frames: the identifier is dropped from both tables and the label
# from the training table. `columns=` is spelled out because the positional
# `axis` argument of DataFrame.drop was deprecated in pandas 1.3 and removed
# in pandas 2.0.
train_features = train_df.drop(columns=['target', 'id'])
test_features = test_df.drop(columns='id')

In [60]:
def do_transform(the_train, the_test):
    """Add an ordinal-encoded companion column for every feature column.

    For each column, one ``OrdinalEncoder`` is fitted on the values of the
    train and test frames combined, so both frames share the same
    value-to-code mapping. The codes are stored in a new column named
    ``<column>_oe``.

    The input frames are not modified: the new columns are added to copies.
    This keeps the call idempotent -- running it again on the same inputs
    does not encode previously added ``_oe`` columns a second time.

    Parameters
    ----------
    the_train : pandas.DataFrame
        Training features.
    the_test : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies of the inputs with the ``_oe`` columns
        appended after the original columns.
    """
    full_df = pd.concat([the_train, the_test])
    train_out = the_train.copy()
    test_out = the_test.copy()

    for column in full_df:
        # No pre-sorting of the combined frame: with its default
        # categories='auto' the encoder derives the sorted unique values of
        # the column on its own, so a sort here only costs time.
        encoder = OrdinalEncoder()
        encoder.fit(full_df[[column]])

        # transform() yields one output column per input column; flatten the
        # single column so it is assigned as a plain 1-D series.
        train_out[column + '_oe'] = np.asarray(
            encoder.transform(the_train[[column]])
        ).ravel()
        test_out[column + '_oe'] = np.asarray(
            encoder.transform(the_test[[column]])
        ).ravel()

    return train_out, test_out

In [61]:
# Encode the train and test features together; do_transform returns the two
# frames carrying the additional `<column>_oe` columns.
# NOTE(review): `test_trainsformed` is a misspelling of "transformed", but the
# prediction cell reads the frame under this exact name -- rename both places
# together or not at all.
train_transformed, test_trainsformed = do_transform(train_features, test_features)

In [62]:
# Print the column index of the encoded training frame to check that the
# `_oe` columns are present next to the original feature columns.
print(train_transformed.columns)

Index(['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       ...
       'feature_65_oe', 'feature_66_oe', 'feature_67_oe', 'feature_68_oe',
       'feature_69_oe', 'feature_70_oe', 'feature_71_oe', 'feature_72_oe',
       'feature_73_oe', 'feature_74_oe'],
      dtype='object', length=150)


In [63]:
# Fixed seed for the estimator so that a Restart & Run All can reproduce the
# fit instead of producing a different model every time.
# NOTE(review): confirm that optgbm also derives the seed of its
# hyperparameter search from `random_state`.
SEED = 42

# Timestamps before and after, plus elapsed minutes, document how long the
# fit takes.
print(datetime.datetime.now())
start = time.time()

X_train = train_transformed
y_train = train_df['target']

opt_obgm = opt.OGBMClassifier(random_state=SEED)
opt_obgm.fit(X_train, y_train)

minutes = (time.time() - start) / 60
print(round(minutes, 2))
print(datetime.datetime.now())

[32m[I 2021-06-28 00:26:19,231][0m A new study created in memory with name: no-name-7919fb4f-a81c-417a-985d-140d9f0848d7[0m
Searching the best hyperparameters...


2021-06-28 00:26:19.179340


[32m[I 2021-06-28 00:26:46,891][0m Trial 0 finished with value: 1.7641450432996912 and parameters: {'feature_fraction': 1.0, 'max_depth': 2, 'num_leaves': 4, 'min_data_in_leaf': 29119, 'lambda_l1': 4.552004749690557, 'lambda_l2': 7.537158135366, 'bagging_fraction': 0.6, 'bagging_freq': 5}. Best is trial 0 with value: 1.7641450432996912.[0m
[32m[I 2021-06-28 00:27:15,758][0m Trial 1 finished with value: 1.762740707782892 and parameters: {'feature_fraction': 0.9, 'max_depth': 2, 'num_leaves': 2, 'min_data_in_leaf': 17713, 'lambda_l1': 1.2973785008455102e-09, 'lambda_l2': 1.2766094858326051e-09, 'bagging_fraction': 0.65, 'bagging_freq': 2}. Best is trial 1 with value: 1.762740707782892.[0m
[32m[I 2021-06-28 00:29:19,213][0m Trial 2 finished with value: 1.7525920412970166 and parameters: {'feature_fraction': 0.4, 'max_depth': 3, 'num_leaves': 5, 'min_data_in_leaf': 21334, 'lambda_l1': 2.5286388393795987e-08, 'lambda_l2': 0.005579006248891394, 'bagging_fraction': 0.8500000000000001,

[32m[I 2021-06-28 01:00:02,204][0m Trial 23 finished with value: 1.7478866715230343 and parameters: {'feature_fraction': 0.2, 'max_depth': 6, 'num_leaves': 28, 'min_data_in_leaf': 5058, 'lambda_l1': 0.6967989953783513, 'lambda_l2': 9.076649373823146, 'bagging_fraction': 0.8500000000000001, 'bagging_freq': 4}. Best is trial 12 with value: 1.7470181328076921.[0m
[32m[I 2021-06-28 01:01:46,216][0m Trial 24 finished with value: 1.749244023882899 and parameters: {'feature_fraction': 0.30000000000000004, 'max_depth': 7, 'num_leaves': 83, 'min_data_in_leaf': 1590, 'lambda_l1': 1.254287062472376, 'lambda_l2': 0.0009733617816526471, 'bagging_fraction': 0.7, 'bagging_freq': 5}. Best is trial 12 with value: 1.7470181328076921.[0m
[32m[I 2021-06-28 01:03:31,494][0m Trial 25 finished with value: 1.7478021913537816 and parameters: {'feature_fraction': 0.15000000000000002, 'max_depth': 6, 'num_leaves': 31, 'min_data_in_leaf': 3546, 'lambda_l1': 3.492587478316759e-05, 'lambda_l2': 0.0477909259

60.18
2021-06-28 01:26:29.866183


In [64]:
# Predict class probabilities for the encoded test frame. The two timestamps
# bracket the call so its duration can be read from the printed output.
print(datetime.datetime.now())
X_test = test_trainsformed  # variable name is misspelled where it is assigned; it must match that spelling
test_preds = opt_obgm.predict_proba(X_test)
print(datetime.datetime.now())

2021-06-28 01:42:38.277616
2021-06-28 01:42:39.636736


In [65]:
# Wrap the predicted probabilities in a frame with one column per class
# (Class_1 ... Class_9, in the column order of `test_preds`), append the test
# ids as the last column and write the result without the row index.
submission = pd.DataFrame(test_preds)
submission.columns = ['Class_{}'.format(label) for label in range(1, 10)]
submission = submission.assign(id=test_df['id'])

submission.to_csv("submission_oe_opt_obgm.csv", index=False)