In [1]:
import time
import datetime

import numpy as np
import pandas as pd
import optgbm as opt
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
from sklearn.preprocessing import LabelEncoder
features_target = ['target'] 

for feature in features_target:
    le = LabelEncoder()
    le.fit(train_df[feature])
    train_df[feature] = le.transform(train_df[feature])

In [4]:
print(len(train_df))
non_id_columns = train_df.columns[train_df.columns != 'id']
train_df = train_df.drop_duplicates(subset=non_id_columns)
print(len(train_df))

200000
199894


In [5]:
target = train_df['target']
train_features = train_df.drop(['target', 'id'], 1)

train_percent = 0.50

X_train, X_test, y_train, y_test = train_test_split(train_features, target, stratify=target, train_size=train_percent)
print('train count: ', len(y_train))
print('test count: ', len(y_test))

train count:  99947
test count:  99947


In [6]:
def do_transform(the_train, the_test, scaler):
    full_df = pd.concat([the_train, the_test])
    scaler.fit(full_df)
    return scaler.transform(the_train), scaler.transform(the_test)

In [7]:
def do_train(X_train, y_train):
    print(datetime.datetime.now())
    start = time.time()

    lgbm = opt.LGBMClassifier()
    lgbm.fit(X_train, y_train)

    minutes = (time.time() - start) / 60
    print(round(minutes, 2))
    print(datetime.datetime.now())
    
    return lgbm

In [8]:
def do_test(X_test, y_test, model):
    test_preds = model.predict_proba(X_test)
    print(abs(log_loss(y_test, test_preds)))
    return test_preds

In [9]:
def do_all(X_test, y_test, X_train, y_train, scaler):
    X_train_trainsformed, X_test_trainsformed = do_transform(X_train, X_test, s_scaler)
    model = do_train(X_train_trainsformed, y_train)
    preds = do_test(X_test_trainsformed, y_test, model)
    return model, preds

In [10]:
# sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import StandardScaler
s_scaler = StandardScaler()
s_model, s_preds = do_all(X_test, y_test, X_train, y_train, s_scaler)

[32m[I 2021-06-29 00:21:10,299][0m A new study created in memory with name: no-name-e3d58dbe-cb50-4e35-9c9a-9319a571732b[0m
Searching the best hyperparameters...


2021-06-29 00:21:10.288211


[32m[I 2021-06-29 00:21:22,577][0m Trial 0 finished with value: 1.7639335682722972 and parameters: {'feature_fraction': 0.85, 'max_depth': 1, 'num_leaves': 2, 'min_data_in_leaf': 4796, 'lambda_l1': 1.9481674052467327e-05, 'lambda_l2': 7.65295001973986, 'bagging_fraction': 0.8, 'bagging_freq': 5}. Best is trial 0 with value: 1.7639335682722972.[0m
[32m[I 2021-06-29 00:21:48,961][0m Trial 1 finished with value: 1.7652741379161783 and parameters: {'feature_fraction': 0.30000000000000004, 'max_depth': 1, 'num_leaves': 2, 'min_data_in_leaf': 14370, 'lambda_l1': 3.6644174583805586e-05, 'lambda_l2': 0.0008907876552715858, 'bagging_fraction': 0.7, 'bagging_freq': 3}. Best is trial 0 with value: 1.7639335682722972.[0m
[32m[I 2021-06-29 00:22:13,471][0m Trial 2 finished with value: 1.752439045085882 and parameters: {'feature_fraction': 0.8, 'max_depth': 3, 'num_leaves': 8, 'min_data_in_leaf': 7111, 'lambda_l1': 0.000627995954613307, 'lambda_l2': 0.0011199422048567302, 'bagging_fraction':

[32m[I 2021-06-29 00:33:06,303][0m Trial 23 finished with value: 1.7520322988138168 and parameters: {'feature_fraction': 0.2, 'max_depth': 3, 'num_leaves': 6, 'min_data_in_leaf': 5749, 'lambda_l1': 9.90604140919904, 'lambda_l2': 4.1394314427728146e-07, 'bagging_fraction': 0.8500000000000001, 'bagging_freq': 4}. Best is trial 11 with value: 1.7507498908408583.[0m
[32m[I 2021-06-29 00:33:48,527][0m Trial 24 finished with value: 1.751330497253822 and parameters: {'feature_fraction': 0.35, 'max_depth': 5, 'num_leaves': 20, 'min_data_in_leaf': 2819, 'lambda_l1': 0.41264668082108086, 'lambda_l2': 6.337239113898889e-09, 'bagging_fraction': 0.65, 'bagging_freq': 6}. Best is trial 11 with value: 1.7507498908408583.[0m
[32m[I 2021-06-29 00:34:32,346][0m Trial 25 finished with value: 1.7516870559079578 and parameters: {'feature_fraction': 0.4, 'max_depth': 4, 'num_leaves': 11, 'min_data_in_leaf': 4196, 'lambda_l1': 0.00022068831060077996, 'lambda_l2': 1.8659210983618104e-05, 'bagging_frac

23.02
2021-06-29 00:44:11.736007
1.7493061424496734


In [11]:
# sklearn.preprocessing.RobustScaler
from sklearn.preprocessing import RobustScaler
r_scaler = RobustScaler()
r_model, r_preds = do_all(X_test, y_test, X_train, y_train, r_scaler)

[32m[I 2021-06-29 00:44:13,668][0m A new study created in memory with name: no-name-b028abd8-3cbd-4951-89bf-1ff8ece2572c[0m
Searching the best hyperparameters...


2021-06-29 00:44:13.654486


[32m[I 2021-06-29 00:44:38,999][0m Trial 0 finished with value: 1.752735842442296 and parameters: {'feature_fraction': 0.75, 'max_depth': 6, 'num_leaves': 31, 'min_data_in_leaf': 1610, 'lambda_l1': 8.464653842780308e-07, 'lambda_l2': 0.0628773336081306, 'bagging_fraction': 0.6, 'bagging_freq': 3}. Best is trial 0 with value: 1.752735842442296.[0m
[32m[I 2021-06-29 00:45:15,162][0m Trial 1 finished with value: 1.7539922180052199 and parameters: {'feature_fraction': 0.25, 'max_depth': 2, 'num_leaves': 4, 'min_data_in_leaf': 3695, 'lambda_l1': 2.5515168025967488e-09, 'lambda_l2': 1.0136355000393835, 'bagging_fraction': 0.65, 'bagging_freq': 1}. Best is trial 0 with value: 1.752735842442296.[0m
[32m[I 2021-06-29 00:45:59,848][0m Trial 2 finished with value: 1.7516990898285745 and parameters: {'feature_fraction': 0.35, 'max_depth': 3, 'num_leaves': 7, 'min_data_in_leaf': 2961, 'lambda_l1': 0.0007070863500725007, 'lambda_l2': 1.7862936917343375e-09, 'bagging_fraction': 0.9, 'bagging_

[32m[I 2021-06-29 01:00:06,517][0m Trial 23 finished with value: 1.7510551989981535 and parameters: {'feature_fraction': 0.30000000000000004, 'max_depth': 6, 'num_leaves': 35, 'min_data_in_leaf': 2067, 'lambda_l1': 9.11232766464077e-09, 'lambda_l2': 1.3700344424916127e-08, 'bagging_fraction': 0.9, 'bagging_freq': 8}. Best is trial 15 with value: 1.750697125096536.[0m
[32m[I 2021-06-29 01:01:01,891][0m Trial 24 finished with value: 1.7515635447365416 and parameters: {'feature_fraction': 0.45000000000000007, 'max_depth': 4, 'num_leaves': 16, 'min_data_in_leaf': 5491, 'lambda_l1': 1.78433362770519e-07, 'lambda_l2': 3.305125899525349e-07, 'bagging_fraction': 0.95, 'bagging_freq': 9}. Best is trial 15 with value: 1.750697125096536.[0m
[32m[I 2021-06-29 01:01:36,130][0m Trial 25 finished with value: 1.7513560534881438 and parameters: {'feature_fraction': 0.15000000000000002, 'max_depth': 5, 'num_leaves': 20, 'min_data_in_leaf': 4974, 'lambda_l1': 2.553015994579268e-08, 'lambda_l2': 0

26.74
2021-06-29 01:10:58.097073
1.7489021090251153


In [12]:
# sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler()
mm_model, mm_preds = do_all(X_test, y_test, X_train, y_train, mm_scaler)

[32m[I 2021-06-29 01:11:00,202][0m A new study created in memory with name: no-name-9f19a523-e77c-4450-bdd9-db4e8d53cc34[0m
Searching the best hyperparameters...


2021-06-29 01:11:00.188573


[32m[I 2021-06-29 01:11:20,634][0m Trial 0 finished with value: 1.7671590460424642 and parameters: {'feature_fraction': 0.6, 'max_depth': 1, 'num_leaves': 2, 'min_data_in_leaf': 15385, 'lambda_l1': 0.0008249311597741806, 'lambda_l2': 3.0541921795844616e-05, 'bagging_fraction': 0.7, 'bagging_freq': 10}. Best is trial 0 with value: 1.7671590460424642.[0m
[32m[I 2021-06-29 01:11:42,603][0m Trial 1 finished with value: 1.7590215065549752 and parameters: {'feature_fraction': 0.6, 'max_depth': 7, 'num_leaves': 115, 'min_data_in_leaf': 239, 'lambda_l1': 3.5677007990132073e-09, 'lambda_l2': 0.0004870285908221216, 'bagging_fraction': 0.6, 'bagging_freq': 8}. Best is trial 1 with value: 1.7590215065549752.[0m
[32m[I 2021-06-29 01:12:17,230][0m Trial 2 finished with value: 1.751606713932146 and parameters: {'feature_fraction': 0.2, 'max_depth': 5, 'num_leaves': 16, 'min_data_in_leaf': 3733, 'lambda_l1': 1.788570789507071e-05, 'lambda_l2': 0.000738251959879624, 'bagging_fraction': 0.65, 'b

[32m[I 2021-06-29 01:24:37,300][0m Trial 23 finished with value: 1.7501850978501872 and parameters: {'feature_fraction': 0.15000000000000002, 'max_depth': 5, 'num_leaves': 21, 'min_data_in_leaf': 2127, 'lambda_l1': 0.003061103672916891, 'lambda_l2': 1.6292100817480127e-08, 'bagging_fraction': 0.8, 'bagging_freq': 1}. Best is trial 18 with value: 1.7501203500801814.[0m
[32m[I 2021-06-29 01:25:06,722][0m Trial 24 finished with value: 1.7522038117218148 and parameters: {'feature_fraction': 0.1, 'max_depth': 4, 'num_leaves': 13, 'min_data_in_leaf': 2195, 'lambda_l1': 0.005387254482265552, 'lambda_l2': 1.396375787829532e-09, 'bagging_fraction': 0.8, 'bagging_freq': 1}. Best is trial 18 with value: 1.7501203500801814.[0m
[32m[I 2021-06-29 01:25:42,347][0m Trial 25 finished with value: 1.7504404371784699 and parameters: {'feature_fraction': 0.15000000000000002, 'max_depth': 5, 'num_leaves': 22, 'min_data_in_leaf': 1960, 'lambda_l1': 0.0005821019407155449, 'lambda_l2': 0.11095675789656

23.1
2021-06-29 01:34:06.209114
1.748532183865077


In [13]:
# sklearn.preprocessing.Normalizer
from sklearn.preprocessing import Normalizer
the_normalizer = Normalizer()
n_model, n_preds = do_all(X_test, y_test, X_train, y_train, the_normalizer)

[32m[I 2021-06-29 01:34:08,197][0m A new study created in memory with name: no-name-20637bf7-b507-47db-9df5-fcf47e93da30[0m
Searching the best hyperparameters...


2021-06-29 01:34:08.183865


[32m[I 2021-06-29 01:34:31,175][0m Trial 0 finished with value: 1.7568680092817224 and parameters: {'feature_fraction': 0.5, 'max_depth': 6, 'num_leaves': 39, 'min_data_in_leaf': 316, 'lambda_l1': 0.0011078523102291912, 'lambda_l2': 5.729242841835486e-07, 'bagging_fraction': 0.5, 'bagging_freq': 9}. Best is trial 0 with value: 1.7568680092817224.[0m
[32m[I 2021-06-29 01:34:56,024][0m Trial 1 finished with value: 1.7529458656740484 and parameters: {'feature_fraction': 0.8, 'max_depth': 5, 'num_leaves': 12, 'min_data_in_leaf': 8091, 'lambda_l1': 4.501526507906666, 'lambda_l2': 0.0025224348967209705, 'bagging_fraction': 0.7, 'bagging_freq': 4}. Best is trial 1 with value: 1.7529458656740484.[0m
[32m[I 2021-06-29 01:35:10,900][0m Trial 2 finished with value: 1.7630318944575554 and parameters: {'feature_fraction': 0.7000000000000001, 'max_depth': 4, 'num_leaves': 3, 'min_data_in_leaf': 12164, 'lambda_l1': 2.8425314125029986e-08, 'lambda_l2': 0.5999605228740887, 'bagging_fraction': 0

[32m[I 2021-06-29 01:45:59,113][0m Trial 23 finished with value: 1.7505121662532954 and parameters: {'feature_fraction': 0.45000000000000007, 'max_depth': 5, 'num_leaves': 24, 'min_data_in_leaf': 3108, 'lambda_l1': 8.296682179247139, 'lambda_l2': 0.026114211251894146, 'bagging_fraction': 0.8500000000000001, 'bagging_freq': 5}. Best is trial 22 with value: 1.750254813734653.[0m
[32m[I 2021-06-29 01:46:55,270][0m Trial 24 finished with value: 1.7512346742417564 and parameters: {'feature_fraction': 0.45000000000000007, 'max_depth': 4, 'num_leaves': 16, 'min_data_in_leaf': 4005, 'lambda_l1': 1.7357483701608072, 'lambda_l2': 1.3200529601468172, 'bagging_fraction': 0.95, 'bagging_freq': 5}. Best is trial 22 with value: 1.750254813734653.[0m
[32m[I 2021-06-29 01:47:35,820][0m Trial 25 finished with value: 1.7508719295723811 and parameters: {'feature_fraction': 0.6, 'max_depth': 5, 'num_leaves': 22, 'min_data_in_leaf': 3051, 'lambda_l1': 8.665620937266626, 'lambda_l2': 0.00059747766875

23.7
2021-06-29 01:57:50.071466
1.748080002371117


In [14]:
# do nothing to data
base = do_train(X_train, y_train)
do_test(X_test, y_test, base)

[32m[I 2021-06-29 01:57:52,012][0m A new study created in memory with name: no-name-e3d52dbb-9a4b-411a-8729-6e876cabd7c1[0m
Searching the best hyperparameters...


2021-06-29 01:57:51.991728


[32m[I 2021-06-29 01:58:36,320][0m Trial 0 finished with value: 1.7513615940697114 and parameters: {'feature_fraction': 0.4, 'max_depth': 6, 'num_leaves': 48, 'min_data_in_leaf': 1909, 'lambda_l1': 2.2131975276087785, 'lambda_l2': 1.1668309414051117e-08, 'bagging_fraction': 0.75, 'bagging_freq': 7}. Best is trial 0 with value: 1.7513615940697114.[0m
[32m[I 2021-06-29 01:58:53,708][0m Trial 1 finished with value: 1.7636431673069404 and parameters: {'feature_fraction': 0.8, 'max_depth': 4, 'num_leaves': 5, 'min_data_in_leaf': 18286, 'lambda_l1': 3.430126155682148e-07, 'lambda_l2': 4.355883902321961e-08, 'bagging_fraction': 0.8, 'bagging_freq': 6}. Best is trial 0 with value: 1.7513615940697114.[0m
[32m[I 2021-06-29 01:59:13,307][0m Trial 2 finished with value: 1.7551602282318934 and parameters: {'feature_fraction': 0.9500000000000001, 'max_depth': 5, 'num_leaves': 31, 'min_data_in_leaf': 1576, 'lambda_l1': 0.00015672892410501104, 'lambda_l2': 4.873229845531388e-07, 'bagging_fract

[32m[I 2021-06-29 02:11:25,462][0m Trial 23 finished with value: 1.7512181136742981 and parameters: {'feature_fraction': 0.30000000000000004, 'max_depth': 5, 'num_leaves': 16, 'min_data_in_leaf': 4896, 'lambda_l1': 0.003764465169600782, 'lambda_l2': 0.0005779030049657008, 'bagging_fraction': 0.8, 'bagging_freq': 7}. Best is trial 8 with value: 1.7507914000920626.[0m
[32m[I 2021-06-29 02:12:23,462][0m Trial 24 finished with value: 1.7514271725658759 and parameters: {'feature_fraction': 0.45000000000000007, 'max_depth': 6, 'num_leaves': 19, 'min_data_in_leaf': 4799, 'lambda_l1': 6.458498845461467e-05, 'lambda_l2': 0.016673454148543345, 'bagging_fraction': 0.8500000000000001, 'bagging_freq': 5}. Best is trial 8 with value: 1.7507914000920626.[0m
[32m[I 2021-06-29 02:12:56,297][0m Trial 25 finished with value: 1.7537529143048924 and parameters: {'feature_fraction': 0.2, 'max_depth': 3, 'num_leaves': 8, 'min_data_in_leaf': 8226, 'lambda_l1': 0.22867753656801237, 'lambda_l2': 5.75351

23.64
2021-06-29 02:21:30.589224
1.7485294661546393


array([[0.02387077, 0.01783118, 0.0152909 , ..., 0.05110071, 0.21169561,
        0.05152337],
       [0.03727782, 0.04332348, 0.04273437, ..., 0.08331729, 0.28000717,
        0.10058811],
       [0.03828316, 0.02640484, 0.0205466 , ..., 0.09690139, 0.48066512,
        0.08846737],
       ...,
       [0.04533367, 0.03506324, 0.03849023, ..., 0.09728498, 0.38061466,
        0.13545339],
       [0.04634373, 0.17672905, 0.11701944, ..., 0.05458129, 0.20125497,
        0.15897843],
       [0.04587652, 0.04569913, 0.03700641, ..., 0.10268204, 0.42361828,
        0.11791756]])