In [1]:
import time
import datetime

import numpy as np
import pandas as pd
import optgbm as opt
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [2]:
# Read the training and test tables from CSV files given by relative path
# (train.csv first, then test.csv).
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Columns of train_df whose values are replaced by LabelEncoder integer codes.
features_target = ['target']

for feature in features_target:
    # A fresh encoder per column; `le` keeps referring to the last one fitted.
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature])

In [4]:
# Print the row count, drop rows that repeat an earlier row in every column
# other than 'id', then print the row count again.
print(train_df.shape[0])
non_id_columns = train_df.columns[~train_df.columns.isin(['id'])]
train_df = train_df.drop_duplicates(subset=non_id_columns, keep='first')
print(train_df.shape[0])

200000
199894


In [5]:
# Separate the label column from the model inputs. `columns=` is used because
# passing the axis positionally (`drop([...], 1)`) was deprecated in pandas 1.3
# and removed in pandas 2.0.
target = train_df['target']
train_features = train_df.drop(columns=['target', 'id'])

# Fraction of rows that goes into the training half of the split.
train_percent = 0.50
# Fixed seed so the stratified split (and every score derived from it) is
# reproducible across kernel restarts.
split_seed = 42

X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    target,
    stratify=target,
    train_size=train_percent,
    random_state=split_seed,
)
print('train count: ', len(y_train))
print('test count: ', len(y_test))

train count:  99947
test count:  99947


In [6]:
def do_transform(the_train, the_test, scaler):
    """Fit ``scaler`` on the training rows and apply it to both splits.

    The scaler is fitted on ``the_train`` only. Fitting on the concatenation
    of both splits would let statistics of the held-out rows influence the
    preprocessing, which makes the held-out score optimistic.

    Parameters
    ----------
    the_train : training features, passed to ``scaler.fit`` and
        ``scaler.transform``.
    the_test : held-out features, passed to ``scaler.transform`` only.
    scaler : object exposing ``fit(X)`` and ``transform(X)``; it is fitted
        in place.

    Returns
    -------
    tuple
        ``(scaler.transform(the_train), scaler.transform(the_test))``.
    """
    scaler.fit(the_train)
    return scaler.transform(the_train), scaler.transform(the_test)

In [7]:
def do_train(X_train, y_train):
    """Fit a default ``opt.LGBMClassifier`` and report how long it took.

    Prints the current timestamp before and after fitting, and in between
    the elapsed wall-clock time in minutes rounded to two decimals.

    Parameters
    ----------
    X_train : training features handed to ``fit``.
    y_train : training labels handed to ``fit``.

    Returns
    -------
    The classifier instance that ``fit`` was called on.
    """
    print(datetime.datetime.now())
    started_at = time.time()

    classifier = opt.LGBMClassifier()
    classifier.fit(X_train, y_train)

    elapsed_minutes = (time.time() - started_at) / 60
    print(round(elapsed_minutes, 2))
    print(datetime.datetime.now())

    return classifier

In [8]:
def do_test(X_test, y_test, model):
    """Score ``model`` on held-out data and return its predicted probabilities.

    Prints the absolute value of ``log_loss(y_test, probabilities)``.

    Parameters
    ----------
    X_test : held-out features passed to ``model.predict_proba``.
    y_test : held-out labels passed to ``log_loss`` as the ground truth.
    model : object exposing ``predict_proba(X)``.

    Returns
    -------
    Whatever ``model.predict_proba(X_test)`` returned.
    """
    probabilities = model.predict_proba(X_test)
    loss = abs(log_loss(y_test, probabilities))
    print(loss)
    return probabilities

In [9]:
def do_all(X_test, y_test, X_train, y_train, scaler):
    """Run one experiment: transform both splits, train, then score.

    Uses the ``scaler`` argument and the freshly trained model rather than
    notebook-level globals, so each call is self-contained and works on a
    fresh kernel.

    Parameters
    ----------
    X_test, y_test : held-out features and labels (note: these come first).
    X_train, y_train : training features and labels.
    scaler : preprocessing object forwarded to ``do_transform``.

    Returns
    -------
    tuple
        ``(model, preds)`` - the model returned by ``do_train`` and the
        value returned by ``do_test`` for the transformed held-out split.
    """
    X_train_transformed, X_test_transformed = do_transform(X_train, X_test, scaler)
    model = do_train(X_train_transformed, y_train)
    # Score the model trained just above, not one left over in the kernel.
    preds = do_test(X_test_transformed, y_test, model)
    return model, preds

In [10]:
# Experiment: run the pipeline with a sklearn.preprocessing.StandardScaler.
from sklearn.preprocessing import StandardScaler

s_scaler = StandardScaler()
s_model, s_preds = do_all(
    X_test=X_test,
    y_test=y_test,
    X_train=X_train,
    y_train=y_train,
    scaler=s_scaler,
)

[32m[I 2021-06-28 23:45:55,448][0m A new study created in memory with name: no-name-daa2acd4-0c6d-4a6e-9b4f-6cfbbecf923f[0m
Searching the best hyperparameters...


2021-06-28 23:45:55.433213


[32m[I 2021-06-28 23:46:21,586][0m Trial 0 finished with value: 1.7552949586991118 and parameters: {'feature_fraction': 0.6, 'max_depth': 3, 'num_leaves': 4, 'min_data_in_leaf': 5680, 'lambda_l1': 1.5316202768776057e-07, 'lambda_l2': 0.0005289346051310094, 'bagging_fraction': 0.8500000000000001, 'bagging_freq': 1}. Best is trial 0 with value: 1.7552949586991118.[0m
[32m[I 2021-06-28 23:46:52,157][0m Trial 1 finished with value: 1.7569122259875076 and parameters: {'feature_fraction': 0.2, 'max_depth': 2, 'num_leaves': 4, 'min_data_in_leaf': 918, 'lambda_l1': 6.054145915962369e-09, 'lambda_l2': 6.975801473493385e-07, 'bagging_fraction': 0.8, 'bagging_freq': 4}. Best is trial 0 with value: 1.7552949586991118.[0m
[32m[I 2021-06-28 23:47:14,581][0m Trial 2 finished with value: 1.7607065557108896 and parameters: {'feature_fraction': 1.0, 'max_depth': 3, 'num_leaves': 4, 'min_data_in_leaf': 15646, 'lambda_l1': 0.3470264453103109, 'lambda_l2': 6.520858852956798e-05, 'bagging_fraction':

KeyboardInterrupt: 

In [None]:
# Experiment: run the pipeline with a sklearn.preprocessing.RobustScaler.
from sklearn.preprocessing import RobustScaler

r_scaler = RobustScaler()
r_model, r_preds = do_all(
    X_test=X_test,
    y_test=y_test,
    X_train=X_train,
    y_train=y_train,
    scaler=r_scaler,
)

In [None]:
# Experiment: run the pipeline with a sklearn.preprocessing.MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

mm_scaler = MinMaxScaler()
mm_model, mm_preds = do_all(
    X_test=X_test,
    y_test=y_test,
    X_train=X_train,
    y_train=y_train,
    scaler=mm_scaler,
)

In [None]:
# Experiment: run the pipeline with a sklearn.preprocessing.Normalizer.
from sklearn.preprocessing import Normalizer

the_normalizer = Normalizer()
n_model, n_preds = do_all(
    X_test=X_test,
    y_test=y_test,
    X_train=X_train,
    y_train=y_train,
    scaler=the_normalizer,
)

In [None]:
# Baseline: train and score on the untransformed features (no scaler).
base = do_train(X_train=X_train, y_train=y_train)
do_test(X_test=X_test, y_test=y_test, model=base)