In [1]:
!pip install optgbm



In [2]:
import time
import datetime

import numpy as np
import pandas as pd
import optgbm as lgb

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
from sklearn.preprocessing import LabelEncoder
features_target = ['target'] 

for feature in features_target:
    le = LabelEncoder()
    le.fit(train_df[feature])
    train_df[feature] = le.transform(train_df[feature])
    
print(train_df['target'].head())

0    5
1    5
2    1
3    7
4    1
Name: target, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
train_no_predict = train_df.drop(['id', 'target'], 1)
train_predict = train_df['target']

train_percent = 0.50

X_train, X_test, y_train, y_test = train_test_split(train_no_predict, train_predict, stratify=train_predict, train_size=train_percent)
print('train count: ', len(y_train))
print('test count: ', len(y_test))

train count:  100000
test count:  100000


In [6]:
from sklearn.neighbors import LocalOutlierFactor
print(datetime.datetime.now())
start = time.time()

column_count = len(X_train.columns)
lof = LocalOutlierFactor(n_neighbors=column_count)
yhat = lof.fit_predict(X_train)

minutes = (time.time() - start) / 60
print(round(minutes, 2))
print(datetime.datetime.now())

2021-06-28 18:35:15.487523
3.55
2021-06-28 18:38:48.263041


In [7]:
len(y_train)

100000

In [8]:
mask = yhat != -1
X_train_non_outliers, y_train_non_outliers = X_train.iloc[mask, :], y_train.iloc[mask]

print('without outliers', len(X_train_non_outliers))
print('with outliers', len(X_train))

without outliers 52619
with outliers 100000


## With outliers

In [9]:
print(datetime.datetime.now())
start = time.time()

lgbm = lgb.LGBMClassifier()
lgbm.fit(X_train, y_train)

minutes = (time.time() - start) / 60
print(round(minutes, 2))
print(datetime.datetime.now())

[32m[I 2021-06-28 18:38:48,340][0m A new study created in memory with name: no-name-a48ceedd-b65f-4a68-a8d3-3162af66dcd1[0m
Searching the best hyperparameters...


2021-06-28 18:38:48.320249


[32m[I 2021-06-28 18:39:06,339][0m Trial 0 finished with value: 1.7646115119783663 and parameters: {'feature_fraction': 1.0, 'max_depth': 1, 'num_leaves': 2, 'min_data_in_leaf': 11919, 'lambda_l1': 9.473638581982025e-07, 'lambda_l2': 4.631716901174686e-09, 'bagging_fraction': 0.75, 'bagging_freq': 4}. Best is trial 0 with value: 1.7646115119783663.[0m
[32m[I 2021-06-28 18:40:04,975][0m Trial 1 finished with value: 1.7645922447026492 and parameters: {'feature_fraction': 0.4, 'max_depth': 1, 'num_leaves': 2, 'min_data_in_leaf': 5104, 'lambda_l1': 2.1046335185526698e-05, 'lambda_l2': 7.9188082368761e-06, 'bagging_fraction': 0.95, 'bagging_freq': 9}. Best is trial 1 with value: 1.7645922447026492.[0m
[32m[I 2021-06-28 18:40:42,975][0m Trial 2 finished with value: 1.7574055752142137 and parameters: {'feature_fraction': 0.9, 'max_depth': 2, 'num_leaves': 3, 'min_data_in_leaf': 13514, 'lambda_l1': 1.9643362448879027e-05, 'lambda_l2': 1.1008746733379623e-08, 'bagging_fraction': 0.8, 'b

[32m[I 2021-06-28 18:54:07,670][0m Trial 23 finished with value: 1.7503618195919866 and parameters: {'feature_fraction': 0.2, 'max_depth': 6, 'num_leaves': 40, 'min_data_in_leaf': 2211, 'lambda_l1': 5.998106814820692e-08, 'lambda_l2': 0.5701570342716147, 'bagging_fraction': 0.8, 'bagging_freq': 8}. Best is trial 23 with value: 1.7503618195919866.[0m
[32m[I 2021-06-28 18:54:37,706][0m Trial 24 finished with value: 1.7523784810560794 and parameters: {'feature_fraction': 0.1, 'max_depth': 5, 'num_leaves': 24, 'min_data_in_leaf': 2980, 'lambda_l1': 7.930687667895634e-08, 'lambda_l2': 0.042264830985325436, 'bagging_fraction': 0.8, 'bagging_freq': 8}. Best is trial 23 with value: 1.7503618195919866.[0m
[32m[I 2021-06-28 18:55:13,924][0m Trial 25 finished with value: 1.7517346206667148 and parameters: {'feature_fraction': 0.35, 'max_depth': 6, 'num_leaves': 48, 'min_data_in_leaf': 1689, 'lambda_l1': 1.1200954451895484e-08, 'lambda_l2': 1.019286204365929, 'bagging_fraction': 0.75, 'bag

24.34
2021-06-28 19:03:08.498420


In [10]:
from sklearn.metrics import log_loss
test_preds = lgbm.predict_proba(X_test)
print(abs(log_loss(y_test, test_preds)))

1.748085512373771


## Without outliers

In [11]:
print(datetime.datetime.now())
start = time.time()

lgbm_no_outliers = lgb.LGBMClassifier()
lgbm_no_outliers.fit(X_train_non_outliers, y_train_non_outliers)

minutes = (time.time() - start) / 60
print(round(minutes, 2))
print(datetime.datetime.now())

[32m[I 2021-06-28 19:03:10,211][0m A new study created in memory with name: no-name-bcfefda0-7d40-46fa-b594-5dae9bb5bd6b[0m
Searching the best hyperparameters...


2021-06-28 19:03:10.200997


[32m[I 2021-06-28 19:03:15,928][0m Trial 0 finished with value: 1.7191255592617989 and parameters: {'feature_fraction': 0.9500000000000001, 'max_depth': 1, 'num_leaves': 2, 'min_data_in_leaf': 2948, 'lambda_l1': 0.6992236582314836, 'lambda_l2': 2.0628076982625547, 'bagging_fraction': 0.65, 'bagging_freq': 9}. Best is trial 0 with value: 1.7191255592617989.[0m
[32m[I 2021-06-28 19:03:33,387][0m Trial 1 finished with value: 1.710119295918601 and parameters: {'feature_fraction': 0.35, 'max_depth': 3, 'num_leaves': 6, 'min_data_in_leaf': 4176, 'lambda_l1': 0.5199111594413819, 'lambda_l2': 0.0002183843234858541, 'bagging_fraction': 0.75, 'bagging_freq': 8}. Best is trial 1 with value: 1.710119295918601.[0m
[32m[I 2021-06-28 19:03:49,788][0m Trial 2 finished with value: 1.7127100065047092 and parameters: {'feature_fraction': 0.4, 'max_depth': 4, 'num_leaves': 13, 'min_data_in_leaf': 225, 'lambda_l1': 0.24503189493477853, 'lambda_l2': 4.115079970491543e-06, 'bagging_fraction': 0.8, 'b

[32m[I 2021-06-28 19:09:14,819][0m Trial 23 finished with value: 1.7088802677478085 and parameters: {'feature_fraction': 0.2, 'max_depth': 4, 'num_leaves': 16, 'min_data_in_leaf': 2304, 'lambda_l1': 3.361527183551423e-08, 'lambda_l2': 0.029002761517840505, 'bagging_fraction': 0.95, 'bagging_freq': 2}. Best is trial 14 with value: 1.7081817088883144.[0m
[32m[I 2021-06-28 19:09:30,454][0m Trial 24 finished with value: 1.7088390860400398 and parameters: {'feature_fraction': 0.2, 'max_depth': 6, 'num_leaves': 39, 'min_data_in_leaf': 1349, 'lambda_l1': 7.722032537973647e-09, 'lambda_l2': 0.0009590887125615376, 'bagging_fraction': 0.9, 'bagging_freq': 3}. Best is trial 14 with value: 1.7081817088883144.[0m
[32m[I 2021-06-28 19:09:48,456][0m Trial 25 finished with value: 1.70999166894615 and parameters: {'feature_fraction': 0.45000000000000007, 'max_depth': 5, 'num_leaves': 32, 'min_data_in_leaf': 1625, 'lambda_l1': 8.801205929148821e-08, 'lambda_l2': 3.3801462901106745e-05, 'bagging_

10.47
2021-06-28 19:13:38.357858


In [12]:
from sklearn.metrics import log_loss
test_preds = lgbm_no_outliers.predict_proba(X_test)
print(abs(log_loss(y_test, test_preds)))

1.7496646180610862
