## Loading libraries

In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc

import lightgbm as lgb

from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score, plot_roc_curve

In [None]:
# Representable limits of the narrower integer dtypes (lower bound, upper bound).
INT8_MIN, INT8_MAX = np.iinfo(np.int8).min, np.iinfo(np.int8).max
INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max
INT32_MIN, INT32_MAX = np.iinfo(np.int32).min, np.iinfo(np.int32).max

# Most negative / most positive finite values of the narrower float dtypes.
FLOAT16_MIN, FLOAT16_MAX = np.finfo(np.float16).min, np.finfo(np.float16).max
FLOAT32_MIN, FLOAT32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max


def memory_usage(data, detail = 1):
    """Print the total memory footprint of ``data`` in MB and return it.

    Parameters
    ----------
    data : object exposing ``memory_usage()`` whose result supports ``.sum()``
        (this notebook passes pandas DataFrames).
    detail : truthy to also ``display`` the per-entry byte counts first.

    Returns
    -------
    The summed byte count divided by 1024 * 1024.
    """
    per_entry_bytes = data.memory_usage()
    if detail:
        display(per_entry_bytes)

    memory = per_entry_bytes.sum() / 2 ** 20
    print(f'Memory usage : {memory:.2f}MB')
    return memory


def compress_dataset(data):
    """Downcast the float64 / int64 columns of ``data`` to narrower dtypes, in place.

    The name and dtype of every non-object column are printed. float64 columns
    are cast to float16 or float32 and int64 columns to int8, int16 or int32,
    taking the first candidate whose bounds strictly contain the column's
    minimum and maximum. After each float64 / int64 column the memory saved so
    far, relative to the frame's size on entry, is printed.

    Notes
    -----
    * ``data`` itself is modified; the same object is returned for convenience.
    * Only the value *range* is checked. Casting float64 to float16 / float32
      rounds the stored values, so precision is lost.
    * Columns of any other dtype are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are downcast.

    Returns
    -------
    pandas.DataFrame
        ``data`` after the in-place casts.
    """
    # (target dtype, label, exclusive lower bound, exclusive upper bound)
    float_targets = (
        (np.float16, 'float16', FLOAT16_MIN, FLOAT16_MAX),
        (np.float32, 'float32', FLOAT32_MIN, FLOAT32_MAX),
    )
    # (target dtype, label, exclusive lower bound, exclusive upper bound,
    #  dtype minimum and maximum shown in the log)
    # NOTE(review): int8 is only chosen inside HALF of its range, unlike int16
    # and int32 which use the full range. Kept as found -- confirm whether the
    # extra headroom is intentional.
    int_targets = (
        (np.int8, 'int8', INT8_MIN / 2, INT8_MAX / 2, INT8_MIN, INT8_MAX),
        (np.int16, 'int16', INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX),
        (np.int32, 'int32', INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX),
    )
    range_line = '{0:>9s} min: {1:15s} max: {2:15s}'

    memory_before_compress = memory_usage(data, 0)

    def _report_rate():
        # Saving of the whole frame so far, relative to its size on entry.
        memory_after_compress = memory_usage(data, 0)
        print('Compress Rate: [{0:.2%}]'.format((memory_before_compress - memory_after_compress) / memory_before_compress))

    print()
    print('=' * 50)
    for col in data.columns:
        col_dtype = data[col].dtype
        if col_dtype == 'object':
            continue

        # str() keeps the fixed-width format working for non-string column labels.
        print('Name: {0:24s} Type: {1}'.format(str(col), col_dtype))

        if col_dtype == 'float64':
            # min/max are computed only for the dtypes handled here, so other
            # dtypes (e.g. unordered categoricals) cannot make them raise.
            col_min, col_max = data[col].min(), data[col].max()
            print(range_line.format('variable', str(np.round(col_min, 4)), str(np.round(col_max, 4))))
            for target, label, low, high in float_targets:
                if (col_min > low) and (col_max < high):
                    data[col] = data[col].astype(target)
                    print(range_line.format(label, str(low), str(high)))
                    print('compress float64 --> {0}'.format(label))
                    break
            _report_rate()
            print('=' * 50)

        elif col_dtype == 'int64':
            col_min, col_max = data[col].min(), data[col].max()
            print(range_line.format('variable', str(col_min), str(col_max)))
            compressed_to = None
            for target, label, low, high, dtype_min, dtype_max in int_targets:
                if (col_min > low) and (col_max < high):
                    data[col] = data[col].astype(target)
                    print(range_line.format(label, str(dtype_min), str(dtype_max)))
                    compressed_to = label
                    break
            _report_rate()
            # Report the dtype that was actually applied; an int32 downcast or
            # an untouched column is no longer logged as "(int8)".
            if compressed_to is None:
                print('not compressed (int64 kept)')
            else:
                print('compress (int64) ==> ({0})'.format(compressed_to))
            print('=' * 50)

    print()
    _report_rate()

    return data

## Loading previously prepared datasets

**Check out my notebook to see how I prepared the datasets ;)** 
https://www.kaggle.com/christoforum/preparing-datasets/notebook

In [None]:
# Load the prepared training set (output of the preparation notebook linked above)
# and preview its first rows.
df_train = pd.read_csv('../input/preparing-datasets/train_prepared.csv')
df_train.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably the index saved with the CSV -- verify
# against the preparation notebook). errors = 'ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
df_train = df_train.drop(columns = 'Unnamed: 0', errors = 'ignore')

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
df_train.info()

In [None]:
# Load the prepared test set (output of the preparation notebook linked above)
# and preview its first rows.
df_test = pd.read_csv('../input/preparing-datasets/test_prepared.csv')
df_test.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably the index saved with the CSV -- verify
# against the preparation notebook). errors = 'ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
df_test = df_test.drop(columns = 'Unnamed: 0', errors = 'ignore')

In [None]:
# Column dtypes, non-null counts and memory footprint of the test frame.
df_test.info()

## Releasing memory

In [None]:
# Downcast the numeric columns of the training frame. compress_dataset casts the
# columns of the frame it is given and returns that same frame.
df_train = compress_dataset(df_train)

In [None]:
# Downcast the numeric columns of the test frame. compress_dataset casts the
# columns of the frame it is given and returns that same frame.
# NOTE(review): train and test are compressed independently, so the same column
# can end up with a different dtype in each frame -- confirm this is acceptable.
df_test = compress_dataset(df_test)

## LightGBM + RandomizedSearchCV

In [None]:
# Model inputs are exactly the columns present in the test frame.
feats = df_test.columns

# Feature matrix and label vector, both taken from the training frame.
X, y = df_train[feats], df_train['target']

In [None]:
# Hold out 30% of the rows for validation, keeping the class balance of y in both parts.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y,
    test_size = 0.3,
    stratify = y,
    random_state = 10,
)

In [None]:
%%time

# Keyword arguments forwarded unchanged to every LGBMClassifier.fit call made by the search.
# NOTE(review): X_validation is part of eval_set, so early stopping inside the search
# sees the same rows that are later used to judge the model -- confirm this is acceptable.
fit_params = dict(early_stopping_rounds = 200, eval_set = [(X_train, y_train), (X_validation, y_validation)],
                  eval_metric = 'auc', verbose = 200)

# Search space. learning_rate is drawn from uniform(loc = 0.05, scale = 0.1), i.e. from
# [0.05, 0.15]; every list is a set of discrete choices (e.g. reg_lambda is 0 or 20),
# not a range.
rs_params = dict(learning_rate = uniform(loc = 0.05, scale = 0.1), reg_lambda = [0, 20],
                 n_estimators = [5000], max_depth = [3, 5], num_leaves = [5, 7], subsample = [0.5, 0.6],
                 colsample_bytree = [0.3, 0.5], reg_alpha = [0, 20])


# The classifier gets its own name. Binding it to `lgb` would replace the imported
# lightgbm module, and this cell would then fail with AttributeError when run again.
lgb_clf = lgb.LGBMClassifier(random_state = 8, device = 'gpu')
rs_lgb = RandomizedSearchCV(estimator = lgb_clf, param_distributions = rs_params, scoring = 'roc_auc',
                            cv = 5, n_iter = 15, random_state = 12)
rs_lgb.fit(X_train, y_train, **fit_params)

# Last predict_proba column for the validation rows.
preds = rs_lgb.predict_proba(X_validation)[:, -1]

In [None]:
# Parameter setting that achieved the best cross-validated score in the search.
rs_lgb.best_params_

In [None]:
# Keep the estimator the search exposes for its best parameter setting and show its configuration.
model = rs_lgb.best_estimator_
model

In [None]:
# ROC curve of the selected model on the validation split, plus a dashed yellow
# diagonal from (0, 0) to (1, 1) as the chance-level reference.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2
# (RocCurveDisplay.from_estimator replaces it), so this cell and the matching import
# need an older scikit-learn -- confirm the version this notebook is pinned to.
plot_roc_curve(model, X_validation, y_validation)
plt.plot([0, 1], '--y')
plt.grid()

In [None]:
def check_model(model, n_splits = 5, random_state = 10):
    """Cross-validate ``model`` on the notebook-level ``X`` / ``y`` and print the AUC summary.

    The rows of ``X`` are split into ``n_splits`` shuffled folds. For every fold the
    model is fitted on the remaining rows with early stopping (200 rounds, metric
    'auc') and scored with ROC AUC on the held-out fold. The mean and standard
    deviation of the fold scores are printed; nothing is returned.

    Notes
    -----
    * ``X`` and ``y`` are read from the enclosing (notebook) scope, not passed in.
    * ``model`` is refitted in place on every fold, so after the call it holds
      the fit of the final fold.
    * The held-out fold is also part of ``eval_set``, so it drives early stopping
      as well as scoring; the printed scores may therefore be optimistic.

    Parameters
    ----------
    model : estimator accepting the LightGBM-style ``fit`` keywords used below
        and exposing ``predict_proba``.
    n_splits : int
        Number of folds.
    random_state : int or None
        Seed for the fold shuffling. A fixed default makes the printed scores
        repeatable between runs; pass ``None`` for an unseeded shuffle.
    """
    scores = []
    cv = KFold(n_splits = n_splits, shuffle = True, random_state = random_state)

    for train_idx, test_idx in cv.split(X):
        X_fold_train, y_fold_train = X.iloc[train_idx], y.iloc[train_idx]
        X_fold_test, y_fold_test = X.iloc[test_idx], y.iloc[test_idx]
        model.fit(X_fold_train, y_fold_train, early_stopping_rounds = 200, verbose = 200,
                  eval_set = [(X_fold_train, y_fold_train), (X_fold_test, y_fold_test)], eval_metric = 'auc')
        # Last predict_proba column for the held-out rows.
        fold_preds = model.predict_proba(X_fold_test)[:, -1]
        scores.append(roc_auc_score(y_fold_test, fold_preds))

    print('************************************')
    print(f'Mean AUCROC score:       {np.mean(scores)}')
    print(f'Std AUCROC:              {np.std(scores)}')

In [None]:
%%time

# 5-fold check of the selected model. check_model calls model.fit on the object it is
# given, so `model` is refitted by this cell.
check_model(model)

In [None]:
# Score the test set; `[:, -1]` keeps the last column of predict_proba's output
# (presumably the positive-class probability for a binary target -- confirm).
# NOTE(review): check_model(model) refits `model` in place, so once that cell has run
# these predictions come from the fit on its final fold -- confirm this is intended.
preds = model.predict_proba(df_test)[:, -1]

## Submission

In [None]:
# Start from the competition's sample submission and overwrite its 'target' column
# with the predictions.
# Assumes the rows of df_test are in the same order as the sample file -- TODO confirm.
sub = (
    pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
    .assign(target = preds)
)
sub.head()

In [None]:
# Write the submission file to the working directory, leaving out the DataFrame index.
sub.to_csv('lgbm_ver2.csv', index = False)