In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc

import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score, plot_roc_curve

In [None]:
# Representable ranges of the narrower integer dtypes considered by compress_dataset.
INT8_MIN, INT8_MAX = np.iinfo(np.int8).min, np.iinfo(np.int8).max
INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max
INT32_MIN, INT32_MAX = np.iinfo(np.int32).min, np.iinfo(np.int32).max

# Representable (finite) ranges of the narrower float dtypes considered by compress_dataset.
FLOAT16_MIN, FLOAT16_MAX = np.finfo(np.float16).min, np.finfo(np.float16).max
FLOAT32_MIN, FLOAT32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max


def memory_usage(data, detail = 1):
    """Print and return the total memory footprint of ``data`` in megabytes.

    Parameters
    ----------
    data : object exposing ``memory_usage()`` whose result has ``.sum()``
        (a pandas DataFrame everywhere it is called in this notebook).
    detail : truthy/falsy, default 1
        When truthy, the per-column breakdown is rendered with IPython's
        ``display`` before the total is printed.

    Returns
    -------
    The summed memory usage divided by 1024 * 1024.
    """
    per_column = data.memory_usage()
    if detail:
        display(per_column)

    bytes_per_megabyte = 1024 * 1024
    memory = per_column.sum() / bytes_per_megabyte
    print("Memory usage : {0:.2f}MB".format(memory))
    return memory


def _narrowest_fit(col_min, col_max, candidates):
    """Return the first candidate tuple whose bounds strictly contain the column range, else None.

    Each candidate is ``(dtype, lower, upper, ...)``; it fits when
    ``col_min > lower`` and ``col_max < upper``.
    """
    for candidate in candidates:
        if (col_min > candidate[1]) and (col_max < candidate[2]):
            return candidate
    return None


def compress_dataset(data):
    """Downcast ``float64`` / ``int64`` columns of ``data`` to narrower dtypes.

    For every non-object column the name and dtype are printed. ``float64``
    columns are cast to float16 or float32 and ``int64`` columns to int8,
    int16 or int32, choosing the first type whose range strictly contains the
    column's min and max. Columns of any other dtype are left untouched.
    After each processed column the running memory usage and the cumulative
    compression rate relative to the initial footprint are printed.

    The selection looks only at the value range: casting floats to
    float16/float32 reduces precision.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compress. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after the casts.
    """
    # (target dtype, exclusive lower bound, exclusive upper bound, printed min, printed max)
    float_candidates = [
        (np.float16, FLOAT16_MIN, FLOAT16_MAX, FLOAT16_MIN, FLOAT16_MAX),
        (np.float32, FLOAT32_MIN, FLOAT32_MAX, FLOAT32_MIN, FLOAT32_MAX),
    ]
    int_candidates = [
        # NOTE(review): int8 is only chosen when the values fit in half of the
        # int8 range; presumably deliberate headroom - confirm. Kept as-is.
        (np.int8, INT8_MIN / 2, INT8_MAX / 2, INT8_MIN, INT8_MAX),
        (np.int16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX),
        (np.int32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX),
    ]

    memory_before_compress = memory_usage(data, 0)
    print()
    print('=' * 50)
    for col in data.columns:
        col_dtype = data[col].dtype
        if col_dtype == 'object':
            continue

        # str(col) so that non-string column labels do not break the 's' format code.
        print("Name: {0:24s} Type: {1}".format(str(col), col_dtype))

        is_float = col_dtype == 'float64'
        is_int = col_dtype == 'int64'
        if not (is_float or is_int):
            # Only float64/int64 are downcast; skipping here also avoids calling
            # min()/max() on dtypes that may not support them.
            continue

        col_min = data[col].min()
        col_max = data[col].max()

        if is_float:
            shown_min, shown_max = str(np.round(col_min, 4)), str(np.round(col_max, 4))
            candidates = float_candidates
        else:
            shown_min, shown_max = str(col_min), str(col_max)
            candidates = int_candidates
        print(" variable min: {0:15s} max: {1:15s}".format(shown_min, shown_max))

        chosen = _narrowest_fit(col_min, col_max, candidates)
        target_name = None
        if chosen is not None:
            target_dtype, _, _, limit_min, limit_max = chosen
            target_name = np.dtype(target_dtype).name
            data[col] = data[col].astype(target_dtype)
            print("{0:>9s} min: {1:15s} max: {2:15s}".format(target_name, str(limit_min), str(limit_max)))
            if is_float:
                print("compress float64 --> {0}".format(target_name))

        memory_after_compress = memory_usage(data, 0)
        print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))
        if is_int:
            # Report the dtype that was actually applied (or that nothing changed).
            if target_name is None:
                print("no compression (int64 kept)")
            else:
                print("compress (int64) ==> ({0})".format(target_name))
        print('=' * 50)

    print()
    memory_after_compress = memory_usage(data, 0)
    print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))

    return data

## Train set summary

In [None]:
# Load the training CSV from the Kaggle input directory and preview the first rows.
df_train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
df_train.head()

In [None]:
# Drop the `id` column. errors='ignore' keeps the cell re-runnable: without it a
# second execution raises KeyError because `id` is already gone.
df_train = df_train.drop(columns = 'id', errors = 'ignore')

In [None]:
# Shape of the training frame after `id` was dropped.
print(f'Train set shape:   {df_train.shape}')

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
df_train.info()

In [None]:
# Per-column summary statistics of the training frame.
df_train.describe()

In [None]:
# True when the largest per-column null count is 0, i.e. no column has missing values.
df_train.isnull().sum().max() == 0

## Test set summary

In [None]:
# Load the test CSV from the Kaggle input directory and preview the first rows.
df_test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
df_test.head()

In [None]:
# Drop the `id` column. errors='ignore' keeps the cell re-runnable: without it a
# second execution raises KeyError because `id` is already gone.
df_test = df_test.drop(columns = 'id', errors = 'ignore')

In [None]:
# Shape of the test frame after `id` was dropped.
print(f'Test set shape:   {df_test.shape}')

In [None]:
# Column dtypes, non-null counts and memory footprint of the test frame.
df_test.info()

In [None]:
# Per-column summary statistics of the test frame.
df_test.describe()

In [None]:
# True when the largest per-column null count is 0, i.e. no column has missing values.
df_test.isnull().sum().max() == 0

## Target summary

In [None]:
# value_counts() orders classes by frequency, not by value, so the wedge labels
# are taken from its index instead of being hard-coded (which could swap them).
target_counts = df_train['target'].value_counts()

plt.figure(figsize = (5,5))
# explode has two entries, so this assumes `target` has exactly two classes.
plt.pie(x = target_counts, labels = [str(label) for label in target_counts.index], autopct = '%1.2f%%', 
        explode = [0.05, 0], startangle = 90)
plt.title('Target class share')
plt.show()

**Summary:**
1. Train set contains **600 000** rows and **102** columns (including `id`)
2. Test set contains **540 000** rows and **101** columns (including `id`)
3. All columns are 'float' type (except `target`)
4. There are **no missing values** in train set and test set.
5. Classes in the target column are **balanced**.

## Feature engineering

In [None]:
# Fixed random_state so the sampled rows (and the density plots built from
# them) are the same on every run. 51 matches the seed used for the model.
df_train_sample = df_train.sample(n = 20000, random_state = 51)
df_test_sample = df_test.sample(n = 20000, random_state = 51)

In [None]:
# One KDE panel per feature f0..f99, overlaying the train sample and the test sample.
fig, axes = plt.subplots(10,10, figsize = (30, 30))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    feature = f'f{idx}'

    # `palette` only applies together with `hue`; without it seaborn ignores the
    # argument, so each distribution gets an explicit colour and label instead.
    sns.kdeplot(data = df_train_sample, ax = ax, fill = True, x = feature, 
                color = 'red', label = 'train')
    sns.kdeplot(data = df_test_sample, ax = ax, fill = True, x = feature, 
                color = 'blue', label = 'test')
 
    ax.set_xticks([]); ax.set_yticks([]); ax.set_xlabel('')
    ax.set_ylabel(''); ax.spines['left'].set_visible(False)
    ax.set_title(feature, loc = 'right', weight = 'bold', fontsize = 10)

# A single shared legend, built from the first panel, identifies train vs. test.
legend_handles, legend_labels = axes[0].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc = 'upper right')

fig.supxlabel('Probability Density Function Estimation', ha = 'center', fontweight = 'bold')
fig.tight_layout()
plt.show()

In [None]:
# Features placed in the "peaks" group; every other raw feature goes to "no_peaks".
peaks = ['f0','f2','f4','f9','f12','f16','f19','f20','f23','f24','f27',
    'f28','f30','f31','f32','f33','f35','f39','f42','f44','f46','f48',
    'f49','f51','f52','f53','f56','f58','f59','f60','f61','f62','f63',
    'f64','f68','f69','f72','f73','f75','f76','f78','f79','f81','f83',
    'f84','f87','f88','f89','f90','f92','f93','f94','f95','f98','f99']

# Row-wise statistics to derive, in the order their columns are appended.
row_stats = ['median', 'mean', 'std', 'sum', 'min', 'max', 'skew']
engineered = {f'{stat}_{group}' for stat in row_stats for group in ('peaks', 'no_peaks')}

# Exclude the engineered columns as well, so that re-running this cell does not
# fold previously created statistics into the "no_peaks" group.
no_peaks = [feats for feats in df_test.columns
            if feats not in peaks and feats not in engineered]

# For each statistic add `<stat>_peaks` and `<stat>_no_peaks` to both frames.
for stat in row_stats:
    for frame in (df_train, df_test):
        frame[f'{stat}_peaks'] = getattr(frame[peaks], stat)(axis = 1)
        frame[f'{stat}_no_peaks'] = getattr(frame[no_peaks], stat)(axis = 1)

In [None]:
# Preview the training frame with the engineered row-wise statistics appended.
df_train.head()

In [None]:
# Preview the test frame with the engineered row-wise statistics appended.
df_test.head()

## Standard Scaler

In [None]:
scaler = StandardScaler()

# Columns to standardise: every float column of the training frame.
float_columns = list(df_train.select_dtypes('float').columns)

# Fit on train only, then apply the same transform to the same named columns of
# test. Selecting by name (rather than passing the whole test frame) guarantees
# both frames are scaled over identical columns in identical order, raises a
# clear KeyError if test lacks one, and keeps the test frame's index.
df_train[float_columns] = scaler.fit_transform(df_train[float_columns])
df_test[float_columns] = scaler.transform(df_test[float_columns])

In [None]:
# Preview the training frame after standardisation.
df_train.head()

In [None]:
# Preview the test frame after standardisation.
df_test.head()

## Release memory

In [None]:
# Downcast the float64/int64 columns of the training frame to narrower dtypes.
df_train = compress_dataset(df_train)

In [None]:
# Downcast the float64/int64 columns of the test frame to narrower dtypes.
df_test = compress_dataset(df_test)

## Optuna

In [None]:
# Features are every column except the label. Selecting by dtype ('float16')
# would silently drop any feature that compress_dataset left as float32/float64.
X = df_train.drop(columns = 'target')
y = df_train['target']

In [None]:
def objective(trial):
    """Optuna objective: mean validation AUROC of an LGBMClassifier over 5 folds.

    Samples a hyper-parameter set from ``trial``, fits the classifier on each
    training split of the module-level ``X`` / ``y`` with early stopping on the
    held-out split, and returns the average AUROC across the folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean held-out AUROC over the 5 folds.
    """
    # NOTE(review): LightGBM applies row subsampling only when the bagging
    # frequency (`subsample_freq`) is > 0; it is not set here, so `subsample`
    # may have no effect - confirm against the LightGBM parameter docs.
    params = {'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 0.9, 0.1),
              'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 0.9, 0.1),
              'max_depth': trial.suggest_int("max_depth", 5, 9, 2),
              'reg_alpha': trial.suggest_int("reg_alpha", 0, 60, 20),
              'reg_lambda': trial.suggest_int('reg_lambda', 0, 60, 20),
              'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1)}

    model = lgb.LGBMClassifier(**params, random_state = 51, n_estimators = 1000, device = 'gpu', n_jobs = -1)

    # Fixed seed: every trial is scored on the same folds, so differences between
    # trials come from the hyper-parameters and not from the random split.
    cv = KFold(n_splits = 5, shuffle = True, random_state = 51)
    scores = []

    for train_idx, test_idx in cv.split(X):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        model.fit(X_train, y_train, early_stopping_rounds = 200, eval_metric = 'auc', 
                  eval_set = [(X_train, y_train), (X_test, y_test)], verbose = 200)
        preds = model.predict_proba(X_test)[:, 1]

        scores.append(roc_auc_score(y_test, preds))

    return np.mean(scores)

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across runs (51 matches the seed used for the model).
study = optuna.create_study(direction = 'maximize', sampler = TPESampler(seed = 51))
study.optimize(objective, n_trials = 30)

In [None]:
# Hyper-parameters of the best trial found by the study; displayed as the cell output.
params = study.best_params
params

In [None]:
# Refit with the tuned hyper-parameters on 10 folds; each fold's model also
# predicts the test set, and those predictions are collected for averaging.
# random_state matches the value used during tuning so the run is repeatable.
model = lgb.LGBMClassifier(**params, random_state = 51, n_estimators = 2000, device = 'gpu', n_jobs = -1)

scores = []
preds_tests = []
cv = KFold(n_splits = 10, shuffle = True, random_state = 51)

# Give the model exactly the training feature columns, in the training order.
test_features = df_test[X.columns]

for train_idx, test_idx in cv.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    model.fit(X_train, y_train, early_stopping_rounds = 200, eval_metric = 'auc', 
              eval_set = [(X_train, y_train), (X_test, y_test)], verbose = 200)
    
    preds = model.predict_proba(X_test)[:, 1]
    preds_test = model.predict_proba(test_features)[:, 1]
    
    score = roc_auc_score(y_test, preds)
    scores.append(score)
    preds_tests.append(preds_test)

print('************************************')    
print(f"Mean AUROC score:       {np.mean(scores)}")
print(f"Std AUROC:              {np.std(scores)}")

In [None]:
# `model` is the estimator fitted on the last CV fold. Plot its ROC on that
# fold's held-out rows (X_test / y_test from the loop); plotting on the full
# X / y would mostly score rows the model was trained on.
plot_roc_curve(model, X_test, y_test)
plt.grid()

In [None]:
# Feature importance of `model` as it stands after the cross-validation loop.
lgb.plot_importance(model, figsize = (10, 30))

## Submission

In [None]:
# Fill the sample submission with the element-wise mean of the per-fold test predictions.
sub = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')
sub['target'] = np.mean(preds_tests, axis = 0)
sub.head()

In [None]:
# Write the submission without the index column.
# NOTE(review): the number in the file name presumably records a score - confirm.
sub.to_csv('lgbm_0.74631.csv', index = False)