In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc

import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import roc_auc_score, plot_roc_curve

In [None]:
# Representable value ranges of the narrow NumPy dtypes that columns may be
# downcast to. Integer limits come from np.iinfo, floating limits from np.finfo.
INT8_MIN, INT8_MAX = np.iinfo(np.int8).min, np.iinfo(np.int8).max
INT16_MIN, INT16_MAX = np.iinfo(np.int16).min, np.iinfo(np.int16).max
INT32_MIN, INT32_MAX = np.iinfo(np.int32).min, np.iinfo(np.int32).max

FLOAT16_MIN, FLOAT16_MAX = np.finfo(np.float16).min, np.finfo(np.float16).max
FLOAT32_MIN, FLOAT32_MAX = np.finfo(np.float32).min, np.finfo(np.float32).max


def memory_usage(data, detail = 1):
    """Report the memory footprint of ``data`` in megabytes.

    Parameters
    ----------
    data : object exposing ``memory_usage()`` (e.g. a pandas DataFrame)
    detail : truthy to additionally show the per-column byte counts through
        ``display`` (expected to be available in the notebook namespace).

    Returns
    -------
    The summed ``memory_usage()`` divided by 1024 * 1024; the same figure is
    printed with two decimals.
    """
    bytes_per_column = data.memory_usage()
    if detail:
        display(bytes_per_column)
    bytes_per_megabyte = 1024 * 1024
    memory = bytes_per_column.sum() / bytes_per_megabyte
    print("Memory usage : {0:.2f}MB".format(memory))
    return memory


def compress_dataset(data):
    """Downcast the float64 / int64 columns of ``data`` to narrower dtypes.

    Columns are visited one by one. Object columns are skipped silently; for
    every other column its name and dtype are printed. Only float64 and int64
    columns are actually converted:

    * float64 -> float16 when min and max lie strictly inside the float16
      range, else float32 when they lie strictly inside the float32 range.
      The decision is based on range only; float16 keeps roughly three
      significant decimal digits, so values are rounded by the cast.
    * int64 -> int8 / int16 / int32, picking the first type whose range
      strictly contains the column's min and max. The int8 test uses only
      half of the int8 range.

    After each processed column the cumulative compression rate (relative to
    the memory measured on entry) is printed, followed by the conversion that
    was really performed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compress. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after conversion.
    """
    memory_before_compress = memory_usage(data, 0)
    print()
    print('=' * 50)
    for col in data.columns:
        col_dtype = data[col].dtype

        if col_dtype == 'object':
            continue

        print("Name: {0:24s} Type: {1}".format(col, col_dtype))

        if col_dtype == 'float64':
            # min/max are only computed for dtypes that are actually handled,
            # so unrelated dtypes cannot fail on these reductions.
            col_min = data[col].min()
            col_max = data[col].max()
            print(" variable min: {0:15s} max: {1:15s}".format(str(np.round(col_min, 4)), str(np.round(col_max, 4))))
            if (col_min > FLOAT16_MIN) and (col_max < FLOAT16_MAX):
                data[col] = data[col].astype(np.float16)
                print("  float16 min: {0:15s} max: {1:15s}".format(str(FLOAT16_MIN), str(FLOAT16_MAX)))
                print("compress float64 --> float16")
            elif (col_min > FLOAT32_MIN) and (col_max < FLOAT32_MAX):
                data[col] = data[col].astype(np.float32)
                print("  float32 min: {0:15s} max: {1:15s}".format(str(FLOAT32_MIN), str(FLOAT32_MAX)))
                print("compress float64 --> float32")
            memory_after_compress = memory_usage(data, 0)
            print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))
            print('=' * 50)

        elif col_dtype == 'int64':
            col_min = data[col].min()
            col_max = data[col].max()
            print(" variable min: {0:15s} max: {1:15s}".format(str(col_min), str(col_max)))
            # Name of the dtype the column was converted to; None means the
            # column stayed int64.
            target_name = None
            # NOTE(review): only half of the int8 range is accepted here —
            # presumably deliberate headroom; confirm before widening.
            if (col_min > INT8_MIN / 2) and (col_max < INT8_MAX / 2):
                target_name = 'int8'
                data[col] = data[col].astype(np.int8)
                print("     int8 min: {0:15s} max: {1:15s}".format(str(INT8_MIN), str(INT8_MAX)))
            elif (col_min > INT16_MIN) and (col_max < INT16_MAX):
                target_name = 'int16'
                data[col] = data[col].astype(np.int16)
                print("    int16 min: {0:15s} max: {1:15s}".format(str(INT16_MIN), str(INT16_MAX)))
            elif (col_min > INT32_MIN) and (col_max < INT32_MAX):
                target_name = 'int32'
                data[col] = data[col].astype(np.int32)
                print("    int32 min: {0:15s} max: {1:15s}".format(str(INT32_MIN), str(INT32_MAX)))
            memory_after_compress = memory_usage(data, 0)
            print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))
            # Report the conversion that really happened (previously int32 and
            # "not converted" were both logged as int8).
            if target_name is None:
                print("no compression (int64 kept)")
            else:
                print("compress (int64) ==> ({0})".format(target_name))
            print('=' * 50)

    print()
    memory_after_compress = memory_usage(data, 0)
    print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))

    return data

## Train set summary

In [None]:
# Read the training CSV from the competition input directory and preview its first rows.
df_train = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
df_train.head()

In [None]:
# Remove the 'id' column from the training frame.
df_train = df_train.drop(columns = 'id')

In [None]:
# Show (rows, columns) of the training frame after 'id' was dropped.
print(f'Train set shape:   {df_train.shape}')

In [None]:
# Print pandas' concise summary of the training frame.
df_train.info()

In [None]:
# Column names split by dtype: float columns are treated as numeric features,
# integer columns as categorical ones.
# NOTE(review): 'target' presumably lands in cat_feats_train when it is integer-typed — confirm.
num_feats_train = df_train.select_dtypes(include = 'float').columns.tolist()
cat_feats_train = df_train.select_dtypes(include = 'int').columns.tolist()
print(f'Number of num columns:   {len(num_feats_train)} \nNumber of cat columns:   {len(cat_feats_train)}')

In [None]:
# Descriptive statistics of the training columns.
df_train.describe()

In [None]:
# True when the largest per-column null count is zero, i.e. the training frame has no missing values.
df_train.isnull().sum().max() == 0

## Test set summary

In [None]:
# Read the test CSV from the competition input directory and preview its first rows.
df_test = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
df_test.head()

In [None]:
# Remove the 'id' column from the test frame.
df_test = df_test.drop(columns = 'id')

In [None]:
# Show (rows, columns) of the test frame after 'id' was dropped.
print(f'Test set shape:   {df_test.shape}')

In [None]:
# Print pandas' concise summary of the test frame.
df_test.info()

In [None]:
# Column names of the test frame split by dtype: float columns are treated as
# numeric features, integer columns as categorical ones.
num_feats_test = df_test.select_dtypes(include = 'float').columns.tolist()
cat_feats_test = df_test.select_dtypes(include = 'int').columns.tolist()
print(f'Number of num columns:   {len(num_feats_test)} \nNumber of cat columns:   {len(cat_feats_test)}')

In [None]:
# Descriptive statistics of the test columns.
df_test.describe()

In [None]:
# True when the largest per-column null count is zero, i.e. the test frame has no missing values.
df_test.isnull().sum().max() == 0

## Target summary

In [None]:
# Bar chart of how many training rows fall into each 'target' class.
fig, ax = plt.subplots(figsize = (5,5))
sns.countplot(x = df_train['target'], ax = ax)
ax.grid()

**Summary:**
1. Train set contains **1 000 000** rows and **287** columns (including `id`)
2. Train set has **240** columns with 'float' type and **46** columns with 'int' type
3. Test set contains **500 000** rows and **286** columns (including `id`)
4. Test set has **240** columns with 'float' type and **45** columns with 'int' type
5. There are **no missing values** in train set and test set.
6. Classes in target column are **balanced**.

## Standard Scaler

In [None]:
# Learn the scaling statistics from the training float columns only, then
# apply that same fitted scaler to both the training and the test frame.
scaler = StandardScaler().fit(df_train[num_feats_train])

df_train[num_feats_train] = scaler.transform(df_train[num_feats_train])
df_test[num_feats_test] = scaler.transform(df_test[num_feats_test])

In [None]:
# Preview the training frame after scaling.
df_train.head()

In [None]:
# Preview the test frame after scaling.
df_test.head()

## Feature engineering

In [None]:
def add_row_stats(frame, num_cols, cat_cols):
    """Add row-wise summary features to ``frame`` and return it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; the new columns are written in place.
    num_cols : list of column names
        Columns aggregated into 'mean', 'std', 'min' and 'max'.
    cat_cols : list of column names
        Columns aggregated into 'mean_2' and 'std_2'.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object with the six new columns.
    """
    frame['mean'] = frame[num_cols].mean(axis = 1)
    frame['std'] = frame[num_cols].std(axis = 1)
    frame['min'] = frame[num_cols].min(axis = 1)
    frame['max'] = frame[num_cols].max(axis = 1)
    frame['mean_2'] = frame[cat_cols].mean(axis = 1)
    frame['std_2'] = frame[cat_cols].std(axis = 1)
    return frame


# The integer-typed 'target' must not leak into the features. It is removed by
# name: slicing with [:-1] also cut a real feature off the test list, which has
# no 'target', so 'mean_2' / 'std_2' were built from different columns in
# train and test.
cat_feats_shared = [col for col in cat_feats_train if col != 'target']

# Both frames use the column lists derived from the training frame so the
# engineered features are defined identically; a column missing from the test
# frame raises a KeyError instead of silently changing the feature.
df_train = add_row_stats(df_train, num_feats_train, cat_feats_shared)
df_test = add_row_stats(df_test, num_feats_train, cat_feats_shared)

In [None]:
# Preview the training frame with the engineered columns.
df_train.head()

In [None]:
# Preview the test frame with the engineered columns.
df_test.head()

## Release memory

In [None]:
# Downcast the float64 / int64 columns of the training frame to narrower dtypes.
# compress_dataset logs each conversion and returns the frame it was given.
df_train = compress_dataset(df_train)

In [None]:
# Downcast the float64 / int64 columns of the test frame to narrower dtypes.
# compress_dataset logs each conversion and returns the frame it was given.
df_test = compress_dataset(df_test)

## Saving

In [None]:
# Write both prepared frames as CSV files into the working directory.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column; CSV is also plain text, so the
# downcast dtypes are presumably not restored on reload — confirm with the
# code that reads these files.
for prepared_frame, output_path in ((df_train, 'train_prepared.csv'), (df_test, 'test_prepared.csv')):
    prepared_frame.to_csv(output_path)