In [None]:
import numpy as np              # linear algebra
import pandas as pd             # data processing, CSV file I/O (e.g. pd.read_csv)
                                
import matplotlib.pyplot as plt # data visualization
import seaborn as sns           # data visualization
                                
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import optuna
import lightgbm as lgb
from sklearn.metrics import auc, roc_curve

# Data Preparation

---

## Data Extraction

In [None]:
# Read the three competition CSV files in one pass: the submission template,
# the training set and the test set.
sample_submission, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv",
        "/kaggle/input/tabular-playground-series-nov-2021/train.csv",
        "/kaggle/input/tabular-playground-series-nov-2021/test.csv",
    )
)

In [None]:
# Dimensions (rows, columns) of the submission template.
sample_submission.shape

In [None]:
# Dimensions (rows, columns) of the training frame.
train.shape

In [None]:
# pandas summary (DataFrame.info) of the training frame.
train.info()

In [None]:
# Dimensions (rows, columns) of the test frame.
test.shape

In [None]:
# pandas summary (DataFrame.info) of the test frame.
test.info()

## Data Concatenation

In [None]:
# Stack the training rows on top of the test rows; each frame keeps its own
# row labels and the column order is left unsorted.
data = pd.concat(objs = (train, test), axis = 0, sort = False)
data.shape

In [None]:
# pandas summary (DataFrame.info) of the combined train + test frame.
data.info()

In [None]:
# First rows of the combined frame.
data.head()

## Null Check

In [None]:
# Names of the columns (every column except the last one) that hold at least
# one missing value, found from a single per-column null count.
null_cols = [
    col_name
    for col_name, n_missing in data.iloc[:, :-1].isnull().sum().items()
    if n_missing != 0
]
null_cols

In [None]:
# Drop the reference to the combined frame built for the checks above.
del data

# Data Visualization

---

## Features Distribution

In [None]:
# Columns between the first and the last one of `train` whose dtype is float64.
float_cols = [
    col_name
    for col_name, col_dtype in train.iloc[:, 1:-1].dtypes.items()
    if col_dtype == "float64"
]
len(float_cols)

In [None]:
# Hex colour codes used by the distribution plot.
CHUNMEIHONG, QIUBOLAN = '#f1939c', '#8abcd1'    # the two hue colours of the KDE curves
XIANGYABAI, ZHENZHUHUI = '#fffef8', '#e4dfd7'   # grid-line colour, axes background colour

# A 20 x 5 grid of axes, flattened so the axes can be walked with a single index.
fig, axes = plt.subplots(nrows = 20, ncols = 5, figsize = (16, 48))
axes = axes.flatten()

def features_distribution(axes):
    """Draw one target-split KDE plot per axis for columns ``f0``, ``f1``, ...

    The axis at position ``idx`` receives the density of column ``f{idx}`` of
    the global ``train`` frame, split and coloured by ``target``. Only the
    first axis shows a legend, and only axes whose index is a multiple of five
    keep their y-label.

    Parameters
    ----------
    axes : iterable of matplotlib Axes
        Axes to draw on; position ``idx`` is paired with column ``f{idx}``.
    """
    # Loop-invariant: select the plotting columns once instead of re-copying
    # the frame for every axis.
    plot_data = train[float_cols + ['target']]

    for idx, ax in enumerate(axes):
        feature = f'f{idx}'

        sns.kdeplot(
            data = plot_data,
            ax = ax,
            hue = 'target',
            fill = True,
            x = feature,
            palette = [CHUNMEIHONG, QIUBOLAN],
            legend = idx == 0,
            alpha = .5,
            linewidth = 2.5,
        )

        ax.grid(
            color = XIANGYABAI,
            linestyle = ":",
            linewidth = 1.25,
            alpha = 0.3,
        )
        ax.set_facecolor(ZHENZHUHUI)

        # Hide the left and top spines and move the y ticks to the right side.
        ax.spines['left'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.tick_right()
        ax.yaxis.set_label_position("left")

        ax.set_title(
            feature,
            loc = 'right',
            weight = 'bold',
            fontsize = 10,
        )

        # The title already names the feature, so the x-label is blanked.
        ax.set_xlabel('')
        if idx % 5 != 0:
            ax.set_ylabel('')

# Fill every axis of the grid, then add the figure-level labels and title.
features_distribution(axes)

label_style = dict(ha = 'center', fontweight = 'bold', fontsize = 16)
fig.supxlabel('Probability', y = -0.005, **label_style)
fig.supylabel('Density', x = -0.005, **label_style)
fig.suptitle(
    'Features Distribution',
    ha = 'center',
    fontweight = 'heavy',
    fontsize = 20,
    y = 1,
)
fig.tight_layout()

## Feature Engineering

In [None]:
def _add_row_stats(frame, column_groups, stats):
    """Append row-wise aggregate columns named ``<stat>_<group>`` to ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that receives the new columns.
    column_groups : dict[str, list[str]]
        Group name -> columns of ``frame`` aggregated for that group.
    stats : iterable of str
        Names of DataFrame reduction methods (e.g. ``'mean'``), each called
        with ``axis = 1``.
    """
    for stat in stats:
        for group_name, group_cols in column_groups.items():
            frame[f'{stat}_{group_name}'] = getattr(frame[group_cols], stat)(axis = 1)


# Work on copies so the raw frames stay untouched.
df_train = train.copy()
df_test = test.copy()

peaks = ['f0','f2','f4','f9','f12','f16','f19','f20','f23','f24','f27',
    'f28','f30','f31','f32','f33','f35','f39','f42','f44','f46','f48',
    'f49','f51','f52','f53','f56','f58','f59','f60','f61','f62','f63',
    'f64','f68','f69','f72','f73','f75','f76','f78','f79','f81','f83',
    'f84','f87','f88','f89','f90','f92','f93','f94','f95','f98','f99']

# Every remaining test column except the row identifier: keeping 'id' here
# would fold it into each *_no_peaks aggregate below.
no_peaks = [feats for feats in df_test.columns if feats not in peaks and feats != 'id']

# Columns are added per statistic as <stat>_peaks then <stat>_no_peaks, in the
# same order for both frames.
row_stats = ('median', 'mean', 'std', 'sum', 'min', 'max', 'skew')
column_groups = {'peaks': peaks, 'no_peaks': no_peaks}

_add_row_stats(df_train, column_groups, row_stats)
_add_row_stats(df_test, column_groups, row_stats)

In [None]:
# Standardise every float column: the scaler is fitted on the training frame
# and the same fitted scaler is then applied to the test frame.
scaler = StandardScaler()

float_columns = list(df_train.select_dtypes('float').columns)

df_train[float_columns] = scaler.fit_transform(df_train[float_columns])
df_train = df_train.drop(columns = 'id')

# The test frame is rebuilt from the scaled array, so it keeps only the float columns.
df_test = pd.DataFrame(
    scaler.transform(df_test[float_columns]),
    columns = df_test[float_columns].columns,
)

In [None]:
# Separate the label from the predictors; the test frame is copied as the prediction input.
X = df_train.drop(columns = 'target')
y = df_train['target'].copy()
X_test = df_test.copy()

# Drop the references to the raw frames.
del train, test

# 70 / 30 split, stratified on the label, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 0,
    stratify = y,
)

In [None]:
# LightGBM datasets for the training and validation splits.
# NOTE(review): objective() below builds its own local Datasets under the same
# names, and a later cell reassigns both names before training, so these two
# objects look unused - confirm before removing.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference = lgb_train)
    

def objective(trial):
    """Train one LightGBM model with trial-sampled hyperparameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the regularisation, sampling, tree-shape and
        learning-rate parameters.

    Returns
    -------
    float
        Area under the ROC curve of the model's predictions on
        ``(X_valid, y_valid)``, taken at the best early-stopped iteration.
    """
    params = {
        # Without an explicit objective LightGBM falls back to its default
        # (regression); the label is scored with ROC AUC, so train a binary
        # classifier instead.
        'objective': 'binary',
        'metric': 'auc',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log = True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log = True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'feature_pre_filter': False,
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.02),
        'max_bin': trial.suggest_int('max_bin', 64, 255),
        'num_leaves': trial.suggest_int('num_leaves', 8, 32),
        'device': 'gpu',
    }

    # Datasets are rebuilt for every trial: max_bin is a dataset-level
    # parameter, so each sampled value needs a freshly constructed Dataset.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference = lgb_train)

    # Early stopping and periodic logging are passed as callbacks; the
    # equivalent keyword arguments are no longer accepted by lgb.train in
    # LightGBM 4.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round = 1000,
        valid_sets = [lgb_train, lgb_eval],
        callbacks = [
            lgb.early_stopping(stopping_rounds = 10),
            lgb.log_evaluation(period = 10),
        ],
    )

    y_pred_valid = model.predict(X_valid, num_iteration = model.best_iteration)

    false_positive_rate, true_positive_rate, _ = roc_curve(y_valid, y_pred_valid)
    return auc(false_positive_rate, true_positive_rate)

# Seeded random search over the objective's search space; the study keeps the
# trial with the highest returned score.
random_sampler = optuna.samplers.RandomSampler(seed = 0)
study = optuna.create_study(sampler = random_sampler, direction = 'maximize')
study.optimize(objective, n_trials = 30)

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Final parameters: the fixed settings plus every hyperparameter of the
# study's best trial (lambda_l1, lambda_l2, feature_fraction,
# bagging_fraction, bagging_freq, min_child_samples, learning_rate, max_bin,
# num_leaves).
params = {
    # Without an explicit objective LightGBM falls back to its default
    # (regression); the model is evaluated with AUC, so train a binary
    # classifier instead.
    'objective': 'binary',
    'metric': 'auc',
    'feature_pre_filter': False,
    'device': 'gpu',
    **study.best_params,
}

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference = lgb_train)

# Early stopping and periodic logging are passed as callbacks; the equivalent
# keyword arguments are no longer accepted by lgb.train in LightGBM 4.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round = 1000,
    valid_sets = [lgb_train, lgb_eval],
    callbacks = [
        lgb.early_stopping(stopping_rounds = 10),
        lgb.log_evaluation(period = 10),
    ],
)

# Predict the test set at the best early-stopped iteration.
y_pred = model.predict(X_test, num_iteration = model.best_iteration)

In [None]:
# Load the submission template, overwrite its target column with the test
# predictions, and write the result without the index column.
sub = pd.read_csv(
    "/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv"
).assign(target = y_pred)
sub.to_csv('submission_1st_trial.csv', index = False)

In [None]:
# First rows of the submission frame that was just written.
sub.head()