In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import pandas_profiling as pp
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from boruta import BorutaPy as boruta

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
import warnings
# Silence every warning for the rest of the session. No category is given, so
# this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Load the train and test CSVs of the "tabular-playground-series-mar-2021"
# dataset from the read-only Kaggle input directory.
train = pd.read_csv(r'../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv(r'../input/tabular-playground-series-mar-2021/test.csv')

In [None]:
# Relative frequency of each target value (quick class-balance check).
train['target'].value_counts(normalize=True)

## Modelling

In [None]:
# Categorical columns that are one-hot encoded by low_cardinal_transformer.
cat_col_nc = [
    'cat0', 'cat1', 'cat2', 'cat3', 'cat4',
    'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18',
]
# Categorical columns that are label-encoded and scaled by high_cardinal_transformer.
cat_col_hc = [
    'cat5', 'cat6', 'cat8', 'cat7', 'cat9', 'cat10',
]

In [None]:
def high_cardinal_transformer(cols, df):
    """Label-encode and then standard-scale the given columns of ``df`` in place.

    For every column a brand-new ``LabelEncoder`` and ``StandardScaler`` are
    fitted on that column of ``df`` itself; nothing fitted is kept or returned.

    NOTE(review): because the encoders are refitted on every call, calling this
    once on the training frame and once on the test frame produces two
    independent encodings/scalings; they only agree if both frames contain the
    same categories with the same distribution.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to transform.
    df : pandas.DataFrame
        Frame holding those columns; it is modified in place.

    Returns
    -------
    None
    """
    for col in cols:
        # Step 1: replace the raw categories by integer codes.
        codes = LabelEncoder().fit_transform(df[col])
        df[col] = codes
        # Step 2: standardise the integer codes (fitted on this frame only).
        scaled = StandardScaler().fit_transform(df[[col]])
        df[[col]] = scaled

In [None]:
# Label-encode and scale the cat_col_hc columns of `train` in place
# (the function returns nothing), then preview the result.
high_cardinal_transformer(cat_col_hc, train)
train.head()

In [None]:
def low_cardinal_transformer(cols, df):
    """One-hot encode the given columns and return the resulting frame.

    Each column ``c`` in ``cols`` is replaced by one indicator column per
    distinct value, named ``"<c>_<value>"``. The indicator columns are appended
    after the remaining original columns, in the order given by ``cols``.

    Compared with building the names via string concatenation, using the
    ``prefix`` argument of ``pd.get_dummies`` also works when the category
    values are not strings (e.g. integers). The frame is concatenated once
    instead of once per column.

    NOTE(review): the indicator columns depend on the values present in ``df``;
    two frames with different category sets yield different column sets.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to encode.
    df : pandas.DataFrame
        Source frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with ``cols`` replaced by their indicator columns. If
        ``cols`` is empty, ``df`` itself is returned unchanged.
    """
    cols = list(cols)
    if not cols:
        return df

    encoded = [pd.get_dummies(df[col], prefix=col) for col in cols]
    return pd.concat([df.drop(columns=cols)] + encoded, axis=1)

In [None]:
# Replace the cat_col_nc columns of `train` by one-hot indicator columns
# (the function returns a new frame, so `train` is rebound), then preview.
train = low_cardinal_transformer(cat_col_nc, train)
train.head()

In [None]:
def numerical_feat(df):
    """Add ratio features derived from the ``cont0`` .. ``cont10`` columns.

    Two families of columns are written onto ``df`` (in place):

    * ``rat_1`` .. ``rat_6``: ``1 / (a + b**2)`` for six column pairs ``(a, b)``.
    * ``poly_b_0`` .. ``poly_b_10``: ``contK / contK**2`` for every ``contK``.

    No guard against zero denominators is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cont0`` .. ``cont10``; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    # (a, b) pairs for rat_1 .. rat_6; the last pair reuses cont7 as `b`.
    ratio_pairs = [
        ('cont0', 'cont1'),
        ('cont2', 'cont3'),
        ('cont4', 'cont5'),
        ('cont6', 'cont7'),
        ('cont8', 'cont9'),
        ('cont10', 'cont7'),
    ]
    for number, (first, second) in enumerate(ratio_pairs, start=1):
        df[f'rat_{number}'] = 1/(df[first]+df[second]**2)

    for k in range(11):
        source = f'cont{k}'
        df[f'poly_b_{k}'] = df[source]/df[source]**2

    return df

In [None]:
# Append the rat_* and poly_b_* ratio features to `train`, then preview.
train = numerical_feat(train)
train.head()

In [None]:
# Hold out a validation split (default split size, fixed seed); `id` and
# `target` are excluded from the feature matrix.
x_train, x_val, y_train, y_val = train_test_split(
    train.drop(columns=['id', 'target']),
    train['target'],
    random_state=42,
)

In [None]:
# Baseline: LightGBM with default hyper-parameters, scored by ROC-AUC on the
# validation split.
lgb = LGBMClassifier().fit(x_train, y_train)
val_pred = lgb.predict_proba(x_val)
roc_auc_score(y_val, val_pred[:, 1])

In [None]:
# Full training data (features / labels) used by the cross-validated objective.
# NOTE(review): this contains the rows of x_val as well, so hyper-parameters
# tuned on it have already seen the validation split.
xtrain = train.drop(columns=['id', 'target'])
ytrain = train['target']

In [None]:
# Short aliases of the training split (not referenced again in the visible cells).
X, Y = x_train, y_train

# Starting LightGBM configuration; values found by the Optuna study are
# merged on top of it later.
BEST_PARAMS = dict(
    n_estimators=10000,
    learning_rate=0.05,
    metric='auc',
    colsample_bytree=0.235,
    max_depth=25,
    boosting_type='gbdt',
)

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold stratified CV ROC-AUC of an LGBMClassifier.

    Draws one LightGBM hyper-parameter configuration from ``trial``, fits a
    fresh classifier on every training fold of the module-level ``xtrain`` /
    ``ytrain`` and scores it on the corresponding held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean ROC-AUC over the five folds. Higher is better, so the study that
        runs this objective has to be created with ``direction='maximize'``.
    """
    # Lower bounds are kept strictly inside LightGBM's valid ranges:
    # num_leaves must be > 1 and learning_rate / subsample / sub_feature must
    # be > 0, so ranges starting at 1 or 0 could abort a trial.
    # `step` is passed by keyword; passing it positionally is deprecated in
    # recent Optuna releases.
    # NOTE(review): per the LightGBM parameter docs, 'min_child_samples' and
    # 'min_data_in_leaf' are aliases of one parameter, as are
    # 'colsample_bytree' and 'sub_feature'; only one of each pair can take
    # effect. Both names are kept so study.best_params has the same keys.
    # NOTE(review): 'subsample' presumably has no effect while the bagging
    # frequency is left at its default -- confirm against the LightGBM docs.
    _params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 15000),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6]),
        'subsample': trial.suggest_float('subsample', 0.05, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'metric': 'auc',
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100, step=10),
        'max_bin': trial.suggest_int('max_bin', 10, 300, step=10),
        'sub_feature': trial.suggest_float('sub_feature', 0.05, 1.0),
    }

    kf = StratifiedKFold(n_splits=5)
    roc_test = []
    for train_index, test_index in kf.split(xtrain, ytrain):
        # split() yields positional indices, so select rows with .iloc;
        # .loc only gives the same rows when the index is a default 0..n-1 range.
        x_train_fold, x_test_fold = xtrain.iloc[train_index], xtrain.iloc[test_index]
        y_train_fold, y_test_fold = ytrain.iloc[train_index], ytrain.iloc[test_index]

        model = LGBMClassifier(**_params)
        # No early stopping is requested in this call; the eval set is passed
        # only so LightGBM tracks the metric on the held-out fold.
        model.fit(x_train_fold, y_train_fold, eval_set=[(x_test_fold, y_test_fold)])

        proba = model.predict_proba(x_test_fold)[:,1]
        roc_test.append(roc_auc_score(y_test_fold, proba))

    mean_auc = float(np.mean(roc_test))
    print(mean_auc)
    return mean_auc

In [None]:
# The objective returns a ROC-AUC (higher is better). create_study() minimises
# by default, which would steer the search towards the worst configuration,
# so the direction is set explicitly.
study = optuna.create_study(direction='maximize')
# Run trials for up to one hour, using all available cores for parallel trials.
study.optimize(objective, timeout=3600*1, n_jobs=-1)

In [None]:
# Best parameters
# Overlay the values found by the study on the starting configuration; keys the
# objective does not sample ('metric', 'boosting_type') keep their BEST_PARAMS values.
BEST_PARAMS.update(study.best_params)
BEST_PARAMS

{'n_estimators': 10307,
 'learning_rate': 0.033547705256212215,
 'metric': 'auc',
 'colsample_bytree': 0.3,
 'max_depth': 10,
 'boosting_type': 'gbdt',
 'reg_alpha': 0.023390124696592733,
 'reg_lambda': 5.2053154587570864e-05,
 'subsample': 0.10734476335915966,
 'num_leaves': 626,
 'min_child_samples': 264,
 'min_data_in_leaf': 100,
 'max_bin': 60,
 'sub_feature': 0.3527999518536946}

In [None]:
# Hard-coded copy of a tuned configuration, kept as a literal.
# NOTE(review): nothing in the visible cells reads `final_params`; the model
# below is built from BEST_PARAMS instead.
final_params = {
    'n_estimators': 10307,
    'learning_rate': 0.033547705256212215,
    'metric': 'auc',
    'colsample_bytree': 0.3,
    'max_depth': 10,
    'boosting_type': 'gbdt',
    'reg_alpha': 0.023390124696592733,
    'reg_lambda': 5.2053154587570864e-05,
    'subsample': 0.10734476335915966,
    'num_leaves': 626,
    'min_child_samples': 264,
    'min_data_in_leaf': 100,
    'max_bin': 60,
    'sub_feature': 0.3527999518536946,
}

In [None]:
# Fit the tuned configuration on the training split and report its ROC-AUC on
# the validation split.
lg = LGBMClassifier(**BEST_PARAMS).fit(x_train, y_train)
val_pred = lg.predict_proba(x_val)
roc_auc_score(y_val, val_pred[:, 1])

## Generating Predictions

In [None]:
# Apply the same three preprocessing steps to the test frame, in the same order
# as for `train`.
# NOTE(review): high_cardinal_transformer fits new encoders/scalers on `test`,
# and low_cardinal_transformer derives its indicator columns from the values
# present in `test`; confirm that encodings and column sets match the training
# frame before predicting.
high_cardinal_transformer(cat_col_hc, test)
test = low_cardinal_transformer(cat_col_nc, test)
test = numerical_feat(test)
test.head()

In [None]:
# Set the ids aside for the submission file and score all remaining columns
# with the tuned model.
idx = test['id']
test_pred = lg.predict_proba(test.drop(columns=['id']))

In [None]:
# Assemble the submission: one row per test id with the predicted probability
# taken from column 1 of predict_proba.
df = pd.DataFrame({'id': idx.values, 'target': test_pred[:, 1]})
df.shape

In [None]:
# Preview the first rows of the submission frame.
df.head()

In [None]:
# Write the submission to the working directory, without the index column.
df.to_csv('submission3.csv', index=False)