# Read Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
import random

# One seed for the whole notebook: it is applied to Python's and NumPy's
# global RNGs here and reused by later cells that take a seed/random_state.
SEED = 44
random.seed(SEED)
np.random.seed(SEED)

# Do not truncate DataFrame output by width or column count.
# The option name is written out in full ('display.max_columns'); the shorter
# 'display.max_column' only worked through pandas' pattern matching of option
# names, which can break as soon as another option matches the same pattern.
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# NOTE: this silences *every* warning for the rest of the session, including
# pandas and library deprecation warnings.
warnings.filterwarnings('ignore')

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-mar-2021/train.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/test.csv',
    )
)

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# The last column of `train` is used as the target; every column before it is
# a model input. `test` is the frame predictions are produced for.
#
# Explicit copies are taken because later cells overwrite columns of X and Z
# and drop a column from them. Without the copies X is a slice of `train`
# (chained-assignment territory) and Z is the very same object as `test`, so
# those later edits would leak into / depend on the raw frames.
y = train.iloc[:, -1].copy()
X = train.iloc[:, :-1].copy()
Z = test.copy()

# Categorical

In [None]:
def get_obj_cols(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in the frame's column order, whose dtype compares equal
        to ``object``. Empty when there is no such column.
    """
    # `np.object` was only an alias of the builtin `object`; the alias was
    # deprecated in NumPy 1.20 and removed in 1.24, where accessing it raises
    # AttributeError. Comparing against the builtin works on every version.
    return [col for col, dtype in df.dtypes.items() if dtype == object]

In [None]:
# Names of the object-typed (categorical) columns in the train and test frames.
X_objs = get_obj_cols(X)
Z_objs = get_obj_cols(Z)

# Positional indices of those columns. They are later handed to LightGBM as
# `categorical_feature`, so they must describe the column layout the models
# are actually fitted on. The 'id' column is dropped before fitting, which
# shifts every column located after it one position to the left; computing the
# positions on the frame that still contains 'id' would therefore mark the
# wrong columns as categorical. `errors='ignore'` keeps this correct when 'id'
# has already been removed (e.g. on a re-run).
X_feature_cols = X.columns.drop('id', errors='ignore')
Z_feature_cols = Z.columns.drop('id', errors='ignore')
X_objs_idx = [X_feature_cols.get_loc(col) for col in X_objs if col in X_feature_cols]
Z_objs_idx = [Z_feature_cols.get_loc(col) for col in Z_objs if col in Z_feature_cols]

In [None]:
# Replace each categorical column by integer codes.
#
# Train and test must share one vocabulary per column: encoding each frame on
# its own assigns codes by the sorted levels present in *that* frame, so a
# level missing from either side shifts the codes and the same string ends up
# with different integers in X and Z. When both frames contain exactly the
# same levels the result is identical to encoding them separately.
for obj in X_objs:
    if obj in Z_objs:
        levels = (
            pd.concat([X[obj], Z[obj]], ignore_index=True)
            .astype('category')
            .cat.categories
        )
        shared_dtype = pd.CategoricalDtype(categories=levels)
        X[obj] = X[obj].astype(shared_dtype).cat.codes
        Z[obj] = Z[obj].astype(shared_dtype).cat.codes
    else:
        # Column only categorical in train: nothing to align with.
        X[obj] = X[obj].astype('category').cat.codes

# Columns that are categorical in test only keep the per-frame encoding.
for obj in Z_objs:
    if obj not in X_objs:
        Z[obj] = Z[obj].astype('category').cat.codes

In [None]:
# Exclude the 'id' column from the model inputs.
# Rebinding the names (instead of inplace=True) leaves the frames X and Z were
# derived from untouched, and errors='ignore' makes the cell safe to run more
# than once; the in-place version raised KeyError on a second execution.
X = X.drop(columns='id', errors='ignore')
Z = Z.drop(columns='id', errors='ignore')

# Training

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
# Number of cross-validation folds; also the number of fold models that are
# trained, stored in df_model and averaged by predict().
K = 10

In [None]:
# Shuffled, stratified K-fold splitter; seeded so the folds are reproducible.
skf = StratifiedKFold(
    n_splits=K,
    random_state=SEED,
    shuffle=True,
)

In [None]:
# Empty results table: one column per fold model (model_0 .. model_{K-1})
# plus a final column for the mean validation AUC of a run.
model_columns = [f'model_{i}' for i in range(K)]
df_model = pd.DataFrame(columns=model_columns + ['average_auc'])
df_model

In [None]:
import lightgbm as lgbm

In [None]:
# Keyword arguments passed unchanged to every lgbm.LGBMClassifier created in
# the cross-validation loop.
params = dict(
    learning_rate=1e-2,
    max_bin=24,
    n_estimators=1000,
    device_type='gpu',
    metric='auc',
    extra_trees=True,
    seed=SEED,
    boosting_type='goss',
    num_leaves=128,
    is_unbalance=True,
    # Positional column indices computed in an earlier cell.
    # NOTE(review): they must match the column order of X at fit time - confirm.
    categorical_feature=X_objs_idx,
)

In [None]:
# Train one LightGBM classifier per stratified fold and record its validation
# AUC. `model` ends up holding the K fitted classifiers in fold order and
# `auc_score` the matching K scores.
model = []
auc_score = []
for train_idx, val_idx in skf.split(X, y):
    # skf.split yields *positional* row indices, so rows are selected with
    # .iloc. Label-based .loc only gives the same rows while X and y carry a
    # default 0..n-1 index and silently picks wrong rows (or raises) otherwise.
    Xt, Xv = X.iloc[train_idx], X.iloc[val_idx]
    yt, yv = y.iloc[train_idx], y.iloc[val_idx]

    fold_model = lgbm.LGBMClassifier(**params)
    fold_model.fit(Xt, yt)

    # Column 1 of predict_proba is scored against the held-out fold.
    yv_prob = fold_model.predict_proba(Xv)[:, 1]
    auc_score.append(roc_auc_score(yv, yv_prob))

    # Appended only after fitting and scoring succeeded, so `model` never
    # contains an unfitted estimator and stays aligned with `auc_score`.
    model.append(fold_model)

In [None]:
# Append one row to the results table: the fitted fold models followed by the
# mean of their validation AUCs. The new row's label is the current row count.
df_model.loc[len(df_model)] = list(model) + [np.mean(auc_score)]

In [None]:
# Display the mean cross-validation AUC recorded for each stored run.
df_model['average_auc']

In [None]:
def predict(X):
    """Average the fold models' predicted probabilities for ``X``.

    Reads the K models stored in row 0 of the module-level ``df_model``
    (columns ``model_0`` .. ``model_{K-1}``), takes column 1 of each model's
    ``predict_proba(X)`` and returns the element-wise mean across models.

    Parameters
    ----------
    X : feature matrix accepted by the stored models' ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        One averaged probability per row of ``X``.
    """
    fold_models = (df_model.loc[0, f'model_{i}'] for i in range(K))
    fold_probs = [fitted.predict_proba(X)[:, 1] for fitted in fold_models]
    return np.mean(np.array(fold_probs), axis=0)

def metrics(y_true, y_prob):
    """Print the ROC AUC of the scores ``y_prob`` against the labels ``y_true``.

    Nothing is returned; the score is only written to standard output.
    """
    auc = roc_auc_score(y_true, y_prob)
    print('AUC Score : ', auc)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Set aside 20% of the rows of X / y as a validation split (seeded).
Xt, Xv, yt, yv = train_test_split(
    X,
    y,
    random_state=SEED,
    test_size=0.2,
)

In [None]:
# Score the averaged fold models on the 20% split taken from X.
# NOTE(review): the fold models were fitted on folds of this same X, so every
# row of Xv was in the training data of K-1 of the K models. The AUC printed
# here is therefore optimistic; the mean of `auc_score` from the CV loop is the
# out-of-fold estimate.
yv_prob = predict(Xv)
metrics(yv, yv_prob)

In [None]:
# Averaged fold-model probabilities for the test rows.
test_prob = predict(Z)

# Read the submission template from the same absolute input directory the
# train/test CSVs are loaded from, instead of a path relative to the current
# working directory.
submission = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')

# Item assignment always writes the column. Attribute assignment
# (`submission.target = ...`) only does so when the column already exists;
# otherwise it sets a plain Python attribute and the CSV would be written
# without the predictions - and the pandas warning about that is suppressed
# by the global warnings filter.
submission['target'] = test_prob
submission.to_csv("sample_subs.csv", index=False)