# Setup

In [None]:
import time
import numpy as np
import pandas as pd
# Raise the display limits so wide/long DataFrames are shown with up to 999
# columns and 999 rows instead of being truncated.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
# Seed passed as `random_state` to the sampling and model calls further down.
RS = 335577

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this to show several shapes, or a label string
# followed by a score, from a single cell.
InteractiveShell.ast_node_interactivity = 'all'

# Load and explore Train / Test Data

In [None]:
# Directory holding the competition CSVs on Kaggle; swap in the commented
# alternative below to read from a local folder instead.
data_path = '/kaggle/input/tabular-playground-series-may-2022/'
# data_path = 'data/'

# Read both files the same way, using their 'id' column as the index.
df_train_file, df_test_file = (
    pd.read_csv(data_path + csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Show the first three rows and the shape of each raw frame.
# All four expressions are rendered (not just the last) because the setup cell
# sets InteractiveShell.ast_node_interactivity = 'all'.
df_train_file.head(3)
df_train_file.shape

df_test_file.head(3)
df_test_file.shape

# Feature Engineering Functions

In [None]:
# from previously shared notebooks by other Kagglers
def all_feature_eng(df):
    """Return a new frame with engineered features; `df` itself is not modified.

    Added columns, in this order after the existing ones:
      * ``ch0`` .. ``ch9``: for each of the first ten positions of the string
        column ``f_27``, the character's code point minus ``ord('A')``.
      * ``unique_characters``: number of distinct characters in ``f_27``.
      * ``i_02_21``, ``i_05_22``, ``i_00_01_26``: three-valued flags
        (+1 / 0 / -1) that are +1 when the sum of the named columns is above
        an upper threshold, -1 when it is below a lower threshold, else 0.

    Args:
        df: DataFrame containing ``f_27`` (strings; must have at least ten
            characters or ``ord`` fails on the missing position) and the
            numeric columns ``f_00``, ``f_01``, ``f_02``, ``f_05``, ``f_21``,
            ``f_22`` and ``f_26``. Any other columns are passed through.

    Returns:
        A new DataFrame with the columns above appended and ``f_27`` removed.
    """
    # Work on a copy: the original version wrote the new columns straight into
    # the caller's frame, silently changing the raw data it was handed.
    out = df.copy()
    text = out.f_27

    # One integer column per character position (0 for 'A', 1 for 'B', ...).
    for i in range(10):
        out[f'ch{i}'] = text.str.get(i).apply(ord) - ord('A')

    out["unique_characters"] = text.apply(lambda s: len(set(s)))

    # Thresholds are carried over unchanged from the notebooks credited above.
    sum_02_21 = out.f_21 + out.f_02
    out['i_02_21'] = (sum_02_21 > 5.2).astype(int) - (sum_02_21 < -5.3).astype(int)

    sum_05_22 = out.f_22 + out.f_05
    out['i_05_22'] = (sum_05_22 > 5.1).astype(int) - (sum_05_22 < -5.4).astype(int)

    sum_00_01_26 = out.f_00 + out.f_01 + out.f_26
    out['i_00_01_26'] = (sum_00_01_26 > 5.0).astype(int) - (sum_00_01_26 < -5.0).astype(int)

    return out.drop(['f_27'], axis=1)

## Training the LGBM model on only 5% of the data!

In [None]:
# Train on a seeded 5% sample; every remaining row becomes validation data.
df_train = df_train_file.sample(frac=0.05, random_state=RS).copy()
df_val = df_train_file.drop(index=df_train.index).copy()

# Run both splits through the same feature pipeline.
df_train_fe, df_val_fe = all_feature_eng(df_train), all_feature_eng(df_val)

df_train_fe.shape
df_val_fe.shape

In [None]:
# Separate the label from the features for the training sample ...
y_train = df_train_fe['target']
X_train = df_train_fe.drop(columns='target')

# ... and for the held-out rows (named *_test here, but taken from train.csv).
y_test = df_val_fe['target']
X_test = df_val_fe.drop(columns='target')

model = LGBMClassifier(n_estimators=500, random_state=RS)
_ = model.fit(X_train, y_train)

# Keep the second predict_proba column (presumably P(target == 1) - confirm
# against model.classes_).
y_pred_prob = model.predict_proba(X_test)[:, 1]

'auc score, trained on 5% training data, validated on 95% training data'
roc_auc_score(y_test, y_pred_prob)

## Training the LGBM model on only 1% of the data!

In [None]:
# Train on a seeded 1% sample; every remaining row becomes validation data.
df_train = df_train_file.sample(frac=0.01, random_state=RS).copy()
df_val = df_train_file.drop(index=df_train.index).copy()

# Run both splits through the same feature pipeline.
df_train_fe, df_val_fe = all_feature_eng(df_train), all_feature_eng(df_val)

df_train_fe.shape
df_val_fe.shape

In [None]:
# Separate the label from the features for the training sample ...
y_train = df_train_fe['target']
X_train = df_train_fe.drop(columns='target')

# ... and for the held-out rows (named *_test here, but taken from train.csv).
y_test = df_val_fe['target']
X_test = df_val_fe.drop(columns='target')

# Rebinds `model`, replacing the estimator fitted in the 5% section.
model = LGBMClassifier(n_estimators=500, random_state=RS)
_ = model.fit(X_train, y_train)

# Keep the second predict_proba column (presumably P(target == 1) - confirm
# against model.classes_).
y_pred_prob = model.predict_proba(X_test)[:, 1]

'auc score, trained on 1% training data (9000 rows), validated on 99% training data'
roc_auc_score(y_test, y_pred_prob)

# Predict and submit

In [None]:
# A later cell stores the predictions in df_test_file['target']. Dropping that
# column here (ignored when it is absent) keeps this cell re-runnable: without
# it, a second run would pass 'target' to the model as an extra feature.
# drop() also returns a new frame, so the raw test frame is not the object
# handed to the feature-engineering function.
X_submit = all_feature_eng(df_test_file.drop(columns='target', errors='ignore'))
X_submit.shape

In [None]:
# `model` is whichever estimator was fitted most recently; in the cell order
# shown in this notebook that is the one trained on the 1% sample.
# The second predict_proba column is kept as the submitted score.
y_pred_prob = model.predict_proba(X_submit)[:, 1]

In [None]:
# Store the predictions on the test frame so they share its 'id' index.
# NOTE: this modifies df_test_file in place; from here on the frame that the
# feature-engineering cell reads also carries a 'target' column.
df_test_file['target'] = y_pred_prob

In [None]:
# Write only the index ('id') and the predicted 'target' column.
df_test_file.to_csv('submission.csv', columns=['target'])