# Libraries
---

In [None]:
import datatable as dt

import pandas as pd
import numpy as np
import random
import time
import os
import gc

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score


import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

**If you come across this notebook, please upvote it :) It's free.**

# Parameters
---

In [None]:
# Run-level settings, gathered in one place so they are easy to tune.
SEED = 42                    # shared seed: RNG seeding and LightGBM's random_state
N_SPLITS = 5                 # nominal fold count (the CV cells visible here set their own n_splits)
N_ESTIMATORS = 20_000        # value of LightGBM's 'n_estimators'
EARLY_STOPPING_ROUNDS = 50   # forwarded to fit(early_stopping_rounds=...)
VERBOSE = 1_000              # forwarded to fit(verbose=...)

In [None]:
def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Sets ``PYTHONHASHSEED`` in ``os.environ`` and seeds both the standard
    library ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed applied to every generator.

    Returns:
        None.
    """
    # Only processes started after this point see the new PYTHONHASHSEED;
    # hash randomisation of the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    
# Seed every RNG once, before any data is loaded or split.
seed_everything(seed=SEED)

# Datasets
---

In [None]:
def _read_without_first_column(csv_path):
    """Read ``csv_path`` with datatable and return it as a pandas frame minus its first column.

    The first column is presumably the row id -- confirm against the CSV header.
    """
    frame = dt.fread(csv_path).to_pandas()
    return frame[frame.columns[1:]]


TARGET = 'target'

train = _read_without_first_column('../input/tabular-playground-series-oct-2021/train.csv')
test = _read_without_first_column('../input/tabular-playground-series-oct-2021/test.csv')

# Store the label compactly as an unsigned 8-bit integer.
train[TARGET] = train[TARGET].astype('uint8')

In [None]:
# Feature columns are those whose name contains the letter 'f'.
features = [name for name in train.columns if 'f' in name]

# float64 columns are treated as continuous; everything else as discrete.
is_float64 = {name: train[name].dtype == 'float64' for name in features}
cont_features = [name for name in features if is_float64[name]]
disc_features = [name for name in features if not is_float64[name]]

In [None]:
# Shrink the memory footprint: float32 for continuous columns, uint8 for
# discrete ones, applied the same way to both frames.
for frame in (train, test):
    frame[cont_features] = frame[cont_features].astype('float32')
    frame[disc_features] = frame[disc_features].astype('uint8')

In [None]:
# Split the label off so that `train` holds predictor columns only.
target_train = train.loc[:, 'target']
train = train.drop(columns=['target'])

In [None]:
# Row-wise sum of the discrete columns, leaving out f22 and f43.
cols = list(disc_features)
for excluded in ('f22', 'f43'):
    cols.remove(excluded)

for frame in (train, test):
    frame['disc_sum'] = frame[cols].sum(axis=1)

# The new column is itself treated as a discrete feature.
disc_features.append('disc_sum')

In [None]:
# Binary "above the mean" indicator for every continuous feature.
cols_ovr = [f'{col}_ovr' for col in cont_features]

# Thresholds come from the training data only and are reused for the test set,
# so a given feature value maps to the same indicator in both frames.
# (Thresholding test against its own means would make the encoding differ
# between train and test.)
cont_means = train[cont_features].mean()
train[cols_ovr] = (train[cont_features] > cont_means).astype('uint8')
test[cols_ovr] = (test[cont_features] > cont_means).astype('uint8')

# Register the indicators as discrete features; skip names already present so
# re-running this cell does not append duplicates.
disc_features += [col for col in cols_ovr if col not in disc_features]

In [None]:
# Final feature list: discrete columns first, then continuous ones.
features = [*disc_features, *cont_features]

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
train.info()
display(train[features].head())

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
test.info()
display(test[features].head())

# PCA

We used PCA to make the data more compact and avoid running out of memory. However, it lowered the model's accuracy, so the code below is commented out.

In [None]:
# Disabled experiment: the PCA code below is wrapped in a bare string literal,
# so it is never executed. If re-enabled it would fit a 400-component PCA on
# `train`, project both `train` and `test`, and expose the results as `x_pca`
# (a DataFrame) and `test_pca`.
'''from sklearn.decomposition import PCA
pca = PCA(n_components=400, random_state=1)
pca.fit(train)
x_pca = pca.transform(train)
test_pca = pca.transform(test)
x_pca = pd.DataFrame(x_pca)
x_pca.head()'''

# Logistic Regression

In this section we use a simple logistic regression paired with k-fold cross-validation. The latter is not really necessary since we have enough data, but it is nice to experiment with.

In [None]:
# Disabled experiment: stratified 2-fold logistic regression, kept as a bare
# string literal so it is never executed.
# NOTE(review): it reads `x_pca` and `test_pca`, which are only created by the
# PCA cell that is itself disabled -- both must be re-enabled together.
'''from sklearn.linear_model import LogisticRegression
n_splits = 2
kf = StratifiedKFold(n_splits=n_splits,random_state=1,shuffle=True)

lr = LogisticRegression(n_jobs = -1, random_state = 42, C = 5, max_iter = 2000)
y_test_pred = np.zeros(test_pca.shape[0])
print(y_test_pred.shape)
for i, (train_idx, test_idx) in enumerate(kf.split(x_pca, target_train)):
    
    x_train = x_pca.iloc[train_idx]
    x_val = x_pca.iloc[test_idx]
    
    y_train = target_train.iloc[train_idx]
    y_val = target_train.iloc[test_idx]
    lr.fit(x_train, y_train)
    
    y_pred = lr.predict_proba(x_val)[:, 1]
    y_test_pred += lr.predict_proba(test_pca)[:, 1]/n_splits

    auc = roc_auc_score(y_val, y_pred)
    print('Fold', i, 'AUC :', auc)'''

# LightGBM
---

In [None]:
# Hyper-parameters unpacked into lgb.LGBMClassifier(**lgb_params).
#
# 'categorical_feature' is intentionally not set here. It used to be
# len(disc_features), but LightGBM expects a list of column indices or names
# for that option, not a count, so the integer could at best be read as one
# arbitrary column index. The discrete columns are small integer indicator or
# count features that split fine as numeric inputs; if categorical handling is
# wanted, pass the column names via fit(categorical_feature=[...]).
lgb_params = {
    'objective': 'binary',
    'n_estimators': N_ESTIMATORS,
    'random_state': SEED,
    'learning_rate': 8e-3,
    # Row and column subsampling.
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    # Regularisation.
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 20,
}

## Cross validation

This is commented out because we have enough data and don't benefit much from k-fold cross-validation.

In [None]:
# Disabled experiment: stratified 2-fold LightGBM cross-validation, kept as a
# bare string literal so it is never executed. It would collect out-of-fold
# predictions in `lgb_oof`, fold-averaged test predictions in `lgb_pred`, and
# per-fold feature importances in `lgb_importances`.
# NOTE(review): before re-enabling, the final score line reads train[TARGET],
# but the target column is dropped from `train` in an earlier cell -- use
# `target_train` there. DataFrame.append is also unavailable in pandas >= 2.0.
'''lgb_oof = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])
lgb_importances = pd.DataFrame()

x_pca = train
test_pca =  test

n_splits = 2

kf = StratifiedKFold(n_splits=n_splits,random_state=1,shuffle=True)

for i, (train_idx, test_idx) in enumerate(kf.split(x_pca, target_train)):
    print(f"===== fold {i} =====")
    x_train = x_pca.iloc[train_idx]
    x_val = x_pca.iloc[test_idx]
    
    y_train = target_train.iloc[train_idx]
    y_val = target_train.iloc[test_idx]
    
    start = time.time()
    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(
        x_train, 
        y_train,
        eval_set=[(x_val, y_val)],
        eval_metric='auc',
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=VERBOSE,
    )
    
    fi_tmp = pd.DataFrame()
    fi_tmp['feature'] = model.feature_name_
    fi_tmp['importance'] = model.feature_importances_
    fi_tmp['fold'] = i
    fi_tmp['seed'] = SEED
    lgb_importances = lgb_importances.append(fi_tmp)

    lgb_oof[test_idx] = model.predict_proba(x_val)[:, -1]
    lgb_pred += model.predict_proba(test_pca)[:, -1]/n_splits

    elapsed = time.time() - start
    auc = roc_auc_score(y_val, lgb_oof[test_idx])
    print(f"fold {i} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

print(f"oof lgb roc = {roc_auc_score(train[TARGET], lgb_oof)}")

np.save("lgb_oof.npy", lgb_oof)
np.save("lgb_pred.npy", lgb_pred)'''

In [None]:


# Single hold-out run: 80% of the rows train the model, the remaining 20% drive
# early stopping and give the validation AUC.
x_train, x_val, y_train, y_val = train_test_split(
    train, target_train, test_size=0.2, random_state=SEED)

start = time.time()
model = lgb.LGBMClassifier(**lgb_params)
# NOTE(review): the early_stopping_rounds / verbose fit arguments were removed
# in LightGBM 4.0; switch to callbacks=[...] when upgrading the library.
model.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val)],
    eval_metric='auc',
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose=VERBOSE,
)

# Build the importance table in one go. DataFrame.append was removed in
# pandas 2.0, and there is only a single run to record here anyway.
lgb_importances = pd.DataFrame({
    'feature': model.feature_name_,
    'importance': model.feature_importances_,
    'seed': SEED,
})

# Probability of the last class column (the positive class for a binary target).
# `lgb_oof` holds the hold-out predictions, `lgb_pred` the test predictions.
lgb_oof = model.predict_proba(x_val)[:, -1]
lgb_pred = model.predict_proba(test)[:, -1]

elapsed = time.time() - start
auc = roc_auc_score(y_val, lgb_oof)
print(f"lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

# Same hold-out score at full precision (computed once, printed twice).
print(f"oof lgb roc = {auc}")

np.save("lgb_oof.npy", lgb_oof)
np.save("lgb_pred.npy", lgb_pred)

# Submission

In [None]:
# Put the test predictions into the sample submission's target column and
# write the result as the submission file.
submission = (
    pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
    .assign(**{TARGET: lgb_pred})
)
submission.to_csv("submission.csv", index=False)

## Feature importance

In [None]:
# Disabled plot: kept as a bare string literal so it is never executed. It would
# average `lgb_importances` per feature and draw a horizontal bar chart ordered
# by descending mean importance.
'''order = list(lgb_importances.groupby('feature').mean().sort_values('importance', ascending=False).index)

fig = plt.figure(figsize=(16, 32), tight_layout=True)
sns.barplot(x="importance", y="feature", data=lgb_importances.groupby('feature').mean().reset_index(), order=order)
plt.title("LightGBM feature importances")'''