## TPS October 2021 - LightGBM base model

## Import libraries

In [None]:
%%time

import os
import logging
import sys
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd

from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

from lightgbm import LGBMClassifier

## Load datasets

In [None]:
%%time

# Name of the label column; defined first so every later cell can refer to it.
TARGET = "target"

# Input locations: the fold-annotated training data lives in its own dataset,
# while the test set and sample submission come from the competition data.
folds_dir = "../input/tps-october-2021-skfolds/"
data_dir = "../input/tabular-playground-series-oct-2021/"

# train_folds.csv carries a `kfold` column in addition to the features
# (it is excluded from the feature list and used for splitting later on).
train = pd.read_csv(f"{folds_dir}train_folds.csv")
test = pd.read_csv(f"{data_dir}test.csv")
submission = pd.read_csv(f"{data_dir}sample_submission.csv")

In [None]:
%%time

# Preview the first rows of the training data.
train.head()

In [None]:
%%time

# Preview the first rows of the test data.
test.head()

In [None]:
%%time

# Every column except the row id, the fold assignment and the label is a model input.
non_feature_cols = {"id", "kfold", TARGET}
features = [col for col in train.columns if col not in non_feature_cols]

print(f"All features {len(features)}:")
# One space-terminated name per feature, all on a single line (no trailing newline).
print("".join(f"{feat} " for feat in features), end="")

In [None]:
%%time

# A feature counts as categorical when its dtype name contains "int"
# (this matches signed and unsigned integer dtypes alike); everything else
# is treated as continuous. Both lists keep the order of `features`.
cat_features = [feat for feat in features if "int" in str(train[feat].dtype)]
cat_lookup = set(cat_features)
cont_features = [feat for feat in features if feat not in cat_lookup]

print(f"Categorical features ({len(cat_features)}):")
print("".join(f"{feat} " for feat in cat_features), end="")

print(f"\n\nContinuous features ({len(cont_features)}):")
print("".join(f"{feat} " for feat in cont_features), end="")

In [None]:
%%time

# Summarise dtypes and memory usage of both frames.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print("Train info:")
train.info()
print()

print("Test info:")
test.info()
print()

In [None]:
%%time

# Downcasting idea taken from:
# https://www.kaggle.com/hiro5299834/tps-oct-2021-single-lightgbm
#
# Shrink both frames: continuous columns become float32 and integer
# (categorical) columns become uint8.
# NOTE(review): uint8 only holds 0..255 — assumes every categorical column
# stays within that range; confirm against the data.
for frame in (train, test):
    frame[cont_features] = frame[cont_features].astype("float32")
    frame[cat_features] = frame[cat_features].astype("uint8")

In [None]:
%%time

# Re-check dtypes and memory usage of both frames after the downcast.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print("Train info:")
train.info()
print()

print("Test info:")
test.info()
print()

## Preprocessing

In [None]:
%%time

# Single-step preprocessing pipeline: robust scaling of every feature column.
pl = Pipeline(steps=[("scaler", RobustScaler())])

# The scaler statistics are learned from all training rows (before any fold
# split) and then applied unchanged to both the training and the test features.
pl.fit(train[features])
train[features] = pl.transform(train[features])
test[features] = pl.transform(test[features])

## Predict

In [None]:
%%time

def predict(train, test, folds=5):
    """Train one LightGBM classifier per fold and collect its predictions.

    Relies on the module-level ``features`` list and ``TARGET`` column name.

    Args:
        train: DataFrame containing ``id``, ``kfold``, the ``TARGET`` column
            and every column listed in ``features``.
        test: DataFrame containing at least every column listed in
            ``features``; any additional columns are ignored.
        folds: Number of folds. Fold labels ``0 .. folds - 1`` are looked up
            in ``train.kfold``.

    Returns:
        A tuple ``(test_preds, valid_preds, scores)`` where

        * ``test_preds`` is a 1-D array with the positive-class probability
          for each test row, averaged over the fold models;
        * ``valid_preds`` is a two-column DataFrame mapping each training
          ``id`` to its out-of-fold positive-class probability;
        * ``scores`` is the list of per-fold validation ROC AUC values.
    """
    test_preds = []
    valid_preds = {}
    scores = []

    params = {
        "random_state": 42,
        "n_estimators": 1000,
        "objective": "binary",
        "metric": "auc",
    }

    # The test matrix is the same for every fold, so build it once. Selecting
    # `features` explicitly (as is done for the train/valid splits) guarantees
    # the model sees the same columns in the same order, even when `test`
    # carries extra columns such as `id`.
    x_test = test[features]

    for fold in range(folds):
        x_train = train[train.kfold != fold].reset_index(drop=True)
        x_valid = train[train.kfold == fold].reset_index(drop=True)

        # Remember which rows were held out so predictions can be keyed by id.
        valid_ids = x_valid.id.values.tolist()

        y_train = x_train[TARGET]
        y_valid = x_valid[TARGET]

        x_train = x_train[features]
        x_valid = x_valid[features]

        model = LGBMClassifier(**params)
        # NOTE(review): newer LightGBM releases replace the
        # `early_stopping_rounds` / `verbose` fit() keywords with callbacks
        # (`early_stopping`, `log_evaluation`) — confirm against the installed
        # version before upgrading.
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="auc",
            early_stopping_rounds=150,
            verbose=1000
        )

        # Column 1 of predict_proba is the probability of the positive class.
        valid_pred = model.predict_proba(x_valid)[:, 1]
        test_pred = model.predict_proba(x_test)[:, 1]

        test_preds.append(test_pred)
        valid_preds.update(dict(zip(valid_ids, valid_pred)))

        score = roc_auc_score(y_valid, valid_pred)
        print(f"Fold {fold} | AUC: {score}")
        scores.append(score)

    # Average the per-fold test predictions row-wise.
    test_preds = np.mean(np.column_stack(test_preds), axis=1)
    # Turn the {id: prediction} mapping into a two-column frame (id, prediction).
    valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()

    return test_preds, valid_preds, scores

In [None]:
%%time

# Run the cross-validated training; only the feature columns of the test set
# are handed to the model.
test_preds, valid_preds, scores = predict(train=train, test=test[features])

# Report mean and standard deviation of the per-fold AUC scores.
cv_mean, cv_std = np.mean(scores), np.std(scores)
print(cv_mean, cv_std)

## Save

In [None]:
%%time

# Column name under which this model's predictions are stored.
pred_col = "lgb_pred_1"
# NOTE(review): assumes the sample submission lists ids in the same row order
# as the test set the predictions were made on — confirm.
test_ids = submission.id

# Out-of-fold predictions on the training set.
valid_preds.columns = ["id", pred_col]
valid_preds.to_csv("lgb_train_1.csv", index=False)

# Fold-averaged predictions on the test set, under the model-specific column name.
test_preds_df = pd.DataFrame({"id": test_ids, pred_col: test_preds})
test_preds_df.to_csv("lgb_test_1.csv", index=False)

# Same test predictions, written with the target column name for submission.
sub = pd.DataFrame({"id": test_ids, TARGET: test_preds})
sub.to_csv("submission.csv", index=False)