In [None]:
!pip -q install -U lightautoml

# Step 0.1. Import libraries

Here we will import the libraries we use in this kernel:
- Standard python libraries for timing, working with OS etc.
- Essential python DS libraries like numpy, pandas, scikit-learn and torch (the latter is used below, in Step 0.3, to limit the number of threads)
- LightAutoML modules: presets for AutoML, task and report generation module

In [None]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

# Step 0.2. Constants

Here we set up the constants to use in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `TEST_SIZE` - holdout data part size
- `TIMEOUT` - limit in seconds for model to train
- `TARGET_NAME` - target column name in dataset

In [None]:
# Global configuration shared by every cell below.
TARGET_NAME = 'claim'   # name of the target column in the dataset
RANDOM_STATE = 42       # seed used wherever a random_state is accepted
N_THREADS = 4           # number of vCPUs for model creation
N_FOLDS = 5             # number of folds in the LightAutoML inner CV
TEST_SIZE = 0.2         # holdout data part size
TIMEOUT = 14 * 3600     # training time limit in seconds (14 hours)

# Step 0.3. Imported models setup

For better reproducibility we fix the numpy random seed and limit the maximum number of threads for Torch (which by default tries to use all the threads available on the server):

In [None]:
# Cap the number of threads Torch may use, then seed numpy's global RNG.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

# Step 0.4. Data loading
Let's check the data we have:

In [None]:
%%time

train_data = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
train_data.head()

In [None]:
# Read the competition test set and preview its first rows.
test_data = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-sep-2021/test.csv'
)
test_data.head(5)

In [None]:
# Read the sample submission; it is later filled with the model predictions.
samp_sub = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-sep-2021/sample_solution.csv'
)
samp_sub.head(5)

In [None]:
# Raw feature columns used for the row-wise statistics below.
# 'id' is a row identifier (it is listed under 'drop' in the LightAutoML roles),
# not a feature, so it must not leak into per-row mean/var/quantile values.
features = test_data.columns.drop('id', errors='ignore')

In [None]:
# Row-wise summary statistics over the columns listed in `features`.
# Each entry is (new column name, function mapping a frame to a per-row Series).
# Row-wise std and IQR are currently disabled and therefore not listed here.
row_statistics = [
    ('n_missing', lambda block: block.isna().sum(axis=1)),
    ('mean', lambda block: block.mean(axis=1)),
    ('median', lambda block: block.median(axis=1)),
    ('var', lambda block: block.var(axis=1)),
    ('skew', lambda block: block.skew(axis=1)),
    ('kurtosis', lambda block: block.kurtosis(axis=1)),
    ('10%', lambda block: block.quantile(q=0.1, axis=1)),
    ('25%', lambda block: block.quantile(q=0.25, axis=1)),
    ('75%', lambda block: block.quantile(q=0.75, axis=1)),
    ('90%', lambda block: block.quantile(q=0.9, axis=1)),
]


def _add_row_statistics(frame):
    """Append every statistic from `row_statistics` to `frame` in place."""
    # Select the source columns once; newly added statistic columns are not
    # part of `features`, so they never feed into later statistics.
    raw_block = frame[features]
    for column_name, compute in row_statistics:
        frame[column_name] = compute(raw_block)


# Apply the same statistics, in the same order, to train and test.
_add_row_statistics(train_data)
_add_row_statistics(test_data)

# Step 0.6. Add OOFs and Test predictions from AutoWoE models

In [None]:
from autowoe import AutoWoE
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata

def get_oof_and_test_pred(tr, real_te, n_splits=3, subsample_size=200000, random_state=None):
    """Build out-of-fold (OOF) and test predictions from per-fold AutoWoE models.

    For every stratified fold an AutoWoE model is fitted on a stratified
    subsample of the fold's training part, then used to predict the fold's
    validation part (stored as OOF predictions) and the whole test frame
    (averaged over all folds).

    Args:
        tr: Training DataFrame; must contain the `TARGET_NAME` and 'id' columns.
        real_te: Test DataFrame to predict on.
        n_splits: Number of stratified CV folds.
        subsample_size: Number of rows of each fold's training part used to
            fit AutoWoE. If the training part is not larger than this, it is
            used as is.
        random_state: Seed for the fold split and the subsampling. Defaults to
            the notebook-level `RANDOM_STATE`.

    Returns:
        Tuple `(oof_preds_woe, real_test_preds_woe)` of numpy arrays with
        lengths `len(tr)` and `len(real_te)` respectively.
    """
    if random_state is None:
        random_state = RANDOM_STATE

    # shuffle=True is required for random_state to have any effect; recent
    # scikit-learn versions raise ValueError when a seed is passed without it.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_preds_woe = np.zeros(len(tr))
    real_test_preds_woe = np.zeros(len(real_te))

    y = tr[TARGET_NAME].values

    # Only the length of the first argument matters for the split; the
    # stratification is driven by the second one.
    for fold, (train_idx, val_idx) in enumerate(skf.split(y, y)):

        X_tr, X_val = tr.iloc[train_idx, :], tr.iloc[val_idx, :]

        auto_woe = AutoWoE(monotonic=False,
                           vif_th=20.,
                           imp_th=0,
                           th_const=32,
                           force_single_split=True,
                           min_bin_size=0.005,
                           oof_woe=True,
                           n_folds=5,
                           n_jobs=N_THREADS,
                           regularized_refit=True,
                           verbose=0)

        # Fit on a stratified subsample of the fold's training part. Skip the
        # subsampling when the fold is too small for the requested size, which
        # would otherwise make train_test_split fail.
        if len(X_tr) > subsample_size:
            _, X_tr = train_test_split(X_tr,
                                       test_size=subsample_size,
                                       random_state=random_state,
                                       stratify=X_tr[TARGET_NAME].values)
        print(X_tr.shape)
        auto_woe.fit(X_tr.drop('id', axis=1),
                     target_name=TARGET_NAME)

        val_pred = auto_woe.predict_proba(X_val)
        print("FOLD {}, AUC_SCORE = {:.5f}".format(fold, roc_auc_score(X_val[TARGET_NAME], val_pred)))

        oof_preds_woe[val_idx] = val_pred
        # Average over the folds actually run (not the LightAutoML N_FOLDS),
        # so test predictions stay on the same scale as the OOF predictions.
        real_test_preds_woe += auto_woe.predict_proba(real_te) / n_splits

    print("AUC_SCORE TRAIN = {:.5f}".format(roc_auc_score(y, oof_preds_woe)))

    return oof_preds_woe, real_test_preds_woe

In [None]:
# First AutoWoE pass: OOF predictions for train, fold-averaged predictions for test.
oof_preds_woe, real_test_preds_woe = get_oof_and_test_pred(tr=train_data, real_te=test_data)

# Second AutoWoE pass.
# NOTE(review): this call receives exactly the same inputs as the first one, so it
# may reproduce the first pass unless AutoWoE is internally non-deterministic - confirm.
oof_preds_woe2, real_test_preds_woe2 = get_oof_and_test_pred(tr=train_data, real_te=test_data)

In [None]:
# Add the AutoWoE predictions, their 50/50 blend and rank-based versions of
# all three as new features, identically for train and test.
#
# Ranks are divided by the number of rows so they fall in (0, 1] for both frames:
# raw `rankdata` output ranges over 1..len(frame), and train and test have
# different lengths, which would put the same feature on different scales.
for frame, preds_1, preds_2 in (
    (train_data, oof_preds_woe, oof_preds_woe2),
    (test_data, real_test_preds_woe, real_test_preds_woe2),
):
    frame['oof_woe_1'] = preds_1
    frame['oof_woe_2'] = preds_2
    frame['oof_woe_12'] = 0.5 * preds_1 + 0.5 * preds_2

    normalized_rank_1 = rankdata(preds_1) / len(preds_1)
    normalized_rank_2 = rankdata(preds_2) / len(preds_2)

    frame['rank_oof_woe_1'] = normalized_rank_1
    frame['rank_oof_woe_2'] = normalized_rank_2
    frame['rank_oof_woe_12'] = 0.5 * normalized_rank_1 + 0.5 * normalized_rank_2

# =========== LightAutoML model building ===========


# Step 1. Task setup

In the cell below we create a Task object - the class that sets up which task the LightAutoML model should solve, with a specific loss and metric if necessary (more info can be found [here](https://lightautoml.readthedocs.io/en/latest/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task) in our documentation):

In [None]:
%%time

task = Task('binary', )

# Step 2. Feature roles setup

To solve the task, we need to set up column roles. The **only role you must set up is the target role**; everything else (drop, numeric, categorical, group, weights etc.) is up to the user - LightAutoML models have automatic column type detection inside:

In [None]:
%%time

roles = {'target': TARGET_NAME,
         'drop': ['id']
         }

# Step 6. Retrain on the full dataset

In [None]:
%%time 

automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_time': 1900}, # more time for params tuning
                       general_params = {'use_algos': [['lgb', 'lgb_tuned']]},
                       selection_params = {'mode': 0} # no feature selection - everything is necessary :)
                      )

oof_pred = automl.fit_predict(train_data, roles = roles)

In [None]:
# Score the test frame with the fitted model and report the result and its shape.
test_pred = automl.predict(test_data)
test_pred_shape = test_pred.shape
print('Prediction for test_data:\n{}\nShape = {}'.format(test_pred, test_pred_shape))

# Step 7. Create submission file

In [None]:
# Write the first column of the prediction array into the sample submission
# and save it without the DataFrame index.
first_column_scores = test_pred.data[:, 0]
samp_sub[TARGET_NAME] = first_column_scores
samp_sub.to_csv('In_LightAutoML_we_trust.csv', index=False)

# Additional materials

- [Official LightAutoML github repo](https://github.com/sberbank-ai-lab/LightAutoML)
- [LightAutoML documentation](https://lightautoml.readthedocs.io/en/latest)