# Tabular Playground Series - Sep 2021 : LightAutoML Baseline

In [None]:
!pip install --upgrade --no-cache-dir --quiet lightautoml

In [None]:
import pandas as pd
from lightautoml.automl.presets.tabular_presets import (
    TabularAutoML,
    TabularUtilizedAutoML,
)
from lightautoml.tasks import Task
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

---

## Load data

In [None]:
# Read the training and test tables from the ../input directory.
data = pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")

### Data info

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Show the first three rows of the training frame.
data.head(3)

In [None]:
# Descriptive statistics for the training frame's columns.
data.describe()

In [None]:
# Count how many rows carry each value of the "claim" column.
data["claim"].value_counts()

---

## LightAutoML

In this notebook we use 10-fold cross-validation with a timeout of 7200 seconds (2 hours) for each AutoML run.

In [None]:
# Shared settings reused by the split and by every AutoML preset in this notebook.
RANDOM_STATE = 42  # seed for train_test_split and the LightAutoML reader
TIMEOUT = 7200  # passed as `timeout` to each preset; 7200 s = 2 h
N_FOLDS = 10  # passed as the reader's `cv` parameter

### Step 0. Split data for train and validation

In [None]:
# Hold out 20% of the labelled data for validation, stratifying on the
# "claim" column and seeding the shuffle with RANDOM_STATE.
data_train, data_valid = train_test_split(
    data,
    test_size=0.2,
    stratify=data["claim"],
    random_state=RANDOM_STATE,
)

### Step 1. Create metrics for LightAutoML

In [None]:
def f1_prob_score(y_true, y_pred):
    """Return the F1 score of predicted scores cut at 0.5.

    Args:
        y_true: Ground-truth labels, forwarded unchanged to ``f1_score``.
        y_pred: Array of predicted scores supporting ``>`` and ``.astype``;
            values strictly above 0.5 become label 1, all others label 0.

    Returns:
        The value returned by ``f1_score`` for the thresholded labels.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    return f1_score(y_true, hard_labels)


def accuracy_prob_score(y_true, y_pred):
    """Return the accuracy of predicted scores cut at 0.5.

    Args:
        y_true: Ground-truth labels, forwarded unchanged to ``accuracy_score``.
        y_pred: Array of predicted scores supporting ``>`` and ``.astype``;
            values strictly above 0.5 become label 1, all others label 0.

    Returns:
        The value returned by ``accuracy_score`` for the thresholded labels.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    return accuracy_score(y_true, hard_labels)

### Step 2. Create task

Specify task (binary classification, multiclass classification, regression), metrics, losses ([doc](https://lightautoml.readthedocs.io/en/latest/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task)).

In [None]:
# Binary task that uses the thresholded F1 function as its metric.
task = Task("binary", metric=f1_prob_score)

### Step 3. Setup roles

Role contains information about the column, which determines how it is processed ([doc](https://lightautoml.readthedocs.io/en/latest/_modules/lightautoml/dataset/roles.html)).

In [None]:
# "claim" is the target column; "id" is listed under the "drop" role.
roles = {"target": "claim", "drop": ["id"]}

### Step 4. Setup TabularAutoML, predict on validation data and check scores

Classic preset for working with tabular data. Supported data roles: numbers, dates, categories ([src](https://github.com/sberbank-ai-lab/LightAutoML/blob/master/lightautoml/automl/presets/tabular_presets.py)).

In [None]:
# TabularAutoML preset configured with the shared task, time budget and
# cross-validation settings.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=4,
    general_params={
        # One inner list of algorithm keys; presumably one list per model
        # level — confirm against the LightAutoML preset documentation.
        "use_algos": [
            ["lgb", "lgb_tuned", "linear_l2", "cb", "cb_tuned"],
        ],
    },
    reader_params={
        "cv": N_FOLDS,
        "random_state": RANDOM_STATE,
    },
)

In [None]:
# Fit on the training split; the returned predictions for that split are
# kept as `oof_pred` (presumably out-of-fold — confirm against LightAutoML docs).
oof_pred = automl.fit_predict(data_train, roles=roles)

In [None]:
# Predict on the held-out validation split.
pred_valid = automl.predict(data_valid)

# Compare accuracy (scores cut at 0.5) on the predictions returned by
# fit_predict for the training split with accuracy on the held-out split.
# Only the first column of each prediction array is scored.
print("Accuracy")
print("OOF:", accuracy_prob_score(data_train["claim"].values, oof_pred.data[:, 0]))
print("VAL:", accuracy_prob_score(data_valid["claim"].values, pred_valid.data[:, 0]))

### Step 5. Setup TabularUtilizedAutoML, predict on validation data and check scores

TabularAutoML with TimeUtilization ([src](https://github.com/sberbank-ai-lab/LightAutoML/blob/master/lightautoml/automl/presets/tabular_presets.py)).

In [None]:
# TabularUtilizedAutoML preset with the same task, time budget and
# cross-validation settings; rebinds `automl`.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=4,
    general_params={
        # Two inner lists with the same algorithm keys; presumably one list
        # per model level — confirm against the LightAutoML preset docs.
        "use_algos": [
            ["lgb", "lgb_tuned", "linear_l2", "cb", "cb_tuned"],
            ["lgb", "lgb_tuned", "linear_l2", "cb", "cb_tuned"],
        ],
    },
    reader_params={
        "cv": N_FOLDS,
        "random_state": RANDOM_STATE,
    },
)

In [None]:
# Fit the utilized preset on the training split; rebinds `oof_pred` with the
# predictions returned for that split.
oof_pred = automl.fit_predict(data_train, roles=roles)

In [None]:
# Predict on the held-out validation split with the utilized preset.
pred_valid = automl.predict(data_valid)

# Compare accuracy (scores cut at 0.5) on the predictions returned by
# fit_predict for the training split with accuracy on the held-out split.
# Only the first column of each prediction array is scored.
print("Accuracy")
print("OOF:", accuracy_prob_score(data_train["claim"].values, oof_pred.data[:, 0]))
print("VAL:", accuracy_prob_score(data_valid["claim"].values, pred_valid.data[:, 0]))

### Step 6. Fit TabularUtilizedAutoML on full train data and predict on test data

In [None]:
# Fresh TabularUtilizedAutoML instance with the same configuration, this time
# fitted on the full labelled data rather than the training split.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=4,
    general_params={
        # Two inner lists with the same algorithm keys; presumably one list
        # per model level — confirm against the LightAutoML preset docs.
        "use_algos": [
            ["lgb", "lgb_tuned", "linear_l2", "cb", "cb_tuned"],
            ["lgb", "lgb_tuned", "linear_l2", "cb", "cb_tuned"],
        ],
    },
    reader_params={
        "cv": N_FOLDS,
        "random_state": RANDOM_STATE,
    },
)

# Fit on every labelled row (train + validation splits together).
oof_pred = automl.fit_predict(data, roles=roles)

# Score the unlabelled test table; used later to build the submission.
pred_test = automl.predict(test)

---

## Write submission

In [None]:
# Build the submission table: test ids paired with hard 0/1 labels obtained by
# cutting the first prediction column at 0.5.
# NOTE(review): if the competition is scored on a ranking metric such as
# ROC AUC, submitting the raw scores instead of thresholded labels would
# likely score better — confirm against the competition's evaluation page.
submission = pd.DataFrame(
    {
        "id": test["id"],
        "claim": (pred_test.data[:, 0] > 0.5).astype(int),
    }
)

# Write the table without the pandas index column.
submission.to_csv("submission.csv", index=False)