In [None]:
%%capture
!pip install -U lightautoml

In [None]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
import torch

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [None]:
# Run-wide configuration read by the cells below.
TARGET_NAME = 'target'    # name of the label column
RANDOM_STATE = 42         # seed shared by numpy and the LightAutoML reader
N_THREADS = 40            # used for torch threads, cpu_limit and reader n_jobs
N_FOLDS = 80              # passed to the reader as its 'cv' setting
TIMEOUT = 2 * 60 * 60     # AutoML time budget (presumably seconds, i.e. two hours)

In [None]:
# Seed every RNG-bearing library this notebook imports so a
# Restart & Run All starts from the same random state.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
# Cap the number of intra-op threads torch may use.
torch.set_num_threads(N_THREADS)

In [None]:
# Directory holding the competition CSVs. The trailing slash matters because
# file names are appended to it with plain string concatenation.
INPUT_DIR = '../input/tabular-playground-series-feb-2022/'

In [None]:
# Training split of the competition data.
train_data = pd.read_csv(filepath_or_buffer=INPUT_DIR + 'train.csv')

In [None]:
# Test split of the competition data (later extended with pseudo-labels).
test_data = pd.read_csv(filepath_or_buffer=INPUT_DIR + 'test.csv')

In [None]:
# Submission template; its target column is overwritten at the end of the notebook.
submission = pd.read_csv(filepath_or_buffer=INPUT_DIR + 'sample_submission.csv')

In [None]:
# Predictions from an earlier run, reused below as pseudo-labels for the test rows.
pseudolabels = pd.read_csv(
    filepath_or_buffer='../input/lightautoml-099026/lightautoml_07.csv'
)

In [None]:
# Attach the pseudo-labels to the test rows so they can be trained on.
# `.values` bypasses index alignment, so rows are matched purely by position.
# NOTE(review): assumes `pseudolabels` is in the same row order as `test_data` — confirm.
test_data[TARGET_NAME] = pseudolabels[TARGET_NAME].values

In [None]:
# Stack the real training rows on top of the pseudo-labelled test rows and
# renumber the index 0..n-1 so positions in ALL_DF are unambiguous.
ALL_DF = pd.concat([train_data, test_data], ignore_index=True)
print(ALL_DF.shape)

In [None]:
# Near-equal weights that double as a provenance flag: rows from train_data get
# a weight just above 1, pseudo-labelled test rows a weight just below 1.
# The custom metric uses `weight > 1` to pick out the real training rows.
train_weights = [1.001] * len(train_data)
pseudo_weights = [0.999] * len(test_data)
ALL_DF['weight'] = train_weights + pseudo_weights

In [None]:
def log_loss_metric(y_true, y_pred, sample_weight=None, **kwargs):
    """Log loss computed only on rows whose sample weight exceeds 1.

    The combined frame gives rows from the original training set a weight of
    1.001 and pseudo-labelled test rows a weight of 0.999, so the ``> 1``
    filter restricts scoring to the real training rows. The weights are used
    only to select rows; they are not forwarded to ``log_loss``.

    Args:
        y_true: True labels; must support boolean-mask indexing.
        y_pred: Predictions, row-aligned with ``y_true``.
        sample_weight: Per-row weights, row-aligned with ``y_true``. When
            ``None`` there is nothing to filter on, so every row is scored
            instead of failing on the ``None > 1`` comparison.
        **kwargs: Forwarded unchanged to ``log_loss``.

    Returns:
        Whatever ``log_loss`` returns for the selected rows.
    """
    if sample_weight is None:
        return log_loss(y_true, y_pred, **kwargs)
    is_train_row = (sample_weight > 1)
    return log_loss(y_true[is_train_row], y_pred[is_train_row], **kwargs)

# Multiclass task scored with `log_loss_metric`; greater_is_better=False marks
# the metric as one where lower values are better.
task = Task('multiclass', metric=log_loss_metric, greater_is_better=False)

In [None]:
# Column roles handed to the AutoML fit: which column is the label, which
# columns to drop, and which column carries the per-row weights.
roles = dict(
    target=TARGET_NAME,
    drop=['row_id'],
    weights='weight',
)

In [None]:
# AutoML preset configured from the constants defined at the top of the notebook.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    # Reader settings: parallelism, number of CV folds and the split seed.
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
    # Restrict the search to the 'lgb' algorithm and turn nested CV on.
    general_params=dict(use_algos=['lgb'], nested_cv=True),
    # NOTE(review): meaning of selection mode 1 is defined by LightAutoML — confirm.
    selection_params=dict(mode=1),
    nested_cv_params=dict(cv=5),
)

In [None]:
%%time

# Fit on the combined train + pseudo-labelled frame and keep the predictions
# returned for that same frame (presumably out-of-fold, as the name suggests).
oof_predict = automl.fit_predict(ALL_DF, roles=roles, verbose=3)

In [None]:
# Class mapping stored on the fitted reader; later cells invert it to turn
# argmax column indices back into labels.
# NOTE(review): presumably {original label: prediction column index} — confirm.
mapper = automl.reader.class_mapping
mapper

In [None]:
# Invert the class mapping so a predicted column index maps back to its label.
index_to_label = {index: label for label, index in mapper.items()}
# Only the first len(train_data) rows of ALL_DF are real training rows.
oof_class_index = np.argmax(oof_predict.data[:len(train_data), :], axis=1)
preds = pd.Series(oof_class_index).map(index_to_label)
# Share of training rows whose predicted label equals the true label.
print(f'OOF SCORE:{np.mean(train_data[TARGET_NAME].values == preds)}')

In [None]:
# Predict the test rows with the fitted model. `test_data` still carries the
# pseudo-label column added earlier.
# NOTE(review): presumably that column is ignored at inference — confirm.
test_predict = automl.predict(test_data)
print(f'PREDICT TEST_DATA:\n{test_predict}\nSHAPE: {test_predict.shape}')

In [None]:
# Convert each row's highest-scoring column index back to its class label,
# then write the submission file and display it.
label_by_index = {index: label for label, index in mapper.items()}
test_class_index = np.argmax(test_predict.data, axis=1)
submission[TARGET_NAME] = pd.Series(test_class_index).map(label_by_index)
submission.to_csv('lightautoml_12.csv', index=False)
submission