# Tabular Playground Series - Aug 2021
A quick review of the data, followed by predictions with LightAutoML.

## Setup

In [None]:
!pip install -U https://github.com/sberbank-ai-lab/LightAutoML/raw/fix/logging/LightAutoML-0.2.16.2-py3-none-any.whl

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import missingno as msno
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [None]:
# Read the competition's training and test tables into DataFrames.
train_csv = '../input/tabular-playground-series-aug-2021/train.csv'
test_csv = '../input/tabular-playground-series-aug-2021/test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

## Overview
There are no missing values, and none of the features seems to be correlated with loss. It's so abstract, I don't even know where to begin! I would like to investigate this gradually from now on. I'm not sure we'll know anything within a month.

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Print pandas' summary of the training frame (columns, dtypes, non-null counts).
df_train.info()

In [None]:
# Draw missingno's matrix plot of the training frame to eyeball missing values.
msno.matrix(df_train)

In [None]:
# Heatmap of the pairwise column correlations of the training frame.
corr_matrix = df_train.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 16))
sns.heatmap(corr_matrix, cmap='PuRd', ax=heatmap_ax)
plt.show()

In [None]:
# Plot one histogram per feature column on a grid of subplots.
cols = df_train.columns.values
# Pick the features by name instead of relying on 'id' being first and
# 'loss' being last in the column order.
feature_cols = [col for col in cols if col not in ('id', 'loss')]

n_grid_cols = 4
# Ceiling division: size the grid from the feature count rather than
# hard-coding 25 rows; keep at least one row so subplots() stays valid.
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))

# squeeze=False keeps `ax` two-dimensional even for a single row.
fig, ax = plt.subplots(n_grid_rows, n_grid_cols,
                       figsize=(16, 4 * n_grid_rows), squeeze=False)

for cnt, col in enumerate(feature_cols):
    sns.histplot(df_train[col], ax=ax[cnt // n_grid_cols, cnt % n_grid_cols],
                 color='lightskyblue')

# Hide the panels left over when the feature count is not a multiple of 4.
for unused_ax in ax.ravel()[len(feature_cols):]:
    unused_ax.set_visible(False)

plt.show()

## Predict with LightAutoML
Special thanks to [this notebook](https://www.kaggle.com/alexryzhkov/aug21-lightautoml-starter).

In [None]:
# Fixed LightGBM hyper-parameters handed to LightAutoML as frozen defaults.
lgb_params = {
    'metric': 'RMSE',
    # regularisation strengths
    'lambda_l1': 1e-7,
    'lambda_l2': 2e-7,
    # tree size
    'num_leaves': 42,
    # feature / row subsampling
    'feature_fraction': 0.55,
    'bagging_fraction': 0.9,
    'bagging_freq': 3,
    'min_child_samples': 19,
    # parallelism
    'num_threads': 4,
}

# Fixed CatBoost hyper-parameters handed to LightAutoML as frozen defaults.
cb_params = {
    # boosting schedule
    'num_trees': 7000,
    'od_wait': 1200,
    'learning_rate': 0.02,
    # regularisation and sampling
    'l2_leaf_reg': 64,
    'subsample': 0.83,
    'random_strength': 17.17,
    # tree shape
    'max_depth': 6,
    'min_data_in_leaf': 10,
    'leaf_estimation_iterations': 3,
    # objective and evaluation
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',
    # reproducibility and parallelism
    'random_seed': 42,
    'thread_count': 4,
}

In [None]:
def do_automl(target, train, test, timeout=3*3600, cpu_limit=4, n_folds=10, random_state=42):
    """Fit a LightAutoML regression preset on ``train`` and predict ``test``.

    The preset is restricted to one level combining the 'lgb' and 'cb'
    algorithms, each configured from the module-level ``lgb_params`` and
    ``cb_params`` dictionaries with ``freeze_defaults`` enabled.

    Parameters
    ----------
    target : str
        Name of the target column in ``train``.
    train : pandas.DataFrame
        Training data, including the target column.
    test : pandas.DataFrame
        Data to predict on.
    timeout : int, optional
        Time budget passed to ``TabularAutoML`` (default ``3*3600``).
    cpu_limit : int, optional
        Passed as ``cpu_limit`` and as the reader's ``n_jobs`` (default 4).
        The thread counts inside ``lgb_params`` / ``cb_params`` are separate
        settings and are not changed by this argument.
    n_folds : int, optional
        Passed as the reader's ``cv`` setting (default 10).
    random_state : int, optional
        Passed as the reader's ``random_state`` (default 42).

    Returns
    -------
    numpy.ndarray
        Flattened (1-D) predictions for ``test``.
    """
    automl = TabularAutoML(
        task=Task('reg', loss='mse', metric='mse'),
        timeout=timeout,
        cpu_limit=cpu_limit,
        reader_params={'n_jobs': cpu_limit, 'cv': n_folds, 'random_state': random_state},
        general_params={'use_algos': [['lgb', 'cb']]},
        lgb_params={'default_params': lgb_params, 'freeze_defaults': True},
        cb_params={'default_params': cb_params, 'freeze_defaults': True},
        verbose=2,
    )
    # Only the fitted pipeline is needed here; fit_predict's return value is discarded.
    automl.fit_predict(train_data=train, roles={'target': target})
    # .data holds the raw prediction array; ravel() flattens it to 1-D.
    return automl.predict(test).data.ravel()

In [None]:
 # Train on every column except 'id' and predict 'loss' for the test rows.
 pred = do_automl('loss', df_train.drop(columns=['id']), df_test.drop(columns=['id']))

In [None]:
# Display the prediction array as a quick sanity check.
pred

In [None]:
# Build the submission frame: ids from the sample submission, predictions as 'loss'.
# The path uses the same '../input/<dataset>/' form as the other read_csv calls
# (the redundant '../input/..' hop is removed; it resolves to the same file).
sample_sub = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')
# Bracket access avoids any clash between a column name and a DataFrame attribute.
submission = pd.DataFrame({'id': sample_sub['id'], 'loss': pred})
submission

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv('LightAutoML_sub.csv',index=False)