<a id="inicio"></a>
- [1 Definition](#1)
- [2 EDA](#2)
- [3 LightAutoML](#3)


<a id="1"></a>
# <p style="font-family:newtimeroman; font-size:160%;">Definition</p>
<p style="font-family:newtimeroman; font-size:130%;">The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with calculating the loss associated with loan defaults. Although the features are anonymized, they have properties relating to real-world features.</p>

In [None]:
# Install LightAutoML quietly (-q); the lightautoml imports in the following cell need it.
!pip install lightautoml -q

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')  # silence every warning for the rest of the notebook

# pandas display: allow long/wide frames and print floats with two decimals
for option, value in (
    ('display.max_rows', 150),
    ('display.max_columns', 500),
    ('display.max_colwidth', None),
    ('display.float_format', lambda x: '%.2f' % x),
):
    pd.set_option(option, value)

# plotting defaults: seaborn figure size and font scale, then the matplotlib style sheet
sns.set(rc={'figure.figsize': (16, 8)}, font_scale=1.3)
plt.style.use('fivethirtyeight')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [None]:
# Folder holding the competition files; both reads below share it so the
# location is written only once in this cell.
DATA_DIR = '../input/tabular-playground-series-aug-2021'

# Load the training and test tables.
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

<a id="2"></a>
# <p style="font-family:newtimeroman; font-size:160%;">EDA</p>

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Summary statistics with one row per column (transposed describe), styled so
# the mean (bars), std (red gradient) and 50% (coolwarm gradient) columns stand out.
(
    train.describe().T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

<p style="font-family:newtimeroman; font-size:130%;">Some variables have a very high standard deviation.</p>

In [None]:
# Annotate each bar of a bar plot with its share of the total

def plt_percente(plot, feature):
    """Label every bar on ``plot`` with its height as a percentage of ``len(feature)``.

    Parameters
    ----------
    plot : object exposing ``patches`` and ``annotate`` (e.g. a matplotlib Axes)
        The axes whose bar patches are annotated.
    feature : sized
        The data the bars were built from; its length is the 100% reference.

    Notes
    -----
    The current figure is shown with ``plt.show()`` before returning.
    """
    total = len(feature)
    if total:  # with no observations there is no percentage to compute
        for p in plot.patches:
            percentage = "{:.1f}%".format(100 * p.get_height() / total)
            # anchor the label at the top of the bar, nudged slightly left of centre
            x = p.get_x() + p.get_width() / 2 - 0.05
            y = p.get_y() + p.get_height()
            # annotate the axes that was passed in rather than a global `ax`
            plot.annotate(percentage, (x, y), size=12, rotation=45)
    plt.show()

In [None]:
# Target distribution: one bar per distinct loss value, labelled with its share of rows

fig, ax = plt.subplots(figsize=(16, 8))

# Count each target value once and reuse it for the x positions and the bar heights.
loss_counts = train["loss"].value_counts().sort_index()
ax.bar(loss_counts.index, loss_counts.values, edgecolor="black")

ax.set_title("Target distribution", fontsize=20, pad=15)
ax.set_xlabel("loss", fontsize=14, labelpad=15)
ax.set_ylabel("Amount of values", fontsize=14, labelpad=15)

# plt_percente annotates the bars and already calls plt.show(), so no second show is needed.
plt_percente(ax, train.loss)

In [None]:
# Distribution of variables: train vs. test histogram for every feature

# Columns 1..100 of train; assumes column 0 is the id and the features come
# next in the same order as in `test` — TODO confirm against the data schema.
columns = train.columns[1:101]
length = len(columns)

# Size the grid from the number of features: an integer row count (the old
# `length/2` was a float) and no rows that can never be filled.
n_cols = 3
n_rows = max(1, -(-length // n_cols))  # ceil(length / n_cols) without importing math

# One figure with the whole grid up front; 3 inches per row keeps the original
# per-row height (150 inches over 50 rows).
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3 * n_rows), squeeze=False)
fig.subplots_adjust(wspace=.2, hspace=.5)  # spacing is loop-invariant, set it once
flat_axes = axes.ravel()

for axis, col in zip(flat_axes, columns):
    # Label each histogram directly so the legend entries map to the right artists.
    sns.histplot(x=train[col], color='orange', edgecolor='black', label='Train', ax=axis)
    sns.histplot(x=test[col], color='green', edgecolor='black', label='Test', ax=axis)
    axis.legend()

# Hide the unused cells at the end of the grid.
for axis in flat_axes[length:]:
    axis.set_visible(False)
<p style="font-family:newtimeroman; font-size:130%;">Analyzing the plots shows that some variables have outliers and others have widely spaced values, which may not be good for the models.</p>

<a id="3"></a>
# <p style="font-family:newtimeroman; font-size:160%;">LightAutoML</p>

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [None]:
def rmse(y_true, y_pred, **kwargs):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    The root is taken here with NumPy instead of passing ``squared=False`` to
    ``mean_squared_error``, so the helper does not depend on that keyword
    being supported by the installed scikit-learn.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values.
    **kwargs
        Forwarded to ``mean_squared_error`` (e.g. ``sample_weight``).
        ``multioutput`` is handled here so the root is taken per output
        before any averaging.

    Returns
    -------
    float or ndarray
        A single RMSE value, or one value per output when
        ``multioutput='raw_values'``.

    Raises
    ------
    ValueError
        If ``multioutput`` is a string other than ``'raw_values'`` or
        ``'uniform_average'``.
    """
    multioutput = kwargs.pop('multioutput', 'uniform_average')
    if isinstance(multioutput, str) and multioutput not in ('raw_values', 'uniform_average'):
        raise ValueError("Invalid multioutput value: {!r}".format(multioutput))

    # Per-output MSE first, then the root, then (optionally) the average.
    per_output_rmse = np.sqrt(
        mean_squared_error(y_true, y_pred, multioutput='raw_values', **kwargs)
    )
    if isinstance(multioutput, str):
        if multioutput == 'raw_values':
            return per_output_rmse
        multioutput = None  # 'uniform_average' -> unweighted mean
    return np.average(per_output_rmse, weights=multioutput)

In [None]:
# CPU budget: passed as cpu_limit and as the reader's n_jobs to TabularAutoML.
THREADS = 12
# Number of cross-validation folds given to the reader ('cv').
FOLDS = 10
# Seed used for the train/hold-out split and for the reader.
RANDOM = 42
# Fraction of `train` held out by train_test_split.
SIZE = 0.2
# AutoML time budget (presumably seconds — confirm against the LightAutoML docs).
TIMEOUT = 3600
# Name of the target column.
TARGET = 'loss'

In [None]:
# Hold out a stratified share of the training rows; df_train is what gets fitted below.
df_train, df_test = train_test_split(
    train,
    test_size=SIZE,
    stratify=train[TARGET],
    random_state=RANDOM,
)

In [None]:
# Regression task definition for LightAutoML.
task = Task('reg')

In [None]:
# Column roles: which column is the target and which columns are dropped.
roles = {
    'target': TARGET,
    'drop': ['id'],
}

In [None]:
# LightGBM parameters, passed to TabularAutoML as 'default_params'.
lgb_params = {
    'metric': 'RMSE',
    'lambda_l1': 1e-07,
    'lambda_l2': 2e-07,
    'num_leaves': 42,
    'feature_fraction': 0.55,
    'bagging_fraction': 0.9,
    'bagging_freq': 3,
    'min_child_samples': 19,
    # same thread budget as the cpu_limit given to TabularAutoML
    'num_threads': THREADS,
}

# CatBoost parameters, passed to TabularAutoML as 'default_params'.
cb_params = {
    'num_trees': 7000,
    'od_wait': 1200,
    'learning_rate': 0.02,
    'l2_leaf_reg': 64,
    'subsample': 0.83,
    'random_strength': 17.17,
    'max_depth': 8,
    'min_data_in_leaf': 10,
    'leaf_estimation_iterations': 3,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',
    # reuse the notebook-wide seed and thread budget instead of repeating 42 / 12
    'random_seed': RANDOM,
    "thread_count": THREADS,
}

In [None]:
%%time
automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = THREADS,
                       reader_params = {'n_jobs': THREADS, 'cv': FOLDS, 'random_state': RANDOM},
                       general_params = {'use_algos': [['lgb', 'cb']]},
                       lgb_params = {'default_params': lgb_params, 'freeze_defaults': True}, 
                       cb_params = {'default_params': cb_params, 'freeze_defaults': True}, 
                       verbose = 2 
                      )

pred_ = automl.fit_predict(df_train, roles = roles)

In [None]:
# Predict on the competition test table with the fitted AutoML model.
pred = automl.predict(test)

In [None]:
# Load the sample submission file; its target column is overwritten with the predictions below.
submission = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

In [None]:
# Write the first column of the prediction array into the target column.
# Assumes the sample submission rows are in the same order as `test` — TODO confirm.
submission[TARGET] = pred.data[:, 0]

In [None]:
# Save the submission without the DataFrame index column.
submission.to_csv('submission1.csv', index = False)