In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, gc, warnings
import random
import datetime

from tqdm.notebook import tqdm
# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn

import lightgbm as lgb

import pickle

In [None]:
# Root directory of the competition input files.
path = '../input/tabular-playground-series-aug-2021'
# Walk the input directory and print the full path of every file found in it.
for root, _, files in os.walk(path):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Number of standard deviations passed to outlayer_handel (as multy_std) to decide what counts as an outlier.
MULTY_STD = 8

# Load data and display samples

In [None]:
# Name of the column to predict.
target = 'loss'

def load_data(source='train', path=path):
    """Read `{path}/{source}.csv` into a DataFrame indexed by its "id" column.

    `source` must be either 'train' or 'test' (enforced with an assert).
    """
    assert source in ['train', 'test']
    csv_file = f'{path}/{source}.csv'
    return pd.read_csv(csv_file, index_col="id")

In [None]:
%%time
# Load the training table (indexed by "id"), report its shape and preview four random rows.
train = load_data('train')
print(f"Data shape: {train.shape}")
train.sample(4)

In [None]:
%%time
# Load the test table (indexed by "id"), report its shape and preview two random rows.
test = load_data('test')
print(f"Data shape: {test.shape}")
test.sample(2)

# Initial analysis

## Check whether the data set contains missing values

In [None]:
def missing_statistics(df):
    """Build a per-column missing-value report for `df`.

    Returns a DataFrame with one row per column of `df` and the columns
    'COLUMN NAME', 'MISSING VALUES' (null count), 'TOTAL ROWS' (row count of
    `df`) and '% MISSING' (percentage of nulls, rounded to 2 decimals).
    """
    null_counts = df.isnull().sum()
    report = pd.DataFrame(null_counts).reset_index()
    report.columns = ['COLUMN NAME', "MISSING VALUES"]
    report['TOTAL ROWS'] = df.shape[0]
    missing_ratio = report['MISSING VALUES'] / report['TOTAL ROWS']
    report['% MISSING'] = round(missing_ratio * 100, 2)
    return report

In [None]:
# Per-column missing-value report for the training set.
miss = missing_statistics(train)
miss

In [None]:
# Summing the per-column null counts gives the total number of missing cells
# in the whole table (a value of 0 means the data set is complete).
print(f"Total count of missing values across all columns: {miss['MISSING VALUES'].sum()}")
del miss

## Discrete features
* Some columns contain only whole numbers (no fractional part).

In [None]:
# A column is considered discrete when casting its values to int leaves every
# value unchanged, i.e. no value has a fractional part.
discrete_features = [
    col
    for col in train.columns
    if np.array_equal(train[col].values, train[col].values.astype(int))
]

print(f'Total {len(discrete_features)} : ')
print(discrete_features)

In [None]:
# Print how many distinct values each discrete column takes.
for dcol in discrete_features:
    n_unique = train[dcol].nunique()
    print(f'{dcol} unique value : {n_unique}')

- But the remaining `f1, f27, f55, and f86` look relatively categorical.

In [None]:
# Hand-picked discrete columns that look categorical.
cat_features = ['f1', 'f27', 'f55', 'f86']

## Distribution Check.

In [None]:
# The default dtypes are wider than the value range of each column requires,
# but the data set is small enough that downcasting is optional.
dtypes_arr = train.dtypes

# Names of the integer-typed columns, taken straight from the DataFrame.
# The names used to be rebuilt as f"f{i - 1}" from the column position, which
# assumed "id" was the first column; since the table is loaded with
# index_col="id" that produced names shifted by one.
col_int = [
    col for col, dtype in dtypes_arr.items()
    if pd.api.types.is_integer_dtype(dtype)
]

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Show the f6 values of the training rows where f6 is negative.
mask = train["f6"] < 0
train.loc[mask, "f6"]

### The different columns have different distributions

In [None]:
# Overlay the train and test kernel-density estimates of each feature
# (f0..f99, one panel per feature) to spot train/test distribution shift.
fig, axes = plt.subplots(10,10,figsize=(12, 12))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    feature = f'f{idx}'
    sns.kdeplot(data=train, x=feature, fill=True, label='train', ax=ax)
    sns.kdeplot(data=test, x=feature, fill=True, label='test', ax=ax)
    # Only the shape of each curve matters here, so hide ticks and axis labels.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.set_title(feature, loc='right', weight='bold', fontsize=10)

# One shared legend is enough: every panel uses the same train/test colours.
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper left')
fig.supxlabel('Train vs. test density per feature', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
# Four example columns, one per panel, each titled with the distribution shape
# it illustrates. Per-panel titles replace the former figure legend, which was
# built from labels only and therefore attached them to the first bars of the
# first panel instead of to the four histograms.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15,10))

panels = [
    # (column, extra histplot kwargs, colour, distribution shape)
    ("loss", {"bins": 30}, 'r', 'skewed right distribution'),
    ("f6", {}, 'b', 'normal distribution'),
    ("f97", {"bins": 30}, 'g', 'uniform distribution'),
    ("f0", {"bins": 30}, 'y', 'uniform distribution'),
]
for panel_ax, (column, hist_kwargs, colour, shape) in zip(axs.flatten(), panels):
    sns.histplot(train, x=column, ax=panel_ax, color=colour, **hist_kwargs)
    panel_ax.set_title(f'{column}: {shape}')

plt.show()

### Observations on the loss column:
* It has a right-skewed distribution: 75% of the rows have a loss below 10 units.

### Df parameters
* A visual inspection shows that many columns have skewed distributions.
* In skewed data, the tail region may act as an outlier for the statistical model, and we know that **outliers adversely affect a model's performance, especially for regression-based models.** Some statistical models, such as **tree-based models**, are robust to outliers, but relying on them would limit the possibility of trying other models. It is therefore necessary to transform the skewed data so that it is close enough to a Gaussian (normal) distribution. This will allow us to try a larger number of statistical models.

### Applying a log transformation to the input data
* A common technique for handling negative values is to add a constant value to the data prior to applying the log transform. The transformation is therefore log(Y+a), where a is the constant. Some people like to choose a so that min(Y+a) is a very small positive number (like 0.001). Others choose a so that min(Y+a) = 1. In both cases a = b – min(Y), where b is the small positive number or 1, respectively.

* In this case, some columns with a right-skewed distribution have values lower than -1, so if we applied `np.log1p()` those values would be converted to NaN.
    * Thus we are going to use the **Yeo-Johnson power transformation**, which also handles negative values.
* Procedure:
    1. First identify which columns present a **skewed distribution** and save them in an array.
    2. Then apply the Yeo-Johnson algorithm.

In [None]:
def plt_distribution_vs_YeoJohnson(df_in, col_name):
    """Plot one column before and after a Yeo-Johnson power transformation.

    Draws a 2x2 figure: the top row shows the normal probability plot and the
    histogram of the raw column; the bottom row shows the same two plots after
    `scipy.stats.yeojohnson`, with the lambda fitted on the column itself.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Table containing `col_name`. It is only read, never modified, so no
        copy is made.
    col_name : str
        Name of the column to inspect.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    raw = df_in[col_name]
    transformed, fitted_lambda = stats.yeojohnson(raw)

    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15,10))

    # Top row: raw data.
    stats.probplot(raw, dist=stats.norm, plot=axs[0][0])
    axs[0][0].set_xlabel('')
    axs[0][0].set_title('Probplot against normal distribution')

    sns.histplot(raw, color='b', ax=axs[0][1])
    axs[0][1].set_title('Histogram of the original data')

    # Bottom row: the same column after the Yeo-Johnson transformation.
    stats.probplot(transformed, dist=stats.norm, plot=axs[1][0])
    axs[1][0].set_title('Probplot after Yeo-Johnson transformation')

    sns.histplot(transformed, color='b', ax=axs[1][1])
    axs[1][1].set_title(f'Histogram after Yeo-Johnson transformation (lambda={fitted_lambda:.3f})')

    plt.show()

In [None]:
# f6: the column used above as the example of a normal distribution.
plt_distribution_vs_YeoJohnson(train, "f6")

In [None]:
# f97: one of the columns used above as an example of a uniform distribution.
plt_distribution_vs_YeoJohnson(train, "f97")

In [None]:
# f0: one of the columns used above as an example of a uniform distribution.
plt_distribution_vs_YeoJohnson(train, "f0")

### Yeo-Johnson inspection
* As we can see in the plots above, the Yeo-Johnson power transformation only noticeably affects **skewed distributions**, so we will apply this transformation to all the input columns.

In [None]:
%%time
# Fit one Yeo-Johnson lambda per feature on the training data and reuse that
# same lambda on the test data, so both sets go through the same transformation.
features = [col for col in train.columns if col not in ['loss']]
train_transform_dic = {}
test_transform_dic = {}
for col_name in features:
    # Pass plain 1-D arrays: one column in, one fitted lambda out.
    train_data, fitted_lambda = stats.yeojohnson(train[col_name].to_numpy())
    test_data = stats.yeojohnson(test[col_name].to_numpy(), lmbda=fitted_lambda)

    train_transform_dic[col_name] = train_data
    test_transform_dic[col_name] = test_data

train_transform = pd.DataFrame(data=train_transform_dic)
# train_transform has a fresh 0..n-1 index while `train` is indexed by "id".
# Assign the target positionally so pandas index alignment can never shift the
# labels or turn them into NaN.
train_transform["loss"] = train["loss"].to_numpy()
test_transform = pd.DataFrame(data=test_transform_dic)

# Deal with outliers

In [None]:
# Histogram of f6 after the Yeo-Johnson transformation.
sns.histplot(train_transform, x="f6", color='b')

In [None]:
# Worked example of the outlier thresholds for a single column.
# NOTE(review): mean/std are hard-coded, presumably copied from a describe()
# output for f6 — confirm they still match the data.
std = 6.533943
mean = 11.655937
# Use the same multiplier as the actual outlier handling (MULTY_STD) so the
# printed thresholds and the message agree with what is applied later.
less_than = mean - (MULTY_STD*std)
higher_than = mean + (MULTY_STD*std)
print(f"In this column, data further than {MULTY_STD} * std from the mean will be considered as outlayer")
print(f"Values less than: {less_than} are outlayers")
print(f"Values higher than: {higher_than} are outlayers")

In [None]:
# Summary statistics of the columns after the Yeo-Johnson transformation.
train_transform.describe()

In [None]:
# Identity of the train_transform object, shown in hexadecimal.
# NOTE(review): presumably kept to check that outlayer_handel works on a copy — confirm or remove.
hex(id(train_transform))

In [None]:
def outlayer_handel(df_in, features, multy_std=8):
    """Replace extreme values of the given columns by the column mean.

    A value is treated as an outlier when it lies more than `multy_std`
    sample standard deviations away from the mean of its column. Means and
    standard deviations are computed from `df_in` itself, before any value is
    replaced.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input table; it is not modified.
    features : list of str
        Columns to clean. Other columns are returned untouched.
    multy_std : int or float, default 8
        Distance from the mean, in standard deviations, beyond which a value
        is replaced.

    Returns
    -------
    pandas.DataFrame
        A copy of `df_in` with the outliers of `features` set to the mean of
        their column.
    """
    df = df_in.copy()  # work on a copy so the caller's frame stays intact
    cols = list(features)
    if not cols:
        return df

    values = df[cols]
    # mean()/std() give the same numbers as describe() without also computing
    # counts and quantiles for every column.
    mean = values.mean()
    std = values.std()
    lower = mean - (multy_std * std)
    upper = mean + (multy_std * std)

    # Per-column comparison: each threshold Series is aligned on column names.
    is_outlier = values.lt(lower, axis='columns') | values.gt(upper, axis='columns')

    for col_name in cols:
        col_mask = is_outlier[col_name]
        if col_mask.any():
            df.loc[col_mask, col_name] = mean[col_name]

    return df

In [None]:
# Replace, in every feature column, the values beyond MULTY_STD standard deviations by the column mean.
train_transform_notlayers = outlayer_handel(train_transform, features, multy_std=MULTY_STD)

In [None]:
# f6 at the three processing stages, side by side. Each panel is titled so the
# stages can be told apart.
fig, axs = plt.subplots(ncols=3, figsize=(15,10))

sns.histplot(train, x="f6", color='b', ax=axs[0])
axs[0].set_title('f6: raw')
sns.histplot(train_transform, x="f6", color='r', ax=axs[1])
axs[1].set_title('f6: after Yeo-Johnson')
sns.histplot(train_transform_notlayers, x="f6", color='r', ax=axs[2])
axs[2].set_title('f6: after Yeo-Johnson + outlier handling')

plt.show()

# Correlation matrix: there are no noticeable correlations between columns.

In [None]:
%%time
# Pairwise correlation of the raw training columns, drawn as a heatmap.
fig, ax = plt.subplots(1, 1, figsize=(12 , 12))

corr = train.corr()

# Hide the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=np.bool_))

palette = sns.diverging_palette(240, 10, as_cmap=True)
sns.heatmap(
    corr,
    ax=ax,
    square=True,
    center=0,
    linewidth=1,
    cmap=palette,
    cbar_kws={"shrink": .82},
    mask=mask,
)

ax.set_title('Correlation', loc='left', fontweight='bold')

plt.show()

# LGBMRegressor model prepare

In [None]:
def fit_regressor(df, tr_idx, val_idx, features_arr, target_str):
    """Train one LightGBM regressor on a train/validation split of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns and the target column.
    tr_idx, val_idx : array-like of int
        Positional row indices of the training and validation rows
        (e.g. as returned by `KFold.split`).
    features_arr : list of str
        Feature column names.
    target_str : str
        Target column name.

    Returns
    -------
    lightgbm.LGBMRegressor
        The fitted model; training stops early when the validation RMSE has
        not improved for 150 rounds.
    """
    import inspect  # local import: only needed for the LightGBM version check

    # The indices are positional, so select features AND target with .iloc.
    # Label-based selection of the target only works while the index happens
    # to be 0..n-1.
    tr_x, tr_y = df[features_arr].iloc[tr_idx], df[target_str].iloc[tr_idx]
    vl_x, vl_y = df[features_arr].iloc[val_idx], df[target_str].iloc[val_idx]
    print({'df size':len(tr_x), 'eval size':len(vl_x)})

    clf = lgb.LGBMRegressor(n_estimators=6000,
                            learning_rate=0.01,
                            feature_fraction=0.9,
                            subsample=0.2,  # each iteration samples 20% of the rows
                            subsample_freq=1,
                            num_leaves=20,
                            metric='rmse')

    # Metric: Root Mean Square Error (RMSE), evaluated on the validation rows.
    fit_kwargs = {'eval_set': [(vl_x, vl_y)]}
    if 'early_stopping_rounds' in inspect.signature(clf.fit).parameters:
        # Older LightGBM: early stopping and logging are fit() arguments.
        fit_kwargs.update(early_stopping_rounds=150, verbose=200)
    else:
        # LightGBM >= 4 removed those arguments in favour of callbacks.
        fit_kwargs['callbacks'] = [lgb.early_stopping(150), lgb.log_evaluation(200)]

    clf.fit(tr_x, tr_y, **fit_kwargs)
    return clf

In [None]:
# Every column except the target is a model input.
features = [col for col in train.columns if col != 'loss']
folds, seed = 4, 20
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

# Train one model per fold; all of them are kept so their predictions can be averaged later.
fold_splits = kf.split(train_transform_notlayers)
models = [
    fit_regressor(train_transform_notlayers, tr_idx, val_idx, features, target)
    for tr_idx, val_idx in tqdm(fold_splits, total=folds)
]

gc.collect() # trigger a manual garbage collection pass after training

# ID3 regression algorithm
1. Calculate the initial system entropy based on the **target** (objective) variable to predict.
    * Entropy: determines which parameters are more important than others, so that the tree can be ordered better.

In [None]:
# Gain-based feature importance of the second fold's model (models[1]).
_ = lgb.plot_importance(models[1], importance_type='gain', figsize=(20,20))

# Check in prediction sample

In [None]:
# Same outlier handling for the test set. outlayer_handel derives mean/std from
# the frame it receives, so the thresholds here come from the test data itself.
test_transform_notlayers = outlayer_handel(test_transform, features, multy_std=MULTY_STD)

In [None]:
# Predict the test set with every fold model, then average the predictions row by row.
fold_preds = []
for model in models:
    fold_preds.append(model.predict(test_transform_notlayers))
out_loss = np.mean(fold_preds, axis=0)

# Save submission

In [None]:
# Start from the sample submission file and overwrite its loss column with the averaged predictions.
# NOTE(review): assumes the sample submission rows are in the same order as the test set — confirm.
submission = pd.read_csv(f'{path}/sample_submission.csv')
submission['loss'] = out_loss

In [None]:
# Display the submission table.
submission

In [None]:
# Write the submission file without the index column, then show its first rows.
submission.to_csv('./submission.csv', index=False)
submission.head(9)