# Tabular Playground Series - Aug 2021

In [None]:
import warnings

# Silence all warning categories for the whole session so that library
# notices do not clutter the notebook output.
warnings.simplefilter("ignore")

## Imports

In [None]:
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import scipy.stats as stats
import statsmodels.api as sm
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor

from sklearn.experimental import enable_hist_gradient_boosting # experimental
from sklearn.ensemble import HistGradientBoostingRegressor

from IPython.display import display, Markdown, Latex

## Configuration

In [None]:
# matplotlib: global font and label sizes applied to every figure below
plt.rcParams.update({
    'font.size': 15,
    'axes.titlesize': 18,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

# seaborn: white background with grid lines
sns.set_style("whitegrid")

In [None]:
class Config:
    """Notebook-wide constants: seed, file locations and column names."""

    # reproducibility and size of the validation split
    RANDOM_STATE = 2021
    TEST_SIZE = 0.3

    # input files and the name of the submission file written at the end
    TRAIN_DATA = '../input/tabular-playground-series-aug-2021/train.csv'
    TEST_DATA = '../input/tabular-playground-series-aug-2021/test.csv'
    SUBMISSION = '../input/tabular-playground-series-aug-2021/sample_submission.csv'
    SUBMISSION_FILE = 'submission.csv'

    # column names: row identifier, target, features f0..f99
    INDEX = 'id'
    TARGET = 'loss'
    FEATURES = ['f{}'.format(i) for i in range(100)]
    COLUMNS = [*FEATURES, TARGET]

    @staticmethod
    def set_seed():
        """Seed Python's and NumPy's global random number generators."""
        for seed_fn in (random.seed, np.random.seed):
            seed_fn(Config.RANDOM_STATE)


Config.set_seed()

## Import Data

In [None]:
# training set: id, features and the target column
train_data = pd.read_csv(filepath_or_buffer=Config.TRAIN_DATA)
train_data.head(n=5)

In [None]:
# test set used for the final submission
test_data = pd.read_csv(filepath_or_buffer=Config.TEST_DATA)
test_data.head(n=5)

In [None]:
# summary statistics, one row per column
train_data[Config.COLUMNS].describe().transpose()

### Notes

* The training data set has 250000 observations with 102 columns (`id`, the 100 features `f0`-`f99`, and the target `loss`).
* The test data set has 150000 observations with 101 columns (`id` and the 100 features).
* The `loss` column is the target variable, which takes only integer values between 0 and 42.

## Exploratory data analysis (EDA)

In [None]:
# Summary statistics with an in-cell bar for the mean and a colour gradient
# for the median.
# NOTE: `Styler.bar` takes a CSS colour, not a colormap name. The previous
# value 'Bules' (a typo of 'Blues') is not a valid colour.
train_data[Config.COLUMNS].describe().T.style.bar(
    subset=['mean'], color='lightblue'
).background_gradient(subset=['50%'], cmap='Blues')  # highlight median

In [None]:
def plot_pdf(
    data: pd.DataFrame,
    feature: str,
    title='Estimate pdf',
    ax=None):
    """Plot a kernel density estimate of one column.

    Parameters
    ----------
    data : DataFrame holding the column to plot.
    feature : name of the column in ``data``.
    title : title set on the axes.
    ax : matplotlib axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    # Identity check: `== None` is not a reliable test for a missing axes
    # (it fails outright if an array of axes is passed by mistake).
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # plot pdf (no `palette`: it only applies together with a `hue` variable)
    sns.kdeplot(
        data=data[feature],
        cumulative=False,
        legend=True,
        ax=ax)

    ax.set_title(title)
    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Density')

    return ax

In [None]:
def plot_cdf(
    data: pd.DataFrame,
    feature: str,
    title='Empirical cdf',
    ax=None):
    """Plot the cumulative distribution of one column.

    The curve is the cumulative form of a kernel density estimate
    (``sns.kdeplot(..., cumulative=True)``), i.e. a smoothed estimate of the
    cdf rather than the step-wise empirical cdf.

    Parameters
    ----------
    data : DataFrame holding the column to plot.
    feature : name of the column in ``data``.
    title : title set on the axes.
    ax : matplotlib axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    # Identity check: `== None` is not a reliable test for a missing axes.
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # plot cdf (no `palette`: it only applies together with a `hue` variable)
    sns.kdeplot(
        data=data[feature],
        cumulative=True,
        legend=True,
        ax=ax)

    ax.set_title(title)
    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Probability')

    return ax

In [None]:
def plot_qq(
    data: pd.DataFrame,
    feature: str,
    title='QQ-Plot',
    ax=None):
    """Draw a QQ-plot of one column with a 45-degree reference line.

    Parameters
    ----------
    data : DataFrame holding the column to plot.
    feature : name of the column in ``data``.
    title : title set on the axes.
    ax : matplotlib axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    # Identity check: `== None` is not a reliable test for a missing axes.
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # qq plot; NOTE(review): with statsmodels' default reference distribution
    # the 45-degree line is only meaningful for standardized data - confirm
    # that callers pass scaled columns.
    sm.qqplot(
        data[feature],
        line='45',
        fmt='--',
        ms=0.1,
        ax=ax)

    ax.set_title(title)
    return ax

In [None]:
def plot_feature_vs_target(
    data: pd.DataFrame,
    feature: str,
    target: str = Config.TARGET,
    title='Feature vs. Target',
    ax=None):
    """Scatter plot of one feature column against the target column.

    Parameters
    ----------
    data : DataFrame holding both columns.
    feature : name of the column shown on the x axis.
    target : name of the column shown on the y axis.
    title : title set on the axes.
    ax : matplotlib axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    # Identity check: `== None` is not a reliable test for a missing axes.
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # scatter plot; semi-transparent markers keep dense regions readable
    sns.scatterplot(
        x=data[feature],
        y=data[target],
        ax=ax,
        alpha=0.4)

    ax.set_title(title)
    ax.set_xlabel('Feature {}'.format(feature))
    ax.set_ylabel('Target')

    return ax

### Target `loss`

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(20, 5))

# one panel per diagnostic: density, cumulative distribution, QQ-plot
for draw, panel in zip((plot_pdf, plot_cdf, plot_qq), ax):
    draw(train_data, Config.TARGET, ax=panel)

plt.show()

In [None]:
def plot_target(target_data, ax=None):
    """Plot the relative frequency of each target value.

    Parameters
    ----------
    target_data : values of the target variable.
    ax : matplotlib axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # `ax` must be passed explicitly: otherwise seaborn draws on the current
    # axes, which is not necessarily the one the caller asked for.
    sns.histplot(
        x=target_data,
        stat='probability',
        discrete=True,
        ax=ax
    )

    ax.set_title('Target distribution')
    ax.set_xlabel('Target values')

    return ax

In [None]:
# single wide panel for the target histogram
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 7))
target_values = train_data[Config.TARGET]
plot_target(target_values, ax=ax)

plt.show()

In [None]:
# Target `loss` distribution: absolute count and share of rows per value
target_counts = train_data[[Config.TARGET]].value_counts()
target_share = np.round(target_counts / train_data.shape[0], 3)

target_distribution = pd.DataFrame({
    'count': target_counts,
    'percent': target_share
})

target_distribution

In [None]:
# summary statistics of the target as a single row
train_data[[Config.TARGET]].describe().transpose()

### Notes

* The distribution of the target variable `loss` is discrete, taking values $0,\dots,42$.
* Mean $\mu=6.81392$ and std $\sigma=7.940179$.

### Features `f0` - `f99`

In [None]:
# skewness and kurtosis of every feature column, indexed by feature name
feature_frame = train_data[Config.FEATURES]

feature_info = pd.DataFrame({
    'feature': feature_frame.columns,
    'skewness': feature_frame.skew(),
    'kurtosis': feature_frame.kurt(),
}).set_index('feature')

feature_info

In [None]:
# normalize data: the feature scaler is fitted on the training set only and
# then reused for the test set
scaler = StandardScaler()
scaler.fit(train_data[Config.FEATURES])

scaled_train_data = train_data.copy()
scaled_train_data[Config.FEATURES] = scaler.transform(train_data[Config.FEATURES])
# the target gets its own, separate scaler
scaled_train_data[Config.TARGET] = StandardScaler().fit_transform(train_data[[Config.TARGET]])

scaled_test_data = test_data.copy()
scaled_test_data[Config.FEATURES] = scaler.transform(test_data[Config.FEATURES])

In [None]:
for feature in Config.FEATURES:
    display(Markdown('#### Plot feature `{}`'.format(feature)))

    fig, ax = plt.subplots(1, 4, figsize=(25, 5))
    pdf_ax, cdf_ax, qq_ax, scatter_ax = ax

    # train pdf first, then test pdf, overlaid on the same panel
    for frame in (scaled_train_data, scaled_test_data):
        plot_pdf(frame, feature, ax=pdf_ax)

    # the remaining panels use the training data only
    plot_cdf(scaled_train_data, feature, ax=cdf_ax)
    plot_qq(scaled_train_data, feature, ax=qq_ax)
    plot_feature_vs_target(scaled_train_data, feature, ax=scatter_ax)

    plt.show()

## Correlation

The features and the target variables all have a low correlation with each other.

In [None]:
# pairwise correlation between all features and the target
columns_frame = train_data[Config.COLUMNS]
corr_matrix = columns_frame.corr()

In [None]:
plt.figure(figsize=(20, 15))

# Hide the diagonal and the upper triangle, which only repeat the lower one.
# The mask is built from the matrix shape rather than from its values, so a
# correlation of exactly 0 in the upper triangle is hidden as well.
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='Blues',
    mask=upper_triangle,
    linewidths=0.1,
    linecolor='white',
    cbar=True
)

plt.show()

## Principal component analysis (PCA)

In [None]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

In [None]:
n_components = 100
pca_cols = ['pc{}'.format(i) for i in range(1, n_components + 1)]

# features are min-max scaled before the projection
pca = make_pipeline(
    MinMaxScaler(),
    PCA(n_components=n_components, random_state=Config.RANDOM_STATE),
)

components = pca.fit_transform(train_data[Config.FEATURES])

In [None]:
# Target followed by all principal components, built with one concat:
# inserting the 100 component columns one by one fragments the DataFrame.
pca_data = pd.concat(
    [
        train_data[[Config.TARGET]],
        pd.DataFrame(components, columns=pca_cols, index=train_data.index),
    ],
    axis=1,
)

# keep an 8% random sample so the scatter plots stay readable
pca_data = pca_data.sample(frac=0.08, random_state=Config.RANDOM_STATE)

In [None]:
variance = pca['pca'].explained_variance_ratio_
# Accumulate the exact ratios and round once at the end: rounding every
# ratio first lets the rounding errors add up across the components.
var = np.round(np.cumsum(variance) * 100, decimals=1)
# 1-based component numbers so the x axis matches "# of Components"
component_numbers = np.arange(1, len(variance) + 1)

fig, ax = plt.subplots(1, 2, figsize=(12, 5))

ax[0].plot(component_numbers, variance)
ax[0].set_xlabel('# of Components')
ax[0].set_ylabel('Explained variance')
ax[0].set_title("PCA Analysis")

ax[1].plot(component_numbers, var)
ax[1].set_ylabel('% Variance Explained')
ax[1].set_xlabel('# of Components')

fig.tight_layout()
plt.show()

In [None]:
def plot_pca(data, x, y, ax=None):
    """Scatter plot of two principal components, coloured by the target.

    Parameters
    ----------
    data : DataFrame holding the component columns and the target column.
    x : name of the column shown on the x axis.
    y : name of the column shown on the y axis.
    ax : matplotlib axes to draw on; a new figure is created when omitted.

    Returns
    -------
    The axes that were drawn on (returned for consistency with the other
    plot helpers; existing callers that ignore it are unaffected).
    """
    # Identity check: `== None` is not a reliable test for a missing axes.
    if ax is None:
        _, ax = plt.subplots(1, 1)

    sns.scatterplot(
        data=data,
        x=x,
        y=y,
        hue=Config.TARGET,
        palette='Blues_r',
        alpha=0.4,
        ax=ax)

    return ax

In [None]:
fig, ax = plt.subplots(1, 4, figsize=(25, 5))

# consecutive pairs of the first five principal components, one per panel
component_pairs = [('pc1', 'pc2'), ('pc2', 'pc3'), ('pc3', 'pc4'), ('pc4', 'pc5')]
for panel, (x_name, y_name) in zip(ax, component_pairs):
    plot_pca(pca_data, x_name, y_name, ax=panel)

plt.show()

In [None]:
fig, ax = plt.subplots(1, 5, figsize=(25, 5))

# density of each of the first five principal components
for panel, component in zip(ax, ('pc1', 'pc2', 'pc3', 'pc4', 'pc5')):
    plot_pdf(pca_data, component, ax=panel)

fig.tight_layout()
fig.show()

In [None]:
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# first three principal components as coordinates, target as colour
pc_x, pc_y, pc_z = (pca_data[name] for name in ('pc1', 'pc2', 'pc3'))
ax.scatter(pc_x, pc_y, pc_z, alpha=0.8, c=pca_data[Config.TARGET], cmap='Blues')

fig.show()

## Data preprocessing

### Missing values

Neither the training nor the test data set contains any missing values.

In [None]:
# total number of missing cells per data set
missing_counts = {
    'train': train_data.isna().sum().sum(),
    'test': test_data.isna().sum().sum(),
}

pd.DataFrame({
    'data_set': list(missing_counts),
    'missing_values': list(missing_counts.values()),
}).set_index('data_set')

## Modeling

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# empty results table, indexed by model name, with a single `rmse` column
score_columns = {'model': [], 'rmse': []}
score = pd.DataFrame(score_columns).set_index('model')

In [None]:
X_data, y_data = train_data[Config.FEATURES], train_data[Config.TARGET]

# split data into train and validation data sets
split_options = dict(test_size=Config.TEST_SIZE, random_state=Config.RANDOM_STATE)
X_train, X_vaild, y_train, y_vaild = train_test_split(X_data, y_data, **split_options)

In [None]:
# number of rows in each part of the split
n_train, n_valid = X_train.shape[0], X_vaild.shape[0]
print(f'Train size     : {n_train}')
print(f'Validation size: {n_valid}')

In [None]:
def create_model(regressor):
    """Wrap ``regressor`` in the preprocessing shared by all models.

    The features pass through a ``StandardScaler`` and then a
    ``PowerTransformer`` before they reach ``regressor``. The whole pipeline
    is wrapped in a ``TransformedTargetRegressor`` that uses a
    ``StandardScaler`` as the target transformer.

    Returns the unfitted wrapped model.
    """
    feature_pipeline = make_pipeline(
        StandardScaler(),
        PowerTransformer(),
        regressor,
    )
    return TransformedTargetRegressor(
        regressor=feature_pipeline,
        transformer=StandardScaler(),
    )

In [None]:
# Base learners of the voting ensemble. The stochastic estimators get an
# explicit seed: the global seed set by `Config.set_seed()` does not reach
# the worker processes used with `n_jobs=-1`, so without it the results are
# not reproducible.
models = [
    ('lr', create_model(LinearRegression())),
    ('ridge', create_model(Ridge(alpha=0.75))),
    ('hgb', create_model(HistGradientBoostingRegressor(random_state=Config.RANDOM_STATE))),
    ('gb', create_model(GradientBoostingRegressor(random_state=Config.RANDOM_STATE))),
    ('xgb', create_model(xgb.XGBRegressor(random_state=Config.RANDOM_STATE))),
    ('rf', create_model(RandomForestRegressor(random_state=Config.RANDOM_STATE)))
]

# one weight per entry of `models`, in the same order
weights = [0.1, 0.1, 0.4, 0.1, 0.2, 0.1]

In [None]:
# weighted average of all base learners, fitted in parallel
model = VotingRegressor(
    estimators=models,
    weights=weights,
    n_jobs=-1,
    verbose=True
)

y_pred = model.fit(X_train, y_train).predict(X_vaild)

# Record the validation RMSE as a row of the `score` results table instead of
# overwriting that table with a bare float.
voting_rmse = rmse(y_vaild, y_pred)
score.loc['voting', 'rmse'] = voting_rmse

print('RMSE: {}'.format(voting_rmse))

In [None]:
# predictions next to the true validation targets
result_columns = {'pred': y_pred, 'true': y_vaild}
result_data = pd.DataFrame(result_columns)

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
pdf_ax, cdf_ax, scatter_ax = ax

# predicted and true distributions are overlaid in the first two panels
for column in ('pred', 'true'):
    plot_pdf(result_data, column, ax=pdf_ax)

for column in ('pred', 'true'):
    plot_cdf(result_data, column, ax=cdf_ax)

# prediction against truth in the last panel
plot_feature_vs_target(result_data, 'pred', target='true', ax=scatter_ax)

plt.show()

## Submission

In [None]:
# predict the target for the test set with the fitted ensemble
test_features = test_data[Config.FEATURES]
y_pred_submission = model.predict(test_features)

In [None]:
# one row per test id with its predicted target
submission_columns = {
    Config.INDEX: test_data[Config.INDEX],
    Config.TARGET: y_pred_submission,
}
submission_data = pd.DataFrame(submission_columns).set_index(Config.INDEX)

submission_data

In [None]:
# write the submission (the id index becomes the first CSV column)
submission_data.to_csv(path_or_buf=Config.SUBMISSION_FILE)