In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and echo the full path of every file found there.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> Table of Contents </h1>

* [1) Import Required Libraries](#1)

* [2) Read Data](#2)

* [3) EDA](#3)

 * [3.1) Relation between Features](#3.1)
 
* [4) Feature Engineering](#4)

 * [4.1) Missing Values](#4.1)
 
 * [4.2) Outliers](#4.2)

* [5) Model building and Evaluation](#5)

 * [5.1) LightGBM](#5.1)

* [6) AutoML](#6)

### INTRODUCTION

Kaggle competitions are incredibly fun and rewarding, but they can also be intimidating for people who are relatively new to their data science journey. In the past, Kaggle has launched many Playground competitions that are more approachable than Featured competitions, and thus more beginner-friendly.

The goal of these competitions is to provide a fun, but less challenging, tabular dataset. These competitions will be great for people looking for something in between the Titanic Getting Started competition and a Featured competition.

The dataset used for this competition is synthetic, but it is based on a real dataset and was generated using a CTGAN. The original dataset deals with calculating the loss associated with loan defaults. Although the features are anonymized, they have properties relating to real-world features.

Submissions are scored on the root mean squared error (RMSE), which is defined as:

$$\textrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$
 
where $\hat{y}_i$ is the predicted value, $y_i$ is the ground truth value, and $n$ is the number of rows in the test data.

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 1) Import Required Libraries </h1>

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns

import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# also hides deprecation warnings from the plotting and ML libraries used below.
warnings.filterwarnings('ignore')

# Apply the matplotlib style sheet first and the seaborn style second.
# NOTE(review): presumably the order matters, since the later call can override
# settings made by the earlier one — confirm before reordering.
plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")

In [1]:
from sklearn.model_selection import KFold, cross_validate, train_test_split
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn import metrics
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from sklearn import model_selection
from sklearn.ensemble import StackingRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 2) Read Data </h1>

In [1]:
# Load the three competition files in one pass: train, test, sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv",
    )
)

In [1]:
# Preview the first rows of each frame (display renders each argument in turn).
display(train.head(), test.head(), submission.head())

In [1]:
# (rows, columns) of each frame, shown in the order train, test, submission.
display(train.shape, test.shape, submission.shape)

In [1]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the cell output.
train.info()
test.info()

In [1]:
# Keep untouched copies of the raw frames so later steps can refer back to them.
train_original, test_original = train.copy(), test.copy()

In [1]:
# Remove the row identifier, which is not a feature. The working frames are
# derived from the preserved originals (not mutated in place), so re-running
# this cell is idempotent instead of raising KeyError on the second run.
train = train_original.drop(columns=['id'])
test = test_original.drop(columns=['id'])

In [1]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
train.describe()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 3) EDA </h1>

### Target Column

In [1]:
# Summary statistics of the target column.
print('Target column basic statistics:')
train.loc[:, 'loss'].describe()

In [1]:
# Number of distinct target values and how often each one occurs.
loss_values = train['loss']
print("total unique values for loss:", loss_values.nunique())
print("\n\n Unique values:\n\n", loss_values.value_counts())

In [1]:
# Quartiles plus the 90th percentile of the target.
loss_quantile_levels = [0.25, 0.5, 0.75, 0.90]
train['loss'].quantile(loss_quantile_levels)

In [1]:
# Frequency of each distinct target value, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(15, 9))
sns.countplot(data=train, x='loss', ax=ax);

In [1]:
# Kernel density estimate of the target, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 8))

# The legend keyword is lowercase `label`; the original `Label` is not a valid
# artist property.
# NOTE(review): `shade` is kept as in the original; newer seaborn releases
# prefer `fill` — confirm against the installed version before switching.
sns.kdeplot(train['loss'], shade=True, color='blue', label='Target-loss', ax=ax)

# `fontsize` and `size` are aliases; only the explicit size is kept.
ax.set_title("Distribution of the Target", fontweight="bold", size=20)

# Axis labels
ax.set_xlabel('Target Loss')
ax.set_ylabel('Probability Density')

# Show the legend so the curve's label is actually rendered.
ax.legend()

# Annotation box placed in figure coordinates.
fig.text(
    0.4,
    0.5,
    """
The target has a skewed distribution.
""",
    bbox=dict(boxstyle="round", fc="#009473"),
    fontsize="medium",
)

plt.show()

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.1) Relation between Features </h1>

In [1]:
# Pairwise correlation of every column in the training frame.
corr = train.corr()
# Boolean mask covering the upper triangle (diagonal included), so only the
# lower triangle of the matrix is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(12, 12))
heatmap_options = dict(
    mask=mask,
    cmap='twilight_r',
    robust=True,
    center=0,
    square=True,
    linewidths=.6,
)
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation")
plt.show()

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.2) Correlation between the continuous features and the target </h1>

In [1]:
# Correlation of every column with the target, sorted ascending so the head
# holds the lowest values and the tail the highest.
correlations = train.corr().loc[:, 'loss'].sort_values()

print("Lowest correlation features:\n")
print(correlations.head(15), "\n")

print("Highest correlation features:\n")
print(correlations.tail(15))

In [1]:
# Bar chart of the `loss` column of the correlation matrix with its final entry
# dropped (presumably the target's correlation with itself).
plt.figure(figsize=(24, 8))
feature_target_corr = corr["loss"].iloc[:-1]
feature_target_corr.plot.bar(grid=True)
plt.title("Features Correlation")

In [1]:
# Correlation of every column with the target, sorted from highest to lowest.
# The correlation matrix is computed once here; the original cell also computed
# it a second time in a statement whose result was discarded.
df_corr_train = train.corr()['loss'].sort_values(ascending=False)

# Horizontal bar chart of the sorted correlations.
fig, ax = plt.subplots(figsize=(18, 25))
df_corr_train.plot(kind='barh', ax=ax)
plt.show()

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 3.3) Skewness and Kurtosis </h1>

In [1]:
# Distribution of feature "f77", with its sample skewness shown in the title.
Skew_f77 = train['f77'].skew()

fig, ax = plt.subplots(figsize=(7, 5))
sns.distplot(train['f77'], ax=ax)
ax.set_title("Skew:" + str(Skew_f77), size=15, fontweight='bold')
plt.show()

In [1]:
# The 50 most positively skewed columns, presented as a one-column table.
skew_feats = (
    train
    .skew()
    .sort_values(ascending=False)
    .head(50)
)

skewness = skew_feats.to_frame(name='Skew')
skewness

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 4) Feature Engineering </h1>

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 4.1) Missing Values </h1>

In [1]:
# Missing-value count per column, largest first, keeping only columns that
# actually have at least one missing value.
total = (
    train
    .isnull()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
total

In [1]:
# Total number of missing cells across the whole training frame.
train.isna().sum().sum()

In [1]:
# Count duplicated rows (True) versus first occurrences (False) in the training frame.
train.duplicated().value_counts()

In [1]:
# Count duplicated rows (True) versus first occurrences (False) in the test frame.
test.duplicated().value_counts()

In [1]:
# Remove all duplicate rows.
# The "before" shape is taken from the working frame itself; the original cell
# reported train_original.shape, which still contains the `id` column, so the
# two shapes were not comparable.
shape_before_dedup = train.shape
train = train.drop_duplicates()
print("Dataset size before removing duplicates:", shape_before_dedup)
print("Dataset size after removing duplicates:", train.shape)

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left; border-radius: 0px 0px;"> 4.2) Outliers </h1>

In [1]:
# One horizontal box per column, on a log-scaled x axis so that columns with
# very different ranges remain readable.
fig, ax = plt.subplots(figsize=(22, 55))
sns.boxplot(data=train, orient="h", ax=ax)
ax.set_xscale('log')

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 5) Model building and Evaluation </h1>

In [1]:
# Feature matrix (everything except the target) and the target vector.
X, y = train.drop(columns='loss'), train['loss']

In [1]:
# Hold out 10% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

<h1 style="background-color:LightBlue; font-family:newtimeroman; font-size:180%; text-align:left; border-radius: 0px 0px;"> 5.1) LightGBM </h1>

In [1]:
# LightGBM regressor hyper-parameters gathered in one place.
lgbm_params = dict(
    learning_rate=0.07,
    max_depth=8,
    n_estimators=200,
    objective='regression',
    n_jobs=-1,
)
LGBM = LGBMRegressor(**lgbm_params)
LGBM.fit(X_train, y_train)

In [1]:
# Predictions on the hold-out split and on the competition test set, in that order.
pred_lgbm, pred_lgbm_test = (LGBM.predict(features) for features in (X_test, test))

In [1]:
# Root mean squared error on the hold-out split. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate; the printed label now says RMSE, matching the value.
rmse_LGBM = np.sqrt(mean_squared_error(y_test, pred_lgbm))
print('RMSE score: ', rmse_LGBM)

<h1 style="background-color:LightBlue; font-family:newtimeroman; font-size:180%; text-align:left; border-radius: 0px 0px;"> 5.2) XGBRegressor </h1>

## Baseline

In [1]:
# Baseline XGBoost regressor hyper-parameters gathered in one place.
xgb_params = dict(
    learning_rate=0.05,
    n_estimators=200,
    min_child_weight=11,
)
XGB = XGBRegressor(**xgb_params)
XGB.fit(X_train, y_train)

In [1]:
# Predictions on the hold-out split and on the competition test set, in that order.
pred_xgb, pred_xgb_test = (XGB.predict(features) for features in (X_test, test))

In [1]:
# Root mean squared error on the hold-out split. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate; the printed label now says RMSE, matching the value.
rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))
print('RMSE score: ', rmse_xgb)

In [1]:
# Feature-importance chart of the fitted XGBoost model on a tall figure.
fig, ax = plt.subplots(figsize=(24, 34))
xgb.plot_importance(XGB, ax=ax)

<h1 style="background-color:LightBlue; font-family:newtimeroman; font-size:180%; text-align:left; border-radius: 0px 0px;"> 5.3) CatBoost Regressor </h1>

In [1]:
# `early_stopping_rounds` needs a validation set to monitor; the original call
# passed none, so the setting had no effect. A validation slice is carved out
# of the training split (not the hold-out split, which is reserved for scoring)
# and handed to CatBoost as `eval_set`.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=0
)

Cat = CatBoostRegressor(
    random_state=42,
    iterations=5000,
    learning_rate=0.005,
    early_stopping_rounds=50,
)
Cat.fit(X_fit, y_fit, eval_set=(X_val, y_val), verbose=0)

In [1]:
# Predictions on the hold-out split and on the competition test set, in that order.
pred_cat, pred_cat_test = (Cat.predict(features) for features in (X_test, test))

In [1]:
# Root mean squared error on the hold-out split. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate; the printed label now says RMSE, matching the value.
rmse_cat = np.sqrt(mean_squared_error(y_test, pred_cat))
print('RMSE score: ', rmse_cat)

## Ensemble

In [1]:
# Weighted blend of the three models' test predictions:
# CatBoost 0.4, LightGBM 0.4, XGBoost 0.2.
ensembled = (
    0.4 * pred_cat_test
    + 0.4 * pred_lgbm_test
    + 0.2 * pred_xgb_test
)

In [1]:
# Put the blended predictions into the submission's `loss` column and show it.
submission = submission.assign(loss=ensembled)
submission

In [1]:
# Write the ensemble submission to the working directory, without the index column.
submission.to_csv("submissionensemble.csv", index = False)

### Stacking

In [1]:
# Base learners for the stack, each with library-default hyper-parameters:
# LightGBM, XGBoost and CatBoost, in that order.
myclf1, myclf2, myclf3 = LGBMRegressor(), XGBRegressor(), CatBoostRegressor()

In [1]:
# Define meta model: the regressor that combines the base learners' outputs.
# It is a CatBoost regressor with default settings, despite the `lr` in its name.
mylr = CatBoostRegressor()

In [1]:
from mlxtend.regressor import StackingCVRegressor

# Cross-validated stack: the base learners' out-of-fold predictions, together
# with the original features (use_features_in_secondary=True), feed the meta
# model. `random_state` is fixed so the fold assignment, and therefore the
# stacked result, is repeatable from run to run.
stack = StackingCVRegressor(
    regressors=(myclf1, myclf2, myclf3),
    meta_regressor=mylr,
    use_features_in_secondary=True,
    random_state=0,
)

stack.fit(X_train, y_train)

In [1]:
# Predictions on the hold-out split and on the competition test set, in that order.
pred_stack, pred_stack_test = (stack.predict(features) for features in (X_test, test))

In [1]:
# Root mean squared error on the hold-out split. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate; the printed label now says RMSE, matching the value.
rmse_stack = np.sqrt(mean_squared_error(y_test, pred_stack))
print('RMSE score: ', rmse_stack)

In [1]:
# Put the stacked model's predictions into the submission's `loss` column and show it.
submission = submission.assign(loss=pred_stack_test)
submission

In [1]:
# Write the stacked-model submission to the working directory, without the index column.
submission.to_csv("submissionstack.csv", index = False)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 6) AutoML </h1>

In [1]:
import datatable as dt

# Re-read the raw train and test files as datatable Frames for the AutoML section.
train_ML, test_ML = (
    dt.fread(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
    )
)

In [1]:
!python3 -m pip install -q lightautoml

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [1]:
# Pull the target column out as a flat NumPy array before it is removed from the frame.
target = train_ML['loss'].to_numpy().ravel()

# Delete the `id` and `loss` columns from train_ML in place
# (presumably leaving only the feature columns).
del train_ML[:, ['id', 'loss']]
# Keep the test ids; they are used when the AutoML submission is assembled.
test_ids = test_ML[:, 'id']
# Select from the test frame the columns named in train_ML.
# NOTE(review): this rebinds `test`, which earlier in the notebook is a pandas DataFrame.
test = test_ML[:, train_ML.names]

In [1]:
# Temporarily attach the target to the training frame under the name `target`.
train_ML['target'] = dt.Frame(target)

# Regression task for LightAutoML, with a 500 timeout and verbose logging.
automl_task = Task('reg')
model = TabularAutoML(task=automl_task, timeout=500, verbose=2)

# Fit on the pandas view of the frame; the role mapping marks the target column.
train_frame = train_ML.to_pandas()
model.fit_predict(train_data=train_frame, roles={'target': 'target'})

# Detach the target again so train_ML holds the same columns as before the fit.
del train_ML['target']

In [1]:
# Predict on exactly the columns present in train_ML (the target was detached
# after fitting), rather than on the raw test frame, which still carries `id`.
preds = model.predict(test_ML[:, train_ML.names].to_pandas()).data.ravel()

In [1]:
# Assemble the AutoML submission as a datatable Frame from the saved test ids
# and the predictions.
# NOTE(review): this rebinds `submission`, which earlier in the notebook is the
# pandas DataFrame read from sample_submission.csv.
submission = dt.Frame(id=test_ids, loss=preds)

submission.head()

In [1]:
# Write the AutoML submission to the working directory
# (`submission` is a datatable Frame at this point).
submission.to_csv("submission_AutoML.csv")