### Import Libraries / Load Data :

In [None]:
import math
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, learning_curve

from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor 
from catboost import CatBoostRegressor
from mlxtend.regressor import StackingCVRegressor

from sklearn.metrics import mean_squared_error

import warnings
# Silence every Python warning for the rest of the session so that library
# notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Raise the pandas display limits so wide/long frames are shown untruncated.
# The canonical option names are used: abbreviated keys are only resolved
# through pattern matching and fail as soon as they match more than one option.
pd.set_option('display.max_columns', 102)
pd.set_option('display.max_rows', 250000)

In [None]:
# Read the competition tables. Train and test use their `id` column as the
# index; the sample submission keeps pandas' default integer index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/tabular-playground-series-aug-2021/train.csv',
        '/kaggle/input/tabular-playground-series-aug-2021/test.csv',
    )
)

sub = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv')

## EDA :

In [None]:
# Display the training frame's dimensions as a (rows, columns) tuple.
train.shape

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the training frame's column dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Print the same dtype / non-null summary for the test frame.
test.info()

 Wow, there are no missing values in either the train or the test set, so no imputation is needed!

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
train.describe()

The features span very different ranges, so we should normalize them (even though scaling is not strictly necessary for tree-based models ;)).

#### Target visualization :

In [None]:
# Histogram of the target column, drawn on an explicit 12x6 inch figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(train['loss'], ax=ax)

In [None]:
# Count how often each distinct target value occurs, most frequent first.
train.loss.value_counts()

So strange! Our target looks like the target of a classification task, but the competition metric is RMSE, which is a regression metric.

#### Correlations :

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(14, 14))

# Pairwise correlation between every column of the training frame
# (features and target alike).
corr = train.corr()

# Boolean mask that hides the upper triangle (diagonal included), so each
# pair is drawn only once. The builtin `bool` is used because the `np.bool`
# alias no longer exists in current NumPy releases.
mask = np.triu(np.ones(corr.shape, dtype=bool))

sns.heatmap(corr, ax=ax,
        square=True, center=0, linewidth=1,
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        cbar_kws={"shrink": .6},
        mask=mask
       )

ax.set_title('Correlation', loc='left', fontweight='bold')

plt.show()

In [None]:
# Correlation of every column with the target (the `loss` column of `corr`).
corr.loss

The correlation between the target and each of the features is very weak!

#### Features distributions :

In [None]:
# Stack train and test into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and yields the same rows and columns (`loss` is NaN for test rows).
df = pd.concat([train, test]).reset_index(drop=True)

In [None]:
# Keep only the feature columns. `errors='ignore'` makes the cell safe to
# re-run: on a second execution `loss` is already gone and a plain drop
# would raise KeyError.
df = df.drop(columns=['loss'], errors='ignore')
columns = df.columns.values

cols = 3
# Exactly as many rows as needed (no spare empty row when the feature count
# is a multiple of `cols`), but never fewer than one.
rows = max(1, math.ceil(len(columns) / cols))

# squeeze=False keeps `axs` two-dimensional even for a single-row grid.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16, 100),
                        sharex=False, squeeze=False)

plt.subplots_adjust(hspace=0.3)

# Walk the grid row by row; panels beyond the last feature are hidden.
for i, panel in enumerate(axs.flat):
    if i >= len(columns):
        panel.set_visible(False)
        continue

    feature = columns[i]
    # Shared bin range over train + test so both histograms are comparable.
    value_range = (df[feature].min(), df[feature].max())

    for frame, label in ((train, "Train Dataset"), (test, "Test Dataset")):
        panel.hist(frame[feature].values,
                   range=value_range,
                   bins=40,
                   alpha=0.7,
                   label=label)

    panel.set_title(feature, fontsize=14, pad=5)
    panel.tick_params(axis="y", labelsize=13)
    panel.tick_params(axis="x", labelsize=13)
    panel.grid(axis="y")
    panel.legend(fontsize=13)

plt.show()

 Great: the feature distributions of the train and test sets are well balanced.

## Pre-Processing :

#### Scaling data :

In [None]:
# Scale the 100 feature columns f0..f99. The scaler is fitted on the training
# rows only and the same fitted scaler is then applied to both frames.
features = [f'f{i}' for i in range(100)]
ss = StandardScaler().fit(train[features])
train[features] = ss.transform(train[features])
test[features] = ss.transform(test[features])

#### Convert float to int :

 Now let's detect whether there are features that hold only whole numbers (stored as floats), and convert them to int.

In [None]:
# Cast a feature to int only when every value in BOTH frames is already a
# whole number, so the cast cannot lose information.
# The check runs on the current `train`/`test` values rather than on the
# earlier combined frame, whose contents pre-date later in-place changes to
# these columns. The result of `astype` is assigned back, because `astype`
# returns a new Series and never modifies the column in place.
for col in test.columns:
    values = np.concatenate([train[col].to_numpy(), test[col].to_numpy()])
    if np.array_equal(values, values.astype(int)):
        print(col)
        train[col] = train[col].astype('int')
        test[col] = test[col].astype('int')

#### Train-Test split :

In [None]:
# Separate the target from the features, then hold out 20% of the rows
# (fixed seed) for evaluating the models below.
X = train.drop(columns='loss')
y = train['loss']

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## Modeling and Evaluation :

#### First model : 

In [None]:
# Linear baseline: fit an ElasticNet on the training split and report its
# RMSE on the hold-out split.
eNet = ElasticNet(alpha=0.0005, l1_ratio=0.9)
eNet.fit(x_train, y_train)

enet_holdout_pred = eNet.predict(x_test)
enet_rmse = np.sqrt(mean_squared_error(y_test, enet_holdout_pred))
print("RMSE", enet_rmse)

#### Second model :

In [None]:
# Gradient-boosted baseline, fitted on the training split.
# NOTE(review): no eval_set is passed to fit(), so `early_stopping_rounds`
# presumably has nothing to monitor — confirm against the CatBoost docs.
cat_model = CatBoostRegressor(random_state=42, iterations=2000,
                              learning_rate=0.005, early_stopping_rounds=50)

# Fit exactly once, silently. The original cell fitted the same model a second
# time without `verbose=0`, which repeated the whole training run and printed
# the per-iteration log.
cat_model.fit(x_train, y_train, verbose=0)

print("RMSE", np.sqrt(mean_squared_error(y_test, cat_model.predict(x_test))))

#### Final model = meta-model :

In [None]:
# LightGBM regressor, later reused as the meta-model of the stack.
# Hyper-parameters are collected in one mapping for readability.
lgbm_params = dict(
    objective='regression',
    n_estimators=720,
    learning_rate=0.05,
    num_leaves=5,
    max_bin=55,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
    bagging_fraction=0.8,
    bagging_freq=5,
    bagging_seed=9,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
)
LGBMReg = LGBMRegressor(**lgbm_params)

LGBMReg.fit(x_train, y_train)

lgbm_holdout_pred = LGBMReg.predict(x_test)
print("RMSE", np.sqrt(mean_squared_error(y_test, lgbm_holdout_pred)))

#### Stacking :

In [None]:
# 10-fold splitter passed to the stacking regressor as its `cv` strategy.
kfold = KFold(n_splits=10)

# Level-one learners whose predictions feed the meta-regressor.
base_models = (eNet, cat_model)

# Stack the base models under the LightGBM meta-regressor. The original
# features are passed to the meta-regressor as well
# (use_features_in_secondary=True) and the training meta-features are stored.
stack = StackingCVRegressor(
    regressors=base_models,
    meta_regressor=LGBMReg,
    cv=kfold,
    shuffle=False,
    random_state=1,
    use_features_in_secondary=True,
    store_train_meta_features=True,
)

In [None]:
# Display the configured cross-validation splitter.
kfold

In [None]:
# Train the stacked ensemble on the training split and report its RMSE on
# the hold-out split.
stack.fit(x_train, y_train)

stack_holdout_pred = stack.predict(x_test)
stack_rmse = np.sqrt(mean_squared_error(y_test, stack_holdout_pred))
print("RMSE", stack_rmse)

#### Submission :

In [None]:
# Write the ensemble's test predictions into the submission's `loss` column.
# Item assignment is used because attribute assignment (`sub.loss = ...`)
# only updates an existing column; if the column were missing it would set a
# plain Python attribute and the predictions would never reach the CSV.
sub['loss'] = stack.predict(test)

In [None]:
# Preview the first rows of the submission frame.
sub.head()

In [None]:
# Save the submission without the index column. The file name is spelled out
# in full (the original 'submissio.csv' was a typo).
sub.to_csv('submission.csv', index=False)

### If you find this notebook useful, please don't forget to upvote it!