In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Data preparation/EDA

## **Load and visualize dataset**

In [None]:
# Read the train table, the test table and the sample submission in one pass.
df_train, df_test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv",
    )
)

In [None]:
# Columns/rows count
# DataFrame.shape is (n_rows, n_columns) -- unpack in that order so the labels
# printed below match the numbers next to them.
rows, cols = df_train.shape
print("Number of columns: ", cols)
print("Number of rows: ", rows)

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

In [None]:
# Summary statistics of the training frame, transposed so each column gets its own row.
df_train.describe().transpose()

In [None]:
# Total number of missing cells in the training frame (per-column counts, then summed).
df_train.isna().sum().sum()

In [None]:
# Summary statistics of the test frame, transposed so each column gets its own row.
df_test.describe().transpose()

In [None]:
# Total number of missing cells in the test frame (per-column counts, then summed).
df_test.isna().sum().sum()

In [None]:
# Number of distinct values in each training column.
unique_counts = df_train.nunique()
print("Number of unique values:\n", unique_counts)

## **Exploratory Data Analysis**

In [None]:
# Loss distribution
# sns.distplot is deprecated and slated for removal; a density-normalised histogram
# with a KDE overlay is seaborn's documented replacement for its default output.
sns.histplot(df_train['loss'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Bar chart with the number of training rows at each distinct loss value.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x='loss', data=df_train, palette='icefire', ax=ax)

In [None]:
# Correlation matrix of every training column, drawn as a lower-triangle heatmap.
corr = df_train.corr()
# True entries are hidden by heatmap: this masks the diagonal and the upper triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    corr,
    mask=mask,
    cmap='twilight_r',
    center=0,
    robust=True,
    square=True,
    linewidths=.6,
    ax=ax,
)
ax.set_title("Correlation")
plt.show()

In [None]:
# Rank every column by its correlation with the target.
# 'loss' is removed from the ranking: its correlation with itself is trivially 1.0
# and would otherwise always be reported as the top "feature".
correlations = df_train.corr()['loss'].drop('loss').sort_values()
print("Lowest correlation features:\n")
print(correlations.head(15), "\n")
print("Highest correlation features:\n")
print(correlations.tail(15))

In [None]:
# Bar chart of each column's correlation with loss. The final entry is skipped
# (presumably loss's own self-correlation -- assumes 'loss' is the last column).
fig, ax = plt.subplots(figsize=(24, 8))
corr["loss"].iloc[:-1].plot(kind="bar", grid=True, ax=ax)
ax.set_title("Features Correlation")

In [None]:
# Drop ID columns
# Rebind to a new frame instead of mutating in place, and tolerate an already
# missing 'id' column so that re-running this cell does not raise KeyError.
df_train = df_train.drop(columns='id', errors='ignore')
df_test = df_test.drop(columns='id', errors='ignore')

In [None]:
# Each feature distribution: overlay the train and test histogram of every feature.
# Features are selected by name rather than by position, so the cell does not
# depend on how many columns there are or on 'id' having been dropped already.
feature_cols = [col for col in df_train.columns if col not in ('id', 'loss')]

n_cols = 3
# The subplot grid needs an integer row count; use just enough rows for all features.
n_rows = max(1, int(np.ceil(len(feature_cols) / n_cols)))

# 3.2 inches per row keeps the per-panel height of the original 160in / 50-row layout.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3.2 * n_rows), squeeze=False)
fig.subplots_adjust(wspace=.25, hspace=.6)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, feature_cols):
    # Labels are attached to the plotted artists so each legend entry is tied to
    # its own histogram instead of being matched to artists by position.
    sns.histplot(x=df_train[col], alpha=0.5, edgecolor="black", color='#3e3b92', label='Train', ax=ax)
    sns.histplot(x=df_test[col], alpha=0.5, edgecolor="black", color='#00ee6e', label='Test', ax=ax)
    ax.set_yticks([])
    ax.legend()

# Hide the unused panels of the last grid row.
for ax in flat_axes[len(feature_cols):]:
    ax.set_visible(False)

## **Data split**

In [None]:
# Split the training frame into the target column and the remaining feature columns.
y = df_train['loss']
x = df_train.drop(columns='loss')

In [None]:
# Hold out 20% of the rows as a validation set (fixed seed for reproducibility).
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x, y, random_state=42, test_size=0.2)

# Report the shape of each piece of the split.
for label, part in (
    ('X_train:', x_train),
    ('y_train:', y_train),
    ('X_val:', x_val),
    ('y_val:', y_val),
):
    print(label, part.shape)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# then applied to both the training and the validation split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# Model Training

## **CatBoost**

In [None]:
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor(random_state=42,iterations = 5000,learning_rate=0.005, early_stopping_rounds=50)
# early_stopping_rounds only takes effect when a validation set is supplied;
# without eval_set the model silently trains all 5000 iterations.
# NOTE: the hold-out split now also drives early stopping, so an RMSE measured
# on that same split is slightly optimistic.
cat_model.fit(x_train, y_train, eval_set=(x_val, y_val), verbose = 0)

In [None]:
# Score the CatBoost model on the validation split.
from sklearn.metrics import  mean_squared_error
pred_cat = cat_model.predict(x_val)
rmse_cat = np.sqrt(mean_squared_error(y_val, pred_cat))
print("RMSE: ", rmse_cat)

In [None]:
# Refit the same CatBoost model on the full feature frame and target, with logging off.
cat_model.fit(X=x, y=y, verbose=0)

In [None]:
# CatBoost predictions for the test frame; kept for the final blend.
y_pred1 = cat_model.predict(data=df_test)

In [None]:
# Feature impact on model: compute SHAP values for the training features with the
# fitted CatBoost model and draw a beeswarm plot limited to 20 displayed features.
import shap
explainer = shap.Explainer(cat_model)
shap_values = explainer(x)
shap.plots.beeswarm(shap_values, max_display=20)

## **LightGBM**

In [None]:
from lightgbm import LGBMRegressor
# Logging is silenced through the constructor: the `verbose` keyword of fit() was
# removed in LightGBM 4.0, so passing it there raises TypeError on current releases.
LGBModel = LGBMRegressor(random_state=42,n_estimators= 500,learning_rate=0.005, objective='regression', max_depth=5, n_jobs = -1, verbose=-1)
LGBModel.fit(x, y)
pred_lgbm = LGBModel.predict(df_test)
# NOTE: this is the in-sample (training) RMSE -- the model is scored on the same
# rows it was fitted on, so it is not comparable to a validation RMSE.
print("RMSE", np.sqrt(mean_squared_error(y, LGBModel.predict(x))))

## **XGBoost**

In [None]:
from xgboost import XGBRegressor
# Hyper-parameters of the XGBoost regressor, collected in one place.
xgb_params = dict(
    booster='gbtree',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    random_state=42,
    verbosity=0,
)
XGBModel = XGBRegressor(**xgb_params)
XGBModel.fit(x, y)
pred_xgb = XGBModel.predict(df_test)
# NOTE: in-sample RMSE -- computed on the same rows the model was fitted on.
train_rmse_xgb = np.sqrt(mean_squared_error(y, XGBModel.predict(x)))
print("RMSE", train_rmse_xgb)

# Ensembling
We take the test predictions of the three models and compute their weighted average, which is expected to improve on the individual models' results.

In [None]:
# Weighted blend of the three models' test predictions (CatBoost, LightGBM, XGBoost).
weighted_predictions = ((0.25, y_pred1), (0.25, pred_lgbm), (0.5, pred_xgb))
final_predictions = sum(weight * preds for weight, preds in weighted_predictions)

In [None]:
# Put the blended predictions into the submission frame's 'loss' column and display it.
sub = sub.assign(loss=final_predictions)
sub

In [None]:
# Write the submission frame to disk without the index column.
sub.to_csv(path_or_buf="submission.csv", index=False)