# Imports

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

import xgboost as xgb
from xgboost.sklearn import XGBRegressor

# Load Data

In [None]:
# Directory holding the competition CSV files (trailing slash included).
base_path = '/kaggle/input/tabular-playground-series-jan-2021/'

# Read each split with `id` as the index, then tag every row with the split
# it came from via a `_set` column.
train_df = pd.read_csv(f'{base_path}train.csv', index_col='id')
train_df = train_df.assign(_set='train')

test_df = pd.read_csv(f'{base_path}test.csv', index_col='id')
test_df = test_df.assign(_set='test')

In [None]:
# Preview the first five rows of the training frame.
train_df.head(5)

# EDA

In [None]:
# Histogram of the target column with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train_df, x='target', kde=True, ax=ax)

In [None]:
# Continuous feature columns: cont1 .. cont14.
features = [f'cont{i}' for i in range(1, 15)]

n = len(features)

# One stacked subplot per feature.
fig, axs = plt.subplots(n, 1, figsize=(10, 5 * n))

# Options shared by the train and test histograms.
shared_hist_kwargs = dict(stat='probability', kde=True, alpha=0.5)

for f, ax in zip(features, axs):
    # Overlay train (default colour) and test (green) on the same axes so
    # the two distributions can be compared feature by feature.
    for frame, label, color in ((train_df, 'train', None), (test_df, 'test', 'green')):
        sns.histplot(data=frame, x=f, ax=ax, color=color, label=label, **shared_hist_kwargs)
    ax.legend()

In [None]:
# Pairwise correlations between the numeric columns of the training data.
# train_df also carries the string `_set` marker column, so restrict the frame
# to numeric dtypes first: calling .corr() on non-numeric columns raises in
# recent pandas versions.
corr = train_df.select_dtypes(include='number').corr()

# Boolean mask covering the upper triangle (diagonal included) so each pair
# is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(
    data=corr,
    mask=mask,
    annot=True,
    fmt='.2f',
    linewidths=1,
    square=True,
    ax=ax
)

# Model

## Simple Model (Linear Regression)

In [None]:
# Model inputs: the continuous columns cont1 .. cont14.
num_features = [f'cont{idx}' for idx in range(1, 15)]
target = 'target'

X, y = train_df[num_features], train_df[target]

# Hold out 30% of the labelled rows for validation; fixed seed for a
# reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline: ordinary least squares on the raw features.
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Score the fitted model on the training split and the held-out split.
predict_train = model.predict(X_train)
predict_test = model.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse_train = mean_squared_error(y_train, predict_train) ** 0.5
rmse_test = mean_squared_error(y_test, predict_test) ** 0.5

# Use sep="\n" rather than embedding a newline in the first string: with the
# default sep=" " the second line would start with a stray space.
print(
    f"RMSE Train: {rmse_train}",
    f"RMSE Test: {rmse_test}",
    sep="\n",
)

## XGBoost

In [None]:
# XGBoost regressor with library-default hyperparameters, fitted on the same
# training split as the linear baseline.
model = XGBRegressor().fit(X_train, y_train)
model

In [None]:
# Score the fitted model on the training split and the held-out split.
predict_train = model.predict(X_train)
predict_test = model.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse_train = mean_squared_error(y_train, predict_train) ** 0.5
rmse_test = mean_squared_error(y_test, predict_test) ** 0.5

# One metric per line. Joining with "\n" avoids the leading space that
# print's default separator would insert before the second line.
print("\n".join([
    f"RMSE Train: {rmse_train}",
    f"RMSE Test: {rmse_test}",
]))

In [None]:
# Hand-specified hyperparameters for the tuned XGBoost regressor.
# `lambda` and `alpha` are XGBoost's native names for the L2 and L1
# regularisation terms (aliases of `reg_lambda` / `reg_alpha`).
# `random_state` is listed exactly once: a dict literal silently keeps only
# the last value of a repeated key, so a duplicate entry would be dead code.
params = {
    'objective': 'reg:squarederror',
    'n_estimators': 1000,
    'lambda': 7.610705234008646,
    'alpha': 0.0019377246932580476,
    'colsample_bytree': 0.5,
    'subsample': 0.7,
    'learning_rate': 0.012,
    'max_depth': 20,
    'min_child_weight': 229,
    'random_state': 42,
}


reg = XGBRegressor(**params)

# Standardise the features, then fit the regressor. The steps are named so
# they can be retrieved later with model['scaler'] / model['reg'].
model = Pipeline([
    ('scaler', StandardScaler()),
    ('reg', reg)
])

model.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the training split and the held-out split.
predict_train = model.predict(X_train)
predict_test = model.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse_by_split = {
    "Train": mean_squared_error(y_train, predict_train) ** 0.5,
    "Test": mean_squared_error(y_test, predict_test) ** 0.5,
}

# Print each metric on its own line. Passing sep="\n" (instead of embedding
# the newline in the first string) keeps the second line free of a leading
# space.
print(
    *(f"RMSE {split}: {rmse}" for split, rmse in rmse_by_split.items()),
    sep="\n",
)

# Output

In [None]:
# Predict the competition test rows with the most recently fitted model and
# store the result alongside the features.
test_predictions = model.predict(test_df[num_features])
test_df['target'] = test_predictions
test_df.head()

In [None]:
# Write the submission file: the `id` index plus the predicted `target` only.
test_df.to_csv('./final_kaggle.csv', columns=['target'])

# Shap

In [None]:
import shap
shap.initjs()

# explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)
explainer = shap.TreeExplainer(model['reg'])

# The regressor inside the pipeline was trained on StandardScaler output, so
# the explainer must see features on that same scale. Feeding it the raw X
# would evaluate the trees' split thresholds against unscaled values and give
# attributions for inputs the model never actually receives.
X_scaled = model['scaler'].transform(X)
shap_values = explainer.shap_values(X_scaled)

In [None]:
# Summary plot of the SHAP values; X supplies the feature values and the
# column names shown on the plot.
shap.summary_plot(shap_values, features=X)