In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns 
import math
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Read the three competition CSV files (training data, test data and the
# sample submission) in one pass.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-aug-2021/train.csv",
        "../input/tabular-playground-series-aug-2021/test.csv",
        "../input/tabular-playground-series-aug-2021/sample_submission.csv",
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Total number of missing cells in the training frame: null counts per
# column, summed over all columns.
train.isnull().sum().sum()

## The training set contains no missing values

In [None]:
# Drop the row identifier so it does not end up among the model features.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
train = train.drop(columns=['id'], errors='ignore')

In [None]:
# Keep the test ids for the submission file and remove them from the test
# feature frame. The guard makes the cell safe to re-run: once 'id' has been
# popped, `new` is left untouched instead of the lookup raising a KeyError.
if 'id' in test.columns:
    new = test.pop('id')

In [None]:
# Frequency of each distinct value of the target column "loss".
target= train["loss"].value_counts()
target

In [None]:
# Distribution of the target. `sns.distplot` is deprecated, so use the
# supported `histplot` with a KDE overlay on a density scale, which draws
# the same kind of figure (normalised histogram plus density curve).
ax = sns.histplot(train["loss"], kde=True, stat="density")
ax.set_title("Distribution of the target (loss)");

In [None]:
# Compare every feature's distribution in the train and test sets.
# Train and test features are stacked only to obtain a shared per-feature
# value range, so both histograms in a panel use identical bin edges.
df = pd.concat([train.drop(["loss"], axis=1), test], axis=0)
columns = df.columns.values

cols = 3
# Ceiling division: just enough rows, without an empty extra row when the
# number of features is an exact multiple of `cols`.
rows = max(1, math.ceil(len(columns) / cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16, 100),
                        sharex=False, squeeze=False)

plt.subplots_adjust(hspace=0.4)

# Figure-level title. `plt.title` would only label the last axes created,
# which is either hidden as a filler panel or overwritten by `set_title`.
# y is chosen to sit close to the top row of panels on this very tall
# figure (assumes the default subplot top margin).
fig.suptitle("Feature values distribution in both datasets", fontsize=16, y=0.89)

for i, ax in enumerate(axs.ravel()):
    if i >= len(columns):
        # Unused filler panel at the end of the grid.
        ax.set_visible(False)
        continue

    column = columns[i]
    # Range over train + test combined, computed once per feature.
    value_range = (df[column].min(), df[column].max())

    ax.hist(train[column].values,
            range=value_range,
            bins=40,
            color="deepskyblue",
            edgecolor="black",
            alpha=0.7,
            label="Train Dataset")
    ax.hist(test[column].values,
            range=value_range,
            bins=40,
            color="palevioletred",
            edgecolor="black",
            alpha=0.7,
            label="Test Dataset")
    ax.set_title(column, fontsize=14, pad=5)
    ax.tick_params(axis="y", labelsize=13)
    ax.tick_params(axis="x", labelsize=13)
    ax.grid(axis="y")
    ax.legend(fontsize=11)

plt.show();

In [None]:
# Overlay each training feature's histogram with the histogram of the
# target ("loss"), one panel per feature.
# Train and test features are stacked only to obtain the per-feature value
# range used for the feature histogram; the loss histogram uses its own
# default range.
df = pd.concat([train.drop(["loss"], axis=1), test], axis=0)
columns = df.columns.values

cols = 3
# Ceiling division: just enough rows, without an empty extra row when the
# number of features is an exact multiple of `cols`.
rows = max(1, math.ceil(len(columns) / cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16, 100),
                        sharex=False, squeeze=False)

plt.subplots_adjust(hspace=0.4)

# Figure-level title. `plt.title` would only label the last axes created,
# which is either hidden as a filler panel or overwritten by `set_title`.
# y is chosen to sit close to the top row of panels on this very tall
# figure (assumes the default subplot top margin).
fig.suptitle("Features and loss", fontsize=16, y=0.89)

loss_values = train["loss"].values  # identical in every panel, so hoisted

for i, ax in enumerate(axs.ravel()):
    if i >= len(columns):
        # Unused filler panel at the end of the grid.
        ax.set_visible(False)
        continue

    column = columns[i]
    ax.hist(train[column].values,
            range=(df[column].min(), df[column].max()),
            bins=40,
            color="deepskyblue",
            edgecolor="black",
            alpha=0.7,
            label="Train Dataset feature")
    ax.hist(loss_values,
            bins=40,
            color="palevioletred",
            edgecolor="black",
            alpha=0.7,
            label="Loss")
    ax.set_title(column, fontsize=14, pad=5)
    ax.tick_params(axis="y", labelsize=13)
    ax.tick_params(axis="x", labelsize=13)
    ax.grid(axis="y")
    ax.legend(fontsize=11)

plt.show();

In [None]:
# Number of training columns per dtype.
train.dtypes.value_counts()

In [None]:
# Summary statistics of the training columns.
train.describe()

# Overview of test set

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Number of test columns per dtype.
test.dtypes.value_counts()

In [None]:
# Summary statistics of the test columns.
test.describe()

In [None]:
# Features: every training column except the target.
X = train.drop('loss', axis=1)
# Target values as a plain array.
y = train['loss'].values
# Competition test features. NOTE: the next cell rebinds X_test to the
# hold-out split; the final predictions are made on `test` directly.
X_test = test
for label, frame in (('Train set:', X), ('Test set:', X_test)):
    print(label, frame.shape)

In [None]:
# Hold out part of the training data for validation (train_test_split uses
# its default split size here). A fixed random_state makes the split, and
# therefore every score computed from it, reproducible across kernel
# restarts.
# NOTE: this rebinds X_test (previously the competition test features) to
# the hold-out features; the final predictions use `test` directly.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, shuffle=True, random_state=42)

# LightGBM

In [None]:
# Hyper-parameter grid search for LightGBM, disabled to save run time.
# It is wrapped in a triple-quoted string, so none of it is executed; the
# cell merely evaluates to that string. Remove the quotes to run the search
# over learning_rate, n_estimators and max_depth.
'''
grid_n_estimator = [200,310,330,350]
grid_learn = [.01,0.03,0.05]

LGBM = LGBMRegressor()
G_LGBM = GridSearchCV(LGBM, param_grid= {'learning_rate': grid_learn,
                                          'n_estimators': grid_n_estimator,
                                          'max_depth': [8,10]},)

G_LGBM.fit(X_train, Y_train)
print('Best Parameters: ', G_LGBM.best_params_)
'''

In [None]:
# Fit LightGBM on the training split with fixed hyper-parameters, then
# predict the hold-out split.
lgbm_params = dict(
    objective='regression',
    learning_rate=0.07,
    n_estimators=200,
    max_depth=8,
    n_jobs=-1,
)
LGBM = LGBMRegressor(**lgbm_params)
LGBM.fit(X_train, Y_train)
predlgbm = LGBM.predict(X_test)

In [None]:
# Hold-out error of the LightGBM model. The value is the *root* mean squared
# error, so it is labelled RMSE rather than MSE. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate. `mean_squared_error` and `np` come from the import
# cell at the top of the notebook, so no mid-notebook import is needed.
rmse = np.sqrt(mean_squared_error(Y_test, predlgbm))
print('RMSE score: ', rmse)

In [None]:
# LightGBM predictions for the competition test features.
predictionLGBM = LGBM.predict(test)

# XGBoost

In [None]:
# Hyper-parameter grid search for XGBoost, disabled by wrapping it in a
# triple-quoted string: none of it is executed; the cell merely evaluates to
# that string. Remove the quotes to run the search over learning_rate and
# n_estimators.
'''
grid_n_estimator = [200,400, 500]
grid_learn = [.001,0.03,0.05]

XGB = XGBRegressor()
G_XGB = GridSearchCV(XGB, param_grid= {'learning_rate': grid_learn, 
                                        'n_estimators': grid_n_estimator
                                        })

G_XGB.fit(X_train, Y_train)
print('Best Parameters: ', G_XGB.best_params_)
'''

In [None]:
# Fit XGBoost on the training split with fixed hyper-parameters, then
# predict the hold-out split.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    min_child_weight=11,
)
XGB = XGBRegressor(**xgb_params)
XGB.fit(X_train, Y_train)
pred = XGB.predict(X_test)

In [None]:
# Hold-out error of the XGBoost model. The value is the *root* mean squared
# error, so it is labelled RMSE rather than MSE. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate. `mean_squared_error` and `np` come from the import
# cell at the top of the notebook, so no mid-notebook import is needed.
rmse = np.sqrt(mean_squared_error(Y_test, pred))
print('RMSE score: ', rmse)

In [None]:
# XGBoost predictions for the competition test features.
predictionXGB = XGB.predict(test)

# CatBoost

In [None]:
# Fit CatBoost on the training split with fixed hyper-parameters, then
# predict the hold-out split.
cat_params = dict(depth=6, learning_rate=0.07)
Cat = CatBoostRegressor(**cat_params)
Cat.fit(X_train, Y_train)
predcat = Cat.predict(X_test)

In [None]:
# Hold-out error of the CatBoost model. The value is the *root* mean squared
# error, so it is labelled RMSE rather than MSE. The square root is taken
# explicitly instead of passing `squared=False`, which newer scikit-learn
# releases deprecate. `mean_squared_error` and `np` come from the import
# cell at the top of the notebook, so no mid-notebook import is needed.
rmse = np.sqrt(mean_squared_error(Y_test, predcat))
print('RMSE score: ', rmse)

In [None]:
# CatBoost predictions for the competition test features.
predictioncat = Cat.predict(test)

In [None]:
# Weighted blend of the three models' test predictions; the weights
# (CatBoost 0.4, LightGBM 0.4, XGBoost 0.2) sum to 1.
w_cat, w_lgbm, w_xgb = 0.4, 0.4, 0.2
ensembled = w_cat * predictioncat + w_lgbm * predictionLGBM + w_xgb * predictionXGB

In [None]:
# Assemble the submission frame (test ids next to the blended predictions),
# write it to CSV without the index, and preview the first rows.
predictions = pd.DataFrame({"id": new, "loss": ensembled})

predictions.to_csv('submissionensemb3.csv', index=False, header=True)
predictions.head()