# Tabular Playground Series (August 2021)
- This notebook covers my code for the Tabular Playground Series - August challenge, which can be found [here](https://www.kaggle.com/c/tabular-playground-series-aug-2021)
- In this notebook, I have used various EDA techniques, which include:
    - PCC (Pearson Correlation Coefficient): I have simply eliminated all the features whose absolute PCC with 'loss' is at most 0.005
    - Using Standard Scaler for the Standardization of all the features
    - PCA (Principal Component Analysis), but it didn't give any improvement in the results, so I didn't use it in the final submission
- As for the training part, I used various models, which include:
    - Gradient Boosted Decision Tree (GBDT)
    - Linear Regression (LR)
    - Histogram Gradient Boosted Regressor
    - Cat Boost Regressor
- If you liked my work, do upvote it :)

# Installing & Importing Packages

In [None]:
!pip install catboost

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the Dataset

In [None]:
# Training rows (features plus the 'loss' target)
df_train = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-aug-2021/train.csv"
)
# Test rows to predict on
df_test = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-aug-2021/test.csv"
)
# Submission template that the predictions are written into later
df_sub = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-aug-2021/sample_submission.csv"
)

In [None]:
# Report the training frame's dimensions, then a per-column summary
# (dtype and non-null count for every column).
print(df_train.shape)
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated
# in 1.2 and removed in 2.0; passing the old name there raises a TypeError.
df_train.info(verbose=True, show_counts=True)

In [None]:
# Report the test frame's dimensions, then a per-column summary
# (dtype and non-null count for every column).
print(df_test.shape)
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated
# in 1.2 and removed in 2.0; passing the old name there raises a TypeError.
df_test.info(verbose=True, show_counts=True)

In [None]:
# Remember how many rows belong to the training set, and pull out the target
sep = len(df_train)
Y = df_train["loss"]

# Strip the identifier column (and, for train, the target) from the features
df_train = df_train.drop(columns=["id", "loss"])
df_test = df_test.drop(columns=["id"])

# Stack train on top of test so both go through the same pre-processing
df = pd.concat([df_train, df_test], axis=0)

print(df.shape, Y.shape, sep)

# Visualizing & Pre-processing the Dataset
- From the above code cells, we can see that all the features are numerical, and corresponding to every feature, all the values are non-null.

In [None]:
# Density-normalised histogram of the target 'loss', using 50 bins
plt.hist(Y, bins=50, density=True, facecolor='g')
plt.title('Distribution of Loss')
plt.grid(True)
plt.show()

In [None]:
# Compute the PCC (Pearson Correlation Coefficient) between all columns so
# that features barely related to the target can be eliminated. Only the
# training rows are used, because only they have a 'loss' value.

# Rebuild the training frame (first `sep` rows) with the target re-attached
df_train = df.iloc[:sep].assign(loss=Y)
print(df_train.shape)

# Pairwise PCC over every column, including 'loss'
cor_mat = df_train.corr(method='pearson', min_periods=50)
print(cor_mat.shape)

# Collect the features whose abs(PCC) with 'loss' is at most 0.005; they are
# related to the target to the smallest extent and will be dropped.
# A NaN coefficient fails the comparison, so such a column is not selected.
red_fea = [
    feature
    for feature, pcc in zip(cor_mat.index, cor_mat['loss'])
    if abs(pcc) <= 0.005
]

In [None]:
# Remove the weakly-correlated features from the combined train+test frame
df = df.drop(columns=red_fea)
print(df.shape)

In [None]:
# Recover the two partitions: the first `sep` rows are the training data,
# everything after them is the test data.
df_train, df_test = df.iloc[:sep], df.iloc[sep:]
print(df_train.shape, df_test.shape)

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test rows.
scaler = StandardScaler().fit(df_train)
df_train = scaler.transform(df_train)
df_test = scaler.transform(df_test)
print(df_train.shape, df_test.shape)

In [None]:
# Dimensionality Reduction using PCA
# pca = PCA(n_components=None)
# df_train = pca.fit_transform(df_train)
# df_test = pca.transform(df_test)
# print(df_train.shape, df_test.shape)

# Training the Model

In [None]:
# Defining the Custom Metric
def rmse(y, y_pred):
    """Return the square root of ``mean_squared_error(y, y_pred)``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [None]:
# Hold out 10% of the training rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    Y,
    test_size=0.1,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

In [None]:
# Gradient Boosting Regressor Model
# lr, nes, mss, ss = 1, 50, 15, 1
# gbr = GradientBoostingRegressor(
#     learning_rate=lr, n_estimators=nes, min_samples_split=mss, 
#     subsample=ss, verbose=1
# )
# gbr.fit(X_train, y_train)
# y_pred = gbr.predict(X_val)
# print(rmse(y_val, y_pred))

In [None]:
# Linear Regression
# lr = LinearRegression(normalize=True)
# lr.fit(X_train, y_train)
# y_pred = lr.predict(X_val)
# print(rmse(y_val, y_pred))

In [None]:
# Histogram Gradient Boosting Regressor
# lr, mi, md = 0.05, 700, 22
# hgbr = HistGradientBoostingRegressor(
#     learning_rate = lr, max_iter= mi, 
#     max_depth = md, verbose=1,
# )
# hgbr.fit(X_train, y_train)
# y_pred_train = hgbr.predict(X_train)
# y_pred_val = hgbr.predict(X_val)
# print("RMSE on Training Dataset ", rmse(y_train, y_pred_train))
# print("RMSE on Validation Dataset ", rmse(y_val, y_pred_val))

In [None]:
# CatBoost regressor fitted on the training split
itr = 50   # boosting iterations
lr = 0.5   # learning rate
d = 4      # tree depth
cbr = CatBoostRegressor(
    iterations=itr,
    learning_rate=lr,
    depth=d,
    custom_metric='RMSE',
    verbose=1,
)
cbr.fit(X_train, y_train)

# Score the fitted model on the training split, then on the held-out split
y_pred_train = cbr.predict(X_train)
print("RMSE on Training Dataset ", rmse(y_train, y_pred_train))
y_pred_val = cbr.predict(X_val)
print("RMSE on Validation Dataset ", rmse(y_val, y_pred_val))

# Submitting the Predictions

In [None]:
# Refit a fresh CatBoost model, with the same hyper-parameters, on every
# training row (no validation hold-out this time).
model = CatBoostRegressor(
    iterations=itr,
    learning_rate=lr,
    depth=d,
    custom_metric='RMSE',
    verbose=1,
)
model.fit(df_train, Y)

In [None]:
# Predict with `model`, the regressor refit on the entire training set.
# The earlier `cbr` was fitted on the 90% training split only and exists
# just for validation, so using it here would discard the full-data refit.
y_test = model.predict(df_test)

# Store the predictions in the submission template's 'loss' column
df_sub['loss'] = y_test
print(df_sub.shape)

In [None]:
# Write the submission file, leaving out the DataFrame index column
df_sub.to_csv(path_or_buf="submission.csv", index=False)