In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
! pip install rich
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from rich import print
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h2 style="color:red;"><center>Objectives</center></h2>
To determine the function that fits the data and predicts the loss for new data.

In [None]:
#imports
import numpy as np
import pandas as pd

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#processing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

<h2 style="color:red;"><center>Understanding Data</center></h2>

In [None]:
# Read the competition's train and test CSV files (train first, then test)
dtrain, dtest = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-aug-2021/train.csv",
        "/kaggle/input/tabular-playground-series-aug-2021/test.csv",
    )
)

In [None]:
# Preview the first five rows of the training frame
dtrain.head(n=5)

In [None]:
# Summary statistics for the columns of the training frame
description = dtrain.describe()
# Render every statistic as a two-decimal string for display. Formatting column by
# column with Series.map avoids DataFrame.applymap, which newer pandas releases
# deprecate, while producing the same table of strings on older releases.
description.apply(lambda column: column.map("{0:.2f}".format))

Observe that the features in the dataset are not in a specific range, so we need feature scaling for our algorithms to converge efficiently.

In [None]:
# Feature columns: every column except the first and the last one
features = dtrain.columns[1:-1]

# Per-column dtypes of both frames
info_dtrain = dtrain.dtypes
info_dtest = dtest.dtypes

# (column, dtype) pairs of the int64 columns. The leading pair is sliced off on the
# assumption that it is the id column -- TODO confirm against the printed output.
int_features = [
    (column, dtype) for column, dtype in info_dtrain.items() if dtype == 'int64'
][1:]
print(int_features)

The loss is an integer between 0 and 42.

Six of the features listed above are purely integer-valued.

In [None]:
# Preparation for PCA: keep only the `features` columns as inputs, use the loss column
# as target, and hold out 30% of the rows for validation.
x_train, x_test, y_train, y_test = train_test_split(
    dtrain[features],
    dtrain.loss,
    test_size=0.3,
    random_state=0,
)

# Learn the scaling statistics from the training split only
scaler = StandardScaler().fit(x_train)

# Apply the same scaling to the training split, the validation split and the
# competition test set (minus its id column)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_dtest = scaler.transform(dtest.drop(columns=["id"]))

<h3 style="color:red;"><center>Reducing Dimension PCA</center></h3>

In [None]:
#PCA

def reduce_dimension(array, dim=2):
    """
    Fit a PCA with `dim` components on `array` and return the fitted PCA object.

    The data itself is not transformed here; call `.transform` on the result.
    """
    # PCA.fit returns the estimator itself, so it can be built and fitted in one expression
    return PCA(n_components=dim).fit(array)

# Fit a 2-component PCA on the scaled training features, then project every split with it.
# Note: y_pca holds the projected validation features (from x_test), not target values,
# and xtest_pca is the projection of the competition test set.
pca = reduce_dimension(x_train)
x_train_pca, y_pca, xtest_pca = (
    pca.transform(split) for split in (x_train, x_test, x_dtest)
)

In [None]:
# Training frame: the two PCA components side by side with the loss.
# column_stack yields a float matrix, so loss is cast back to int afterwards.
train_matrix = np.column_stack((x_train_pca, y_train))
dtrain_pca = pd.DataFrame(train_matrix, columns=["x_pca", "y_pca", "loss"]).astype({"loss": int})

# Test frame: PCA components only (no loss available)
dtest_pca = pd.DataFrame(xtest_pca, columns=["test_x", "test_y"])

for frame in (dtrain_pca, dtest_pca):
    print(frame.head())

In [None]:
import warnings

# NOTE: this silences every warning for the rest of the session, not just in this cell
warnings.filterwarnings('ignore')

rows_to_plot = 10000

# Only the first `rows_to_plot` rows are drawn
train_sample = dtrain_pca.iloc[:rows_to_plot]
test_sample = xtest_pca[:rows_to_plot]

# Visualizing the 2D data obtained through PCA
fig = plt.figure(figsize=(24, 8))
fig.suptitle("Training and test set distributions")

ax = [fig.add_subplot(1, 3, 1), fig.add_subplot(1, 3, 2), fig.add_subplot(1, 3, 3, projection="3d")]

# 3D panel: the two PCA components span the base plane and the loss is the height.
# Without an explicit third coordinate every point would sit at z=0, so the loss is
# passed both as z and as the (sequential colormap) colour.
ax[2].scatter(
    train_sample["x_pca"],
    train_sample["y_pca"],
    train_sample["loss"],
    c=train_sample["loss"],
    cmap="inferno",
)
ax[2].set_xlabel("x_pca")
ax[2].set_ylabel("y_pca")
ax[2].set_zlabel("loss")
ax[2].set_title("3D Plot")

# Bivariate density of the training projection (no hue, so no palette is needed)
g1 = sns.kdeplot(x="x_pca", y="y_pca", data=train_sample, ax=ax[0])
ax[0].set_title("Train Data")

# Bivariate density of the test projection: column 0 against column 1
g2 = sns.kdeplot(x=test_sample[:, 0], y=test_sample[:, 1], ax=ax[1])
ax[1].set_title("Test Data")

plt.show()

Both the train and test data have similar distributions under PCA with 2 components, so we will have less of a headache when trying to optimize.

In [None]:
# Scatter of the two PCA components, coloured by loss value
g3 = sns.FacetGrid(dtrain_pca, hue="loss", palette="hls", height=8)
g3.map(sns.scatterplot, "x_pca", "y_pca")
g3.add_legend();

This plot is not only for beautification: observe that the loss is highest at the center and decreases as we move outward. In 3 dimensions we would therefore see a mountain-shaped distribution. We probably have a name for it.

In [None]:
# Overlay the 1-D densities of both PCA components and of the loss on one axis
g3 = sns.FacetGrid(dtrain_pca, palette="hls", height=8, aspect=2)
for column, colour in (("x_pca", "g"), ("y_pca", "b"), ("loss", "r")):
    g3.map(sns.kdeplot, column, color=colour)
g3.set(xticks=range(-10, 20, 2))

Observe the distribution; we will come back to it.

# Linear Regression

In [None]:
# Ordinary least-squares regression on the scaled training features
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

In [None]:
# Predict the loss for the held-out validation split (a single hold-out split, not k-fold cross-validation)
predicted_test_loss = linear_model.predict(x_test)

In [None]:
from sklearn.metrics import mean_squared_error as RMSE

def rmse_plot(y_true, y_pred):
    """
    Return the root-mean-squared error between `y_true` and `y_pred`.

    `RMSE` is this notebook's alias for sklearn's mean_squared_error, so the
    square root is taken here. Despite the function name, nothing is plotted.
    """
    mean_squared = RMSE(y_true, y_pred)
    return np.sqrt(mean_squared)

def rmse_plots(y_test, predicted_test_loss):
    """
    Scatter the predicted loss against the true loss and show the RMSE in the title.

    Parameters
    ----------
    y_test : array-like
        True target values.
    predicted_test_loss : array-like
        Predictions, one per entry of `y_test`.

    Returns
    -------
    None
        The plot is drawn on a new matplotlib figure.
    """
    # Work on plain arrays so a pandas index on either input cannot re-align the subtraction
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(predicted_test_loss, dtype=float)

    # Root of the mean *squared* residual. Summing the raw residuals lets positive and
    # negative errors cancel and can put a negative number under the square root.
    rmse = np.sqrt(np.mean((actual - predicted) ** 2))

    plt.figure()
    # Individual points rather than a connected line: the samples have no meaningful order
    plt.scatter(actual, predicted, s=4, alpha=0.3)
    plt.xlabel("True loss")
    plt.ylabel("Predicted loss")
    plt.title(f"Predicted vs. true loss (RMSE = {rmse:.4f})")

In [None]:
# RMSE of the linear model on the training split and on the held-out validation split
train_rmse = rmse_plot(y_train, linear_model.predict(x_train))
validation_rmse = rmse_plot(y_test, predicted_test_loss)
print(f"The Error for the train set during is observed to be {train_rmse}")
print(f"The Error for the test set during cross vaidation is observed to be {validation_rmse}")

The RMSE is still very high and we need to bring it down. Looking at the principal components, we will need some non-linear function.

In [None]:
# Fit the same linear model on the two PCA components instead of the full feature set
linear_model_pca = LinearRegression()
linear_model_pca.fit(x_train_pca, y_train)

# y_pca is the PCA projection of the validation features
predicted_test_loss_pca = linear_model_pca.predict(y_pca)
pca_rmse = rmse_plot(y_test, predicted_test_loss_pca)
print(f"The Error for the test set during cross vaidation on pca is observed to be {pca_rmse}")

## Bayesian Ridge

In [None]:
# Bayesian ridge regression on the scaled (non-PCA) training features
from sklearn.linear_model import BayesianRidge
bayesridge = BayesianRidge(verbose=True)
bayesridge.fit(x_train, y_train)

# Score it on the held-out validation split
predicted_loss_ridge = bayesridge.predict(x_test)
ridge_rmse = rmse_plot(y_test, predicted_loss_ridge)
print(f"The Error for the test set during cross vaidation with Bayesian Ridge is observed to be {ridge_rmse}")

In [None]:
# Predict the loss for the competition test set, whose true loss is unknown
y_predicted = bayesridge.predict(x_dtest)

# Submission frame: one row per test id with its predicted loss
submission_columns = {"id": dtest["id"], "loss": y_predicted}
result_f0 = pd.DataFrame(submission_columns)

# Also attach the predictions to the PCA view of the test set (adds a column to dtest_pca in place)
dtest_pca["loss"] = y_predicted

In [None]:
# Write the id/loss predictions to the working directory as the submission file, without the index column
result_f0.to_csv("./submit_bayes.csv", index=False)

# Submission 1

Submission 1 ends with Bayesian ridge, scoring 7.94038 on 33% of the dataset.
I'm trying to understand what did not go so well.

Bad score larger than previous 7.94038.

We see that the predicted loss is offset by a certain amount, which hints at a case of bias. We will add more!

In [None]:
# import xgboost as xgb
# xgb_model = xgb.XGBRegressor(reg_lambda=0.5,
#                              max_depth =10,
#                              min_child_weight=2,
#                              objective="reg:squarederror",
#                              scale_pos_weights=0.5,
#                              random_state=42)

# xgb_model.fit(x_train, y_train, eval_metric='rmse')

# y_pred = xgb_model.predict(x_train)

# rmse_error_train = rmse_plot(y_train, y_pred)

# print(rmse_error_train)

In [None]:
# from scipy.stats import loguniform
# from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold
# from sklearn.linear_model import Ridge
# space = dict()
# space['solver'] = ['svd', 'cholesky', 'lsqr', 'sag']
# space['alpha'] = loguniform(1e-5, 100)
# space['fit_intercept'] = [True, False]
# space['normalize'] = [True, False]
# model = Ridge()
# # define evaluation
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define search
# search = RandomizedSearchCV(model, space, n_iter=500, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv, random_state=1)
# # execute search
# result = search.fit(x_t, y_train)
# # summarize result
# print('Best Score: %s' % result.best_score_)
# print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
# y_pred_test = xgb_model.predict(x_test)

# rmse_plot(y_test, y_pred_test)

In [None]:
# y_pred_dtest = xgb_model.predict(x_dtest)

In [None]:
# g3 = sns.FacetGrid(pd.DataFrame(np.column_stack((xtest_pca, y_pred_dtest)), columns=["test_x","test_y", "predicted_loss"]), palette="hls",height=8, aspect=2)
# g3.map(sns.kdeplot, "test_x", color="g")
# g3.map(sns.kdeplot, "test_y", color="b")
# g3.map(sns.kdeplot, "predicted_loss", color="r")
# g3.set(xticks=range(-10, 20, 2))