# Intro
Welcome to the monthly Kaggle experiment in 2021. This notebook covers the [January edition](https://www.kaggle.com/c/tabular-playground-series-jan-2021). 
![](https://storage.googleapis.com/kaggle-competitions/kaggle/24673/logos/header.png)

<span style="color: royalblue;">Please vote the notebook up if it helps you. Feel free to leave a comment above the notebook. Thank you. </span>

# Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

# Path

In [None]:
# Directory holding the competition files. The trailing slash matters because
# file names are appended to this string by plain concatenation.
path = '/kaggle/input/tabular-playground-series-jan-2021/'
# List the directory contents (displayed as the cell output).
os.listdir(path)

# Load Data

In [None]:
# Read the train set, the test set and the sample submission, in that order,
# from the competition input directory.
train_data, test_data, samp_subm = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Overview

In [None]:
# Report the row counts of both data sets and the column count of the train
# frame (the count covers every column of that frame).
n_train_rows, n_train_cols = train_data.shape
n_test_rows = test_data.shape[0]
print('Number train samples:', n_train_rows)
print('Number test samples:', n_test_rows)
print('Number features:', n_train_cols)

In [None]:
# Total number of missing cells per data set: sum per column, then overall.
for caption, frame in (
    ('Missing values on the train data:', train_data),
    ('Missing values on the test data:', test_data),
):
    print(caption, frame.isnull().sum().sum())

# EDA

Correlation matrix to identify dependencies:

In [None]:
# Pairwise correlation of every column except the first one (the id column
# in the competition files).
corr = train_data[train_data.columns[1:]].corr()
# `Styler.set_precision` was deprecated in pandas 1.3 and removed in 2.0.
# An explicit format string renders two decimals on old and new pandas alike.
corr.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')

Principal Component Analysis (PCA) is used to reduce the dimension of the dataset. For details we recommend [this tutorial](https://www.kaggle.com/drcapa/iris-species-pca).

If we reduce the dimension we could lose accuracy, so we decide not to do that:

In [None]:
# Fit a full PCA on all columns between the first and the last one and plot
# the cumulative share of variance explained by the leading components.
pca_inputs = train_data.iloc[:, 1:-1]
pca = PCA().fit(pca_inputs)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('No of components')
plt.ylabel('Cumulative explained variance')
plt.grid()
plt.show()

Box plots for every column to visualize the distribution of the values and identify outliers.
Train data:

In [None]:
# One box per column, skipping the first and the last column of the frame.
train_box_cols = train_data.columns[1:-1].tolist()
train_data.boxplot(column=train_box_cols, figsize=(12, 5))
plt.show()

Test data

In [None]:
# One box per column, skipping only the first column of the frame.
test_box_cols = test_data.columns[1:].tolist()
test_data.boxplot(column=test_box_cols, figsize=(12, 5))
plt.show()

As we can see, the distributions of the values in the train and test sets are similar, especially in terms of the outliers of the features cont7 and cont9.

# Feature Engineering
We create statistical features like mean, max and min for every sample on the train and test data.

In [None]:
# Names of the 14 continuous input columns: cont1 ... cont14.
features = [f'cont{idx}' for idx in range(1, 15)]
# Columns that must never be fed to the model as inputs.
no_features = ['id', 'target']

In [None]:
# Append per-row statistics of the `features` columns to both frames.
# Each statistic is computed from the original feature columns only, so the
# newly added columns never feed into one another.
row_stats = ('mean', 'std', 'max', 'min', 'sum')
for frame in (train_data, test_data):
    cont_values = frame[features]
    for stat in row_stats:
        # Equivalent to e.g. `cont_values.mean(axis=1)` for stat == 'mean'.
        frame[stat] = getattr(cont_values, stat)(axis=1)

In [None]:
# Show the first rows of the train frame, including the columns added above.
train_data.head()

# Set X And y

In [None]:
# Model inputs are every column not listed in `no_features`. `difference`
# returns the remaining column names in sorted order.
train_input_cols = train_data.columns.difference(no_features)
test_input_cols = test_data.columns.difference(no_features)

X = train_data.loc[:, train_input_cols]
X_test = test_data.loc[:, test_input_cols]
# Regression target.
y = train_data['target']

Scale Data:

In [None]:
# Standardise each column with statistics taken from the training inputs:
# centre first, then divide by the standard deviation of the centred data.
mean = X.mean()
X = X.sub(mean, axis='columns')
std = X.std()
X = X.div(std, axis='columns')
# The test inputs reuse the training mean and std.
X_test = X_test.sub(mean, axis='columns').div(std, axis='columns')

# Outlier Detection
(Currently not active: outliers are only detected and counted, not removed.)

In [None]:
# Flag about 1% of the training rows as outliers, using the unscaled input
# columns. The fixed seed (the same value as the split and the model) keeps
# the flagged rows, and therefore the reported count, identical between runs.
iso = IsolationForest(contamination=0.01, random_state=2021)
yhat = iso.fit_predict(train_data[train_data.columns.difference(no_features)])
# `fit_predict` labels outliers with -1; the mask selects all other rows.
mask = yhat != -1
# Filtering is deliberately disabled; uncomment to drop the flagged rows.
#X, y = X[mask], y[mask]

In [None]:
# Rows outside the inlier mask are the ones flagged as outliers.
n_outliers = (~mask).sum()
print('Number of outliers:', n_outliers)

# Define Train And Val

In [None]:
# Hold out 20% of the rows for validation; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2021,
)

In [None]:
# Print (rows, columns) of the three input matrices.
for caption, matrix in (
    ('Train shape:', X_train),
    ('Val shape:', X_val),
    ('Test shape:', X_test),
):
    print(caption, matrix.shape)

# Model

In [None]:
# Gradient-boosted tree regressor, fitted on the training split and scored
# on the validation split with RMSE.
model = XGBRegressor(
    objective='reg:squarederror',
    booster='gbtree',
    eval_metric='rmse',
    # NOTE(review): 'gpu_hist' requires a GPU runtime; newer XGBoost releases
    # prefer tree_method='hist' with device='cuda' - confirm against the
    # installed version before changing it.
    tree_method='gpu_hist',
    n_estimators=600,
    # Only `learning_rate` is set. `eta` is an alias for the same parameter,
    # so passing both with different values (0.04 and 0.1) leaves the
    # effective step size ambiguous.
    learning_rate=0.04,
    max_depth=7,
    subsample=0.85,
    colsample_bytree=0.85,
    colsample_bylevel=0.8,
    # L1 regularisation, spelled with the scikit-learn wrapper's own name
    # instead of the native alias `alpha`.
    reg_alpha=0,
    random_state=2021,
)
model.fit(X_train, y_train)

# Root mean squared error on the held-out validation rows.
y_val_pred = model.predict(X_val)
print('Score validation data:', np.sqrt(mean_squared_error(y_val, y_val_pred)))

In [None]:
# Display the fitted estimator and its parameters as the cell output.
model

# Analyse Training

Feature importance:

In [None]:
# One importance score per column of the matrix the model was fitted on.
importance = model.feature_importances_
fig = plt.figure(figsize=(10, 6))
# Label the bars with the columns of the fitted matrix itself. Slicing
# `train_data.columns[1:-1]` is wrong here: after feature engineering that
# slice contains `target`, omits `sum`, and is in file order, whereas the
# training columns were selected with `Index.difference`, which sorts them.
x = list(X_train.columns)
plt.barh(x, 100*importance, color='orange')
plt.title('Feature Importance', loc='left')
plt.xlabel('Percentage')
plt.grid()
plt.show()

Visualization of the error: 

In [None]:
# Predictions for both splits.
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

# Two side-by-side panels: predicted vs. true target for train and val.
fig, axs = plt.subplots(1, 2, figsize=(22, 6))
fig.subplots_adjust(hspace=.5, wspace=.5)
axs = axs.ravel()

panels = (
    ('train', y_train, y_train_pred),
    ('val', y_val, y_val_pred),
)
for ax, (panel_title, y_true, y_pred) in zip(axs, panels):
    # Red dots: model predictions. Blue line: the ideal pred == true diagonal.
    ax.plot(y_true, y_pred, 'ro')
    ax.plot(y_true, y_true, 'blue')
    ax.grid()
    ax.set_xlabel('true')
    ax.set_ylabel('pred')
    ax.set_title(panel_title)
plt.show()

# Predict Test Data

In [None]:
# Predict the target for the scaled test inputs with the fitted model.
y_test = model.predict(X_test)

In [None]:
# Copy of the sample submission with its `target` column replaced by the
# test predictions; `samp_subm` itself is left untouched.
output = samp_subm.assign(target=y_test)

# Write Output

In [None]:
# Write the submission file to the working directory without the row index.
output.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the submission frame as a quick sanity check.
output.head()