# Tabular Playground Series - Jan 2021

## If you have any suggestions, feel free to leave a comment!

# Setup

In [None]:
# Data Manipulation
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Load the competition train and test tables, both indexed by their 'id' column
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-jan-2021/train.csv',
        '../input/tabular-playground-series-jan-2021/test.csv',
    )
)

In [None]:
# Every train column except the last is a predictor; the last one is the target
predictors, target = train.columns[:-1], train.columns[-1]

In [None]:
# Global matplotlib look: ggplot theme plus title, axis-label and x-tick sizes
plt.style.use('ggplot')
plt.rcParams.update({
    'axes.titlesize': 16,
    'axes.labelsize': 12,
    'xtick.labelsize': 'large',
})

# Exploration

In [None]:
# Report the (rows, columns) shape of each dataset
for label, frame in (('Train set shape:', train), ('Test set shape:', test)):
    print(label, frame.shape)

In [None]:
# Total number of missing cells, summed over all columns, for each dataset
for label, frame in (('Missing values on the train data:', train),
                     ('Missing values on the test data:', test)):
    print(label, frame.isna().sum().sum())

In [None]:
# Number of rows that repeat an earlier row, for each dataset
for label, frame in (('Duplicated rows on the train data:', train),
                     ('Duplicated rows on the test data:', test)):
    print(label, frame.duplicated().sum())

## Univariate Analysis

### Target

In [None]:
# Target: distribution plot on top, horizontal box plot below, on a shared x axis
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version before switching to histplot/displot.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10, 8))
target_values = train[target]
ax1.title.set_text('Target Distribution')
sns.distplot(target_values, ax=ax1)
sns.boxplot(target_values, orient='h', ax=ax2);

### Predictors

In [None]:
# Box plot of every predictor in the train set, one box per column
plt.figure(figsize=(10, 5))
plt.title('Distribution of predictors')
# Long format: one row per (variable, value) pair, as seaborn expects here
long_predictors = train[predictors].melt()
sns.boxplot(data=long_predictors, x='variable', y='value');

In [None]:
# One distribution plot per predictor, laid out on a 7x2 grid of axes
fig, axs = plt.subplots(nrows=7, ncols=2, figsize=(12, 12))
for pred, ax in zip(predictors, axs.ravel()):
    sns.distplot(train[pred], ax=ax)
plt.tight_layout()

## Bivariate Analysis

### Correlation

In [None]:
# Heatmap of pairwise correlations; cells with |corr| <= 0.6 are masked to NaN
corr = train.corr()
strong_corr = corr.where(corr.abs() > 0.6)
plt.figure(figsize=(10, 10))
plt.title('High Correlation - greater/lower than +/- 60%')
sns.heatmap(strong_corr, annot=True, cmap="YlGnBu", square=True, linewidths=.5);

### Link between predictor and target

In [None]:
# Hexbin plot of the target against each predictor, one panel per predictor
fig, axs = plt.subplots(nrows=7, ncols=2, figsize=(14, 16))
for pred, ax in zip(predictors, axs.ravel()):
    train.plot.hexbin(x=pred, y=target, gridsize=(80, 20), ax=ax)
plt.tight_layout()

# Data Cleaning

## Remove outliers

In [None]:
# Remove observations that fall outside +/- 1.5 IQR in any column.
# The fences are computed over every column of `train`, so a row is dropped
# when any single value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of its column.
# Quantiles & IQR (one value per column)
q1 = train.quantile(0.25)
q3 = train.quantile(0.75)
iqr = q3 - q1

# Selection: True where a cell lies inside its column's fences
mask = (train >= (q1 - 1.5*iqr)) & (train <= (q3 + 1.5*iqr))
# DataFrame.all(axis=1) is the vectorised row-wise reduction; it replaces
# mask.apply(all, axis=1), which called Python's built-in all() once per row.
train = train[mask.all(axis=1)]

print('Train set without outliers shape:', train.shape)

# Model XGBoost

## Split data

In [None]:
# Hold out 20% of the train rows for validation, with a fixed seed
split_options = {'test_size': 0.2, 'random_state': 2021}
X_train, X_val, y_train, y_val = train_test_split(
    train[predictors], train[target], **split_options
)

## Define, train and test XGB model

In [None]:
# XGB regressor: gbtree booster, GPU histogram tree method, squared-error
# objective, RMSE as evaluation metric.
# `eta` is XGBoost's alias for `learning_rate`. This cell used to pass both
# with different values (learning_rate=0.04 and eta=0.1), which left the
# effective step size ambiguous; only the scikit-learn parameter name is kept.
# NOTE(review): confirm that 0.04 (rather than 0.1) is the intended rate.
model = XGBRegressor(objective='reg:squarederror',
                     booster="gbtree",
                     eval_metric="rmse",
                     tree_method="gpu_hist",
                     n_estimators=1000,
                     learning_rate=0.04,
                     max_depth=7,
                     subsample=0.85,
                     colsample_bytree=0.85,
                     colsample_bylevel=0.8,
                     alpha=0,
                     random_state=2021)

In [None]:
# Fit the model on the training split; %time is an IPython magic that reports
# how long the statement takes to run
%time model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out validation split
y_val_pred = model.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print('Validation Set RMSE:', val_rmse)

# Submission

In [None]:
# Predict on the test predictors and store the result as a 'target' column
test_predictions = model.predict(test[predictors])
test['target'] = test_predictions

# Write the 'target' column (with the frame's index) to the submission file
test.loc[:, 'target'].to_csv('submission.csv')