# Tabular Playground Series (Regression)

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import scipy.stats as st
from matplotlib import pyplot as plt

In [None]:
import warnings

# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell outputs.
warnings.simplefilter("ignore")

## Reading Dataset

In [None]:
# Directory holding the competition CSV files. Change this single constant to
# run the notebook somewhere other than the Kaggle environment.
INPUT_DIR = "/kaggle/input/tabular-playground-series-jan-2021"

data = pd.read_csv(INPUT_DIR + "/train.csv")                # training rows (include `target`)
test_data = pd.read_csv(INPUT_DIR + "/test.csv")            # rows scored for the submission
sample = pd.read_csv(INPUT_DIR + "/sample_submission.csv")  # supplies the `id` column of the submission

In [None]:
# Print column names, dtypes, non-null counts and memory usage of the training frame.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

In [None]:
# Number of missing entries in each column.
data.isna().sum()

There are no missing values in the data.

## Data Preprocessing

In [None]:
# `id` is not one of the model features, so it is removed from both frames.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped no longer raises a KeyError.
data = data.drop(columns='id', errors='ignore')
test_data = test_data.drop(columns='id', errors='ignore')

In [None]:
# Names of the 14 continuous predictor columns: cont1 ... cont14.
features = ['cont%d' % idx for idx in range(1, 15)]

### Outliers

In [None]:
# One box plot per continuous feature, each shown as its own figure.
for col_name in features:
    box_ax = sns.boxplot(x=data[col_name])
    box_ax.set_title(col_name)
    plt.show()

# Same view for the regression target.
sns.boxplot(x=data['target'])

We can see some outliers in cont7, cont9, cont10 and target.

In [None]:
# Interquartile-range rule: a value is flagged when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile of its column.

Quart1, Quart3 = data.quantile(0.25), data.quantile(0.75)
IQR = Quart3 - Quart1

# Per-column count of flagged values.
(data.lt(Quart1 - 1.5 * IQR) | data.gt(Quart3 + 1.5 * IQR)).sum()

In [None]:
# Rows that hold at least one flagged value in any column.

data_out = data.loc[(data.lt(Quart1 - 1.5*IQR) | data.gt(Quart3 + 1.5*IQR)).any(axis=1)]

# Absolute count, then the share of the whole frame in percent.
print('Number of outliers: %d ' % len(data_out))
print('i.e. %0.3f percent of total data' % (len(data_out) / len(data) * 100))

**Since only about 2.7% of the rows contain outliers, we can drop them.**

In [None]:
# Keep only the rows in which no column breaches its 1.5 * IQR fences.

data = data.loc[~(data.lt(Quart1 - 1.5*IQR) | data.gt(Quart3 + 1.5*IQR)).any(axis=1)]

# Remaining (rows, columns) after the filter.
data.shape

## Exploratory Data Analysis

### Feature Comparison

In [None]:
# Annotated heat map of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(data=data.corr(), ax=ax, annot=True, fmt='0.2f', linewidths=1)

Most feature pairs are not strongly correlated. The highest correlation is 0.83, between cont11 and cont12.

#### Analyzing Target column

In [None]:
# Distribution of the target: histogram + density on the left, box plot on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# sns.histplot(..., kde=True) is the suggested replacement.
sns.distplot(data['target'], ax=ax[0])

# Pass the series as `x=`, as the other box plots in this notebook do: the
# meaning of a bare positional argument differs between seaborn versions.
sns.boxplot(x=data['target'], ax=ax[1])

The target values lie roughly between 6 and 11.

#### Analyzing Features

In [None]:
def CdfPlot(df, feature, ax=None):
    """Plot the empirical CDF of a 1-D collection of values and show the figure.

    Parameters
    ----------
    df : array-like
        The values to plot. Despite the name, the notebook passes a single
        DataFrame column here, not a whole DataFrame.
    feature : str
        Text used as the plot title.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current pyplot axes is used, which
        is the behaviour callers that pass only two arguments rely on.
    """
    if ax is None:
        ax = plt.gca()

    xaxis = np.sort(df)
    # Evenly spaced fractions from 0 to 1, one per sorted observation.
    yaxis = np.linspace(0, 1, len(df))

    # The fractions are scaled to 0-100 so the y axis reads as a percentile.
    ax.plot(xaxis, yaxis * 100, 'r', label="CDF")
    ax.set_ylabel("Percentile")
    ax.legend()
    ax.set_title(feature)
    plt.show()

# For every feature: distribution on the left panel, empirical CDF on the
# right one. CdfPlot is called without an axes argument, so it draws on the
# current pyplot axes.
for col_name in features:
    column = data[col_name]
    _, panels = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
    sns.distplot(column, ax=panels[0])
    CdfPlot(column, col_name)


**Observation**:

* cont2 looks more like a classification feature than a regression feature in the 0.4 - 0.8 region.
* Some features like cont5, cont10 have most of their values concentrated in a small region.
* cont5 is approximately linear after 0.4.

### Comparing Train and Test Datasets 

In [None]:
# Overlay the train and test distribution of every feature on a 7 x 2 grid.
fig, axes = plt.subplots(7, 2, figsize=(15, 15))

# columns[:-1] skips the last column of `data`
# (assumes that column is `target`, which test_data is not indexed with here).
for ax, col in zip(axes.ravel(), data.columns[:-1]):
    sns.distplot(data[col], label='train', ax=ax)
    sns.distplot(test_data[col], label='test', ax=ax)
    ax.legend()

# Lay the grid out once, after all panels are drawn, instead of on every iteration.
plt.tight_layout()
plt.show()


* The train and test distributions are similar for every feature.

# Training model

## Preparing Train and Test Data

In [None]:
# Feature matrix and target vector built from the training frame.
x = data.drop(columns='target')
y = data['target']

# Competition test rows; predictions for these go into the submission.
x_test = test_data

In [None]:
# (rows, columns) of the frame that is scored for the submission.
x_test.shape

### Using Linear regression

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the training rows for validation. The fixed random_state
# makes the split, and therefore the validation RMSE, identical on every run.
x_train, xtest, y_train, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline fitted on the training split.
clf = LinearRegression()
clf.fit(X=x_train, y=y_train)

In [None]:
 y_pred = clf.predict(xtest)

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE on the validation split. The square root is taken explicitly because
# the `squared=False` shortcut is deprecated and removed in newer scikit-learn.
np.sqrt(mean_squared_error(ytest, y_pred))


### XGBoost

In [None]:
# Feature matrix and target vector for the XGBoost section (no hold-out split).
x = data.drop(columns='target')
y = data['target']

In [None]:

# Candidate numbers of boosting rounds tried by the grid search.
grid_params = dict(n_estimators=[500, 1000, 1500, 2000])

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Base booster handed to the grid search; n_estimators is left for the grid to set.
xgb = XGBRegressor(
    tree_method='hist',
    objective='reg:squarederror',
    max_depth=5,
    learning_rate=0.01,
    subsample=0.5,
    colsample_bytree=0.8,
)


In [None]:
# Score every candidate in `grid_params` with cross-validated negative RMSE.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=grid_params,
    scoring='neg_root_mean_squared_error',
    n_jobs=1,
    verbose=2,
)
grid_search.fit(x, y)

# Report the winning estimator, its score and its parameters, one item per line.
print(
    grid_search.best_estimator_,
    '\n Best score: %f' % (grid_search.best_score_),
    '\n Best n_estimator value:',
    grid_search.best_params_,
    sep='\n',
)

The grid search selects `n_estimators = 2000` as the best value.

### Training XGBoost

In [None]:
# Final model. Every setting other than n_estimators mirrors the estimator
# that was passed to the grid search (including subsample=0.5), so the chosen
# n_estimators=2000 is used with the configuration it was validated on.
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=2000,
    learning_rate=0.01,
    subsample=0.5,
    colsample_bytree=0.8,
    max_depth=5,
    tree_method='hist',
)

In [None]:
# Fit the final model on the full x / y (no hold-out split).
xgb.fit(x, y)


In [None]:
# Predict the target for the competition test rows; this overwrites the
# y_pred produced earlier by the linear-regression baseline.
y_pred = xgb.predict(x_test)

In [None]:
# from sklearn.metrics import mean_squared_error
# mean_squared_error(y_test, y_pred,squared=False)

In [None]:
# Display the predictions before they are written to the submission.
y_pred

## Submission

In [None]:
# Two-column submission frame: ids taken from the sample file, predictions as target.
submission = pd.DataFrame({'id': sample['id'], 'target': y_pred})

In [None]:
# Write the submission to the working directory; index=False leaves the row index out of the file.
submission.to_csv('submission.csv', index=False)