#### Start

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

import time
import os


In [None]:
## import the data
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jan-2021/train.csv",
        "../input/tabular-playground-series-jan-2021/test.csv",
    )
)


In [None]:
# Show the first five rows of the training table.
train.head(n=5)

In [None]:
# Show the first five rows of the test table.
test.head(n=5)

In [None]:
# Report (rows, columns) for both tables.
for label, frame in (('train data shape:', train), ('test data shape:', test)):
    print(label, frame.shape)


In [None]:
## missing values count
# Total number of null cells across every column of each table.
# The labels name the table actually being counted (the second line inspects
# `test`, not the training target).
print("train data null value sum:", train.isnull().sum().sum())
print("test data null value sum:", test.isnull().sum().sum())

> #### Now I have a basic understanding of what the data looks like. Next, I would like to check whether there are any correlations among the 14 features.

In [None]:
## Separate features and targets
# Columns at positions 1..14 are used as the feature matrix.
# .copy() gives df_x its own data: later cells assign transformed columns into
# df_x, which on a bare slice of `train` raises SettingWithCopyWarning and
# leaves it unclear whether `train` itself is modified.
df_x = train.iloc[:, 1:15].copy()
df_y = train.target

In [None]:
# correlation map
# Pairwise correlation between the feature columns, drawn as a square heatmap
# whose colour scale is capped at 0.8.
corr_data = df_x.corr()

plt.figure(figsize=(10, 10))
sns.heatmap(data=corr_data, vmax=0.8, square=True)

> #### It seems features 11 and 12 are strongly correlated; this may be worth exploring later

> #### Check whether there are outliers

In [None]:
# One box per feature column of the training features.
# A plain 2-D ndarray is passed (not a transposed DataFrame) because matplotlib
# draws one box per ndarray column in every release, whereas its handling of
# DataFrame input changed between versions and can yield one box per sample row.
feature_values = df_x.to_numpy()
plot = plt.boxplot(feature_values)
# Label each box with its column name (box positions start at 1).
plt.xticks(range(1, feature_values.shape[1] + 1), df_x.columns, rotation=90);

In [None]:
# One box per test-set column, skipping the first column.
# A plain 2-D ndarray is passed (not a transposed DataFrame) because matplotlib
# draws one box per ndarray column in every release, whereas its handling of
# DataFrame input changed between versions and can yield one box per sample row.
test_features = test.iloc[:, 1:]
plot = plt.boxplot(test_features.to_numpy())
# Label each box with its column name (box positions start at 1).
plt.xticks(range(1, test_features.shape[1] + 1), test_features.columns, rotation=90);

#### handle outliers

In [None]:
def IQR(dist):
    """Return the interquartile range of ``dist``: 75th minus 25th percentile."""
    lower_quartile, upper_quartile = np.percentile(dist, [25, 75])
    return upper_quartile - lower_quartile

def handle_outliers(series):
    """Cap values that lie more than 1.5 * IQR beyond the quartiles.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and values
    below ``Q1 - 1.5 * IQR`` by that lower fence; everything else is unchanged.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Numeric values to cap. Any index is accepted; values are never looked
        up by integer label.

    Returns
    -------
    pandas.Series or numpy.ndarray
        A new object of the same type (and, for a Series, the same index) with
        the capped values. The input is not modified, so callers must use the
        return value.
    """
    percentile_25, percentile_75 = np.percentile(series, [25, 75])
    IQR_data = percentile_75 - percentile_25

    lower_fence = percentile_25 - 1.5 * IQR_data
    upper_fence = percentile_75 + 1.5 * IQR_data

    # Vectorised clip replaces the per-element Python loop and avoids writing
    # into a column selection of the caller's DataFrame.
    return series.clip(lower_fence, upper_fence)

            
# Apply handle_outliers to cont7 and cont9, first in the training features and
# then in the test set; each table is capped using its own percentiles.
for frame in (df_x, test):
    for column in ('cont7', 'cont9'):
        frame[column] = handle_outliers(frame[column])
    
    

> #### distribution 

In [None]:
%matplotlib inline
column_reshape = np.array(df_x.columns).reshape(2,7)

fig, ax = plt.subplots(2,7,figsize = (20,5))
for i in range(ax.shape[0]):
    for j in range(ax.shape[1]):
        plot = ax[i,j].hist(df_x[column_reshape[i,j]],bins = 50, density = True, color = 'purple')
        ax[i,j].set_title(column_reshape[i,j])
        ax[i,j].axis('off')


In [None]:
## decrease skewness
# Replace cont7, cont11 and cont12 with log(1 + x), in the training features
# and then in the test set.
# NOTE: re-running this cell applies the transform a second time.
for frame in (df_x, test):
    for column in ('cont7', 'cont11', 'cont12'):
        frame[column] = np.log1p(frame[column])

## Train XGBoost model

In [None]:
## Split the train and validate data set
# Hold out 18% of the rows for validation; the fixed random_state makes the
# split reproducible.
trainX, testX, trainY, testY = train_test_split(
    df_x,
    df_y,
    test_size=0.18,
    random_state=2021,
)

In [None]:
# Wall-clock start, so the cell's final expression reports the fit duration in seconds.
ts = time.time()

# eval_metric and early_stopping_rounds are set on the estimator rather than
# passed to fit(): the fit() keywords were deprecated in xgboost 1.6 and
# removed in 2.0, where they raise TypeError.
# NOTE: this form needs xgboost >= 1.6; older releases do not honour
# early_stopping_rounds given to the constructor.
model = XGBRegressor(
    max_depth=20,
    n_estimators=300,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42,
    eval_metric="rmse",
    early_stopping_rounds=30,
)

# The held-out split is the evaluation set monitored for early stopping
# (stop after 30 rounds without RMSE improvement).
model.fit(
    trainX,
    trainY,
    eval_set=[(testX, testY)],
    verbose=True,
)

time.time() - ts

In [None]:
# Predict on every test column except the first.
test_features = test.iloc[:, 1:]
prediction_xgb = model.predict(test_features)

# Pair the first test column with the predictions (as a "target" column) and
# write the result to CSV without the index.
results = pd.Series(prediction_xgb, name="target")
first_test_column = test.iloc[:, 0]
submission = pd.concat([first_test_column, results], axis=1)

submission.to_csv("submission_xgb.csv", index=False)