In [None]:
## importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split,KFold,RepeatedKFold,cross_val_score,cross_val_predict
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from tqdm import tqdm

In [None]:
## Read the competition's training and testing CSV files into DataFrames
train_data = pd.read_csv(
    "../input/tabular-playground-series-jan-2021/train.csv"
)
test_data = pd.read_csv(
    "../input/tabular-playground-series-jan-2021/test.csv"
)

In [None]:
## Preview the first rows of both frames, separated by an empty line
previews = [("Training dataset", train_data), ("Testing Dataset", test_data)]
for position, (heading, frame) in enumerate(previews):
    if position > 0:
        print("")
    print(heading)
    display(frame.head())

### Basic Descriptive statistics

In [None]:
## Unpack (rows, columns) once per frame, then report rows first and columns second
train_rows, train_cols = train_data.shape
test_rows, test_cols = test_data.shape

print("Number of Rows in the training Dataset :", train_rows)
print("NUmber of Rows in the Testing Dataset :", test_rows)

print("Number of Columns in the Training dataset :", train_cols)
print("Number of Columns in the Testing Dataset :", test_cols)

In [None]:
## Descriptive statistics of the training dataset.
## The call is the last expression of the cell, so its summary table is shown as the cell output.
train_data.describe()

In [None]:
### Descriptive statistics of the test dataset.
### The call is the last expression of the cell, so its summary table is shown as the cell output.
test_data.describe()

### Checking the Missing values

In [None]:
## Count NaN cells per column, then add the per-column counts into one total per frame
missing_in_train = train_data.isna().sum().sum()
missing_in_test = test_data.isna().sum().sum()

print("MIssing values in the Training Dataset :", missing_in_train)
print("Total Missing values in the Testing Dataset :", missing_in_test)

In [None]:
## Dropping the rows whose target is zero or negative from the training dataset
non_positive_rows = train_data.index[train_data["target"] <= 0]
train_data.drop(index=non_positive_rows, inplace=True)

In [None]:
## Renumber the remaining rows 0..n-1 in place so row labels match row positions again;
## drop=True discards the old index instead of keeping it as a new column.
train_data.reset_index(drop=True,inplace=True)

## Exploratory Data Analysis

**Univariate Analysis : Target**
- Let's see how the target variable is distributed.

In [None]:
plt.figure(figsize=(15, 5))

# Left panel: kernel density estimate of the target
density_ax = plt.subplot(1, 2, 1)
sbn.kdeplot(train_data.target, ax=density_ax)
density_ax.set_title("Distribution of target", size=15)

# Right panel: boxplot of the target
box_ax = plt.subplot(1, 2, 2)
sbn.boxplot(train_data.target, ax=box_ax)
box_ax.set_title("Boxplot of Target variable", size=15)

- The target does not follow a normal distribution.

**Why do we care whether the predictors or the target variable follow a normal distribution?**
- It is easier to describe the data.
- The entire distribution is described by just two numbers, i.e. the mean and the standard deviation.
- It makes the maths simple.
- It helps to reduce the effect of outliers.


- During training you can use transformation techniques such as Box-Cox, log, or sqrt to get a distribution closer to normal.
- At prediction time you can apply the inverse transformation to get back the actual values.

If we are using linear regression:
- Suppose all the predictors and the target follow a Gaussian distribution; then the resulting regression line will suffer from little variance when predicting on unseen data.

So usually, if the features follow a close-to-normal distribution, the model will perform well.

### Plot the distribution of the cont1 to cont7 predictors


In [None]:
plt.figure(figsize=(10, 5))
for panel in range(7):
    # slots 1..7 of a 2x4 grid show the columns at positions 1..7
    plt.subplot(2, 4, panel + 1)
    sbn.kdeplot(train_data.iloc[:, panel + 1])
plt.suptitle("The Distribution first 7 Predicators", size=14)

In [None]:
plt.figure(figsize=(15, 7))
for panel in range(7):
    # slots 1..7 of a 2x4 grid show boxplots of the columns at positions 1..7
    plt.subplot(2, 4, panel + 1)
    sbn.boxplot(train_data.iloc[:, panel + 1])
plt.suptitle("The Distribution first 7 Predicators", size=14)

### Plotting the distribution of cont8 to cont14

In [None]:
plt.figure(figsize=(10, 5))
for panel in range(7):
    # slots 1..7 of a 2x4 grid show the columns at positions 8..14
    plt.subplot(2, 4, panel + 1)
    sbn.kdeplot(train_data.iloc[:, panel + 8])
plt.suptitle("The Distribution Next 7 Predicators", size=14)

In [None]:
plt.figure(figsize=(15, 7))
for panel in range(7):
    # slots 1..7 of a 2x4 grid show boxplots of the columns at positions 8..14
    plt.subplot(2, 4, panel + 1)
    sbn.boxplot(train_data.iloc[:, panel + 8])
plt.suptitle("Boxplots of Next 7 Predicators", size=14)

- There are some outliers in cont7 and cont9.

**Algorithms :**
- Linear regression and SVM are not robust to outliers.
- Tree-based algorithms like Gradient Boosting and Random Forest are robust to outliers.

**If we are planning to use linear regression or SVM, then we need to handle the outliers; instead of removing them, use some transformation technique or adjust the extreme values.**


### Treatment of Outliers
- The predictors cont7 and cont9 have outliers.
- The target variable also has some outliers.

Tree-based methods are robust to outliers, so we are not going to treat them; we are not sure about the target variable.

**Even if we applied some transformations to change the distribution and adjust the outliers, it would be of no use.**

### Correlation Matrix

- This tells us how the features are related to each other, and also how the features are related to the target.
- If you want to see the relationship between the features visually, you can use scatter plots.

In [None]:
### Absolute pairwise correlations of every column except the first one
all_but_first = train_data.iloc[:, 1:]
corr = all_but_first.corr().abs()

### Annotated heatmap of the correlation magnitudes
plt.figure(figsize=(12, 8))
sbn.heatmap(corr, annot=True, cmap="Blues_r")

### Getting top 10 highly correlated variables

In [None]:
# Names and raw matrix of the correlation table computed earlier
columns = corr.index.tolist()
corel = corr.values

# Every unordered pair (a, b) with a < b, i.e. the strict upper triangle in row-major order
pair_positions = [
    (a, b)
    for a in range(len(columns))
    for b in range(a + 1, len(columns))
]
var1 = [columns[a] for a, _ in pair_positions]
var2 = [columns[b] for _, b in pair_positions]
value = [corel[a, b] for a, b in pair_positions]

# One row per pair, strongest correlation first
corr_df = pd.DataFrame({"var1": var1, "var2": var2, "value": value})
corr_df = corr_df.sort_values(by="value", ascending=False)

In [None]:
## Show the first 10 rows of the pair table (displayed as the cell output)
corr_df.head(10)

- Here the predictor cont1 has high correlation with cont12, cont9, and cont10
- and cont6 exhibits high correlation with cont13, cont10, and cont12

**Having multicollinear variables doesn't affect the predictive power of a model. It affects the coefficients of the features, so if we have multicollinear variables we cannot interpret the model properly.**

-> So linear regression, ridge, or lasso regression are not robust to highly correlated variables.

In [None]:
## Let's observe the relation between the target variable and the other features:
## keep the pairs whose second member is the target, strongest first
target_pairs = corr_df[corr_df["var2"] == "target"]
target_pairs.sort_values("value", ascending=False)

- There is no strong relation between the features and target variable.

### Comparing the Distributions of  Features in Training and Testing dataset
- Generally we assume that the unseen data will come from the same distribution as the training data.
- But in real cases this is not always possible.
- We mostly see differing distributions between datasets in time-series problems, because the data varies with time.

Lets check!!

In [None]:
## Let's check the first 7 features from both datasets.
## Each panel overlays the training and testing density of one column.
plt.figure(figsize=(15,6))
for i in range(1,8):
    plt.subplot(240+i)
    sbn.kdeplot(train_data.iloc[:,i],label="train")
    sbn.kdeplot(test_data.iloc[:,i],label="test")
    # Title comes from the plotted column itself, so this cell does not rely on
    # a name list left behind by another cell.
    plt.title(train_data.columns[i])
    # The label= values are only shown once a legend is requested explicitly.
    plt.legend()
plt.suptitle("The Distributions of Features in Training and Testing dataset :",size=14)

In [None]:
## Same comparison for the columns at positions 8..14 of both datasets.
plt.figure(figsize=(15,6))
for i in range(1,8):
    plt.subplot(240+i)
    sbn.kdeplot(train_data.iloc[:,i+7],label="train")
    sbn.kdeplot(test_data.iloc[:,i+7],label="test")
    # Title comes from the plotted column itself, so this cell does not rely on
    # a name list left behind by another cell.
    plt.title(train_data.columns[i+7])
    # The label= values are only shown once a legend is requested explicitly.
    plt.legend()
plt.suptitle("The Distributions of Features in Training and Testing dataset :",size=14)

- The training and testing datasets seem to come from the same distributions.

**What if we have different distributions? How do we deal with that?**

There are different methods to solve this:
- using subsampling: https://maxhalford.github.io/blog/subsampling-1/
- reweighting the training data so that the distribution of the training set is closer to the distribution of the test set, using the Kullback-Leibler Importance Estimation Procedure

## Feature Engineering
- Creating some aggregate features
- such as mean, max, min, standard deviation, kurtosis, median, and skew

In [None]:
def Feature_engineering(data):
    """Build row-wise aggregate and interaction features from ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric predictor columns. Must contain the columns ``cont2``,
        ``cont3``, ``cont4``, ``cont5``, ``cont6``, ``cont7``, ``cont11``
        and ``cont12``, which the interaction features are built from.

    Returns
    -------
    pandas.DataFrame
        A frame with the same index as ``data`` and the columns ``mean``,
        ``max``, ``min``, ``std``, ``kurtosis``, ``median``, ``skew``
        (computed across all columns of each row) followed by ``feat1`` to
        ``feat6`` (sums and products of selected columns).

    Raises
    ------
    KeyError
        If one of the required ``cont*`` columns is missing from ``data``.
    """
    Agg_df=pd.DataFrame(index=data.index)

    # Row-wise summary statistics over every column that was passed in
    Agg_df["mean"]=data.mean(axis=1)
    Agg_df["max"]=data.max(axis=1)
    Agg_df["min"]=data.min(axis=1)
    Agg_df["std"]=data.std(axis=1)
    Agg_df["kurtosis"]=data.kurtosis(axis=1)
    Agg_df["median"]=data.median(axis=1)
    Agg_df["skew"]=data.skew(axis=1)

    # Interaction features are read from the `data` argument (not from a
    # notebook-level frame) so the function works for any frame, e.g. test data.
    Agg_df["feat1"]=data["cont5"]+data["cont4"]+data["cont11"]+data["cont3"]
    Agg_df["feat2"]=data["cont2"]+data["cont7"]+data["cont11"]+data["cont3"]
    Agg_df["feat3"]=data["cont7"]*data["cont2"]*data["cont3"]
    Agg_df["feat4"]=data["cont7"]*data["cont2"]*data["cont11"]
    Agg_df["feat5"]=data["cont7"]*data["cont2"]*data["cont12"]
    Agg_df["feat6"]=data["cont7"]*data["cont2"]*data["cont6"]
    return Agg_df


In [None]:
## Engineer features from every column except the first and the last one
inner_columns = train_data.iloc[:, 1:-1]
agg_df = Feature_engineering(inner_columns)

In [None]:
plt.figure(figsize=(15, 5))
for slot in range(1, 10):
    # slots 1..9 of a 2x5 grid show the first nine engineered columns
    plt.subplot(2, 5, slot)
    sbn.kdeplot(agg_df.iloc[:, slot - 1])
plt.suptitle("The Distribution of Agregated Features", size=14)

- Some of the features follow close to a normal distribution, namely mean, skew, and std.

In [None]:
## Correlation of each engineered feature column with the target (displayed as the cell output)
agg_df.corrwith(train_data.target)

- These features also don't have a strong relationship with the target variable.

In [None]:
## Pairwise correlation matrix of the engineered features (displayed as the cell output)
agg_df.corr()

- The variables "skew" and "median" are highly correlated.
- The "mean" is highly correlated with the 25_quantile and 75_quantile features and with the median.

## Experiment 1: without adding derived Features

### 1. Decision Trees

In [None]:
import time

In [None]:
## Predictor matrix: every column except the identifier and the label
x = train_data.drop(["id", "target"], axis=1)
#x=pd.concat([x,agg_df.iloc[:,:]],axis=1)
## Label vector
y = train_data.target



In [None]:
### Splitting the dataset: 80% for training, 20% held out for validation.
### A fixed random_state makes the split identical on every Restart & Run All.
x_train,x_val,y_train,y_val=train_test_split(x,y,test_size=0.2,random_state=1)
print("The shape of the training set :",x_train.shape,y_train.shape)
print("The shape of the validation set :",x_val.shape,y_val.shape)


In [None]:
## Displays the current Unix timestamp as the cell output; the value is not assigned to any name.
## NOTE(review): looks like leftover scratch work — confirm it can be removed.
time.time()

## Algorithm Selection and Model Selection using k-fold

In [None]:
%%time

## 5-fold cross-validation of a depth-limited decision tree.
## For each fold the tree is refit and the RMSE on the training part and on the
## held-out part is recorded; the averages over the folds are printed at the end.
dt=DecisionTreeRegressor(max_depth=8,random_state=1)  # seeded so reruns give the same tree
cv = KFold(n_splits=5, random_state=1, shuffle=True)
score_train=[]
score_test=[]

for i,(train_ind,test_ind) in enumerate(cv.split(x),start=1):
    start=time.time()
    print("Fold {}".format(i))
    # KFold.split yields positional indices, so both x and y are sliced with
    # .iloc; a plain y[...] would be a label lookup and only match positions
    # when the index happens to be 0..n-1.
    x_train=x.iloc[train_ind]
    y_train=y.iloc[train_ind]
    x_val=x.iloc[test_ind]
    y_val=y.iloc[test_ind]
    dt.fit(x_train,y_train)

    y_train_pre=dt.predict(x_train)
    y_val_pre=dt.predict(x_val)
    # RMSE = square root of the mean squared error
    tr_error=np.sqrt(metrics.mean_squared_error(y_train,y_train_pre))
    te_error=np.sqrt(metrics.mean_squared_error(y_val,y_val_pre))
    score_test.append(te_error)
    score_train.append(tr_error)
    print("-------------->time taken {0:.2f} sec".format(time.time()-start))


print("="*50)
# The printed values are the per-fold RMSEs averaged over the folds
print("The Total Training  Error is :",sum(score_train)/len(score_train))
print("The Total validation Error is :",sum(score_test)/len(score_test))



### Random Forest Model Selection

In [None]:
%%time
## 5-fold cross-validation of a random forest.
## For each fold the forest is refit and the RMSE on the training part and on the
## held-out part is recorded; the averages over the folds are printed at the end.
rf=RandomForestRegressor(max_depth=10,n_estimators=30,n_jobs=-1,random_state=1)  # seeded so reruns give the same forest
cv = KFold(n_splits=5, random_state=1, shuffle=True)
score_train=[]
score_test=[]

for i,(train_ind,test_ind) in enumerate(cv.split(x),start=1):
    start=time.time()
    print("Fold {}".format(i))
    # KFold.split yields positional indices, so both x and y are sliced with
    # .iloc; a plain y[...] would be a label lookup and only match positions
    # when the index happens to be 0..n-1.
    x_train=x.iloc[train_ind]
    y_train=y.iloc[train_ind]
    x_val=x.iloc[test_ind]
    y_val=y.iloc[test_ind]
    ## fitting the dataset
    rf.fit(x_train,y_train)

    y_train_pre=rf.predict(x_train)
    y_val_pre=rf.predict(x_val)
    # RMSE = square root of the mean squared error
    tr_error=np.sqrt(metrics.mean_squared_error(y_train,y_train_pre))
    te_error=np.sqrt(metrics.mean_squared_error(y_val,y_val_pre))
    score_test.append(te_error)
    score_train.append(tr_error)
    print("-------------->time taken {0:.2f} sec".format(time.time()-start))


print("="*50)
# The printed values are the per-fold RMSEs averaged over the folds
print("The Total Training  Error is :",sum(score_train)/len(score_train))
print("The Total validation Error is :",sum(score_test)/len(score_test))


### XGboost Regressor

In [None]:
%%time
## 5-fold cross-validation of an XGBoost regressor.
## For each fold the model is refit and the RMSE on the training part and on the
## held-out part is recorded; the averages over the folds are printed at the end.
## NOTE(review): tree_method='gpu_hist' presumably needs a GPU runtime — confirm
## against the environment and the installed XGBoost version.
xgb=XGBRegressor(n_estimators=4000,learning_rate=0.01,max_depth=3,tree_method='gpu_hist')
cv = KFold(n_splits=5, random_state=1, shuffle=True)
score_train=[]
score_test=[]

for i,(train_ind,test_ind) in enumerate(cv.split(x),start=1):
    start=time.time()
    print("Fold {}".format(i))
    # KFold.split yields positional indices, so both x and y are sliced with
    # .iloc; a plain y[...] would be a label lookup and only match positions
    # when the index happens to be 0..n-1.
    x_train=x.iloc[train_ind]
    y_train=y.iloc[train_ind]
    x_val=x.iloc[test_ind]
    y_val=y.iloc[test_ind]
    ## fitting the dataset
    xgb.fit(x_train,y_train)

    ## prediction
    y_train_pre=xgb.predict(x_train)
    y_val_pre=xgb.predict(x_val)

    ## error metric: RMSE = square root of the mean squared error
    tr_error=np.sqrt(metrics.mean_squared_error(y_train,y_train_pre))
    te_error=np.sqrt(metrics.mean_squared_error(y_val,y_val_pre))
    score_test.append(te_error)
    score_train.append(tr_error)
    print("-------------->time taken {0:.2f} sec".format(time.time()-start))


print("="*50)
# The printed values are the per-fold RMSEs averaged over the folds
print("The Total Training  Error is :",sum(score_train)/len(score_train))
print("The Total validation Error is :",sum(score_test)/len(score_test))


### Creating a submission File.

In [None]:
## Final model used for the submission.
## Only documented XGBoost parameters are passed: `metric_period` (not an XGBoost
## parameter) was dropped and the deprecated `silent=1` was replaced by its
## documented successor `verbosity=0`.
regressor = XGBRegressor(
                 colsample_bytree=0.5,
                 alpha=0.01563,          # XGBoost's alias for reg_alpha (L1 regularisation)
                 #gamma=0.0,
                 learning_rate=0.01,
                 max_depth=10,
                 min_child_weight=257,
                 n_estimators=4000,
                 #reg_alpha=0.9,
                 reg_lambda=0.003,
                 subsample=0.7,
                 random_state=2020,
                 tree_method='gpu_hist',
                 verbosity=0)

## x_train / y_train / x_val / y_val hold whatever split the most recently run
## cell assigned to them; (x_val, y_val) is the evaluation set monitored for
## early stopping.
## NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
## constructor rather than on fit() — confirm against the installed version.
regressor.fit(x_train, y_train, early_stopping_rounds=6, eval_set=[(x_val, y_val)], verbose=1)

In [None]:
#aa=Feature_engineering(test_data.iloc[:,1:])
#x_test=pd.concat([test_data,aa],axis=1)
## Predict on every test column except the first one
test_features = test_data.iloc[:, 1:]
test_pre = regressor.predict(test_features)

In [None]:
## RMSE of the fitted regressor on the validation rows.
## The score is computed on (x_val, y_val), matching the `val_pre` name, instead
## of on the rows the model was trained on; an MSE is never negative, so no
## absolute value is needed before the square root.
val_pre=regressor.predict(x_val)
mse=metrics.mean_squared_error(y_val,val_pre)
np.sqrt(mse)

In [None]:
## Two-column submission frame: the test ids next to their predicted targets
submit = pd.DataFrame({"id": test_data["id"], "target": test_pre})

In [None]:
## Write the submission frame to submission1.csv; index=False leaves the row index out of the file
submit.to_csv("submission1.csv",index=False)