# Instantiation

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Data input and exploration

In [None]:
# Read the competition's test and train tables into DataFrames.
test, train = (
    pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv"),
    pd.read_csv("../input/tabular-playground-series-jan-2021/train.csv"),
)

In [None]:
# Target before outlier removal: distribution on the left, box plot on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
# NOTE(review): distplot is deprecated in recent seaborn releases; kept so the
# cell still runs on older versions.
sns.distplot(train.target, ax=ax[0])
# Pass the vector as `x=`: a bare positional argument is interpreted differently
# across seaborn versions (x in older releases, `data` in newer ones).
# saturation is a proportion of the original colour saturation, so the previous
# value of 4.6 had no meaning beyond "do not desaturate"; 1 states that directly.
sns.boxplot(x=train.target, ax=ax[1], color="maroon", saturation=1)
plt.show()

## Outlier removal

Removing outliers: rows whose target lies more than 1.5 × IQR below the 1st quartile or more than 1.5 × IQR above the 3rd quartile are dropped (IQR = 3rd quartile − 1st quartile).

In [None]:
# Keep only rows whose target lies strictly inside the 1.5 * IQR fences.
# Both quartiles are computed once and reused.
q1, q3 = np.quantile(train.target, [0.25, 0.75])
outlier_band = (q3 - q1) * 1.5
low, high = q1 - outlier_band, q3 + outlier_band
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not operate on a slice (SettingWithCopyWarning).
train = train[(train.target > low) & (train.target < high)].copy()

In [None]:
# Target after outlier removal: distribution on the left, box plot on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
# NOTE(review): distplot is deprecated in recent seaborn releases; kept so the
# cell still runs on older versions.
sns.distplot(train.target, ax=ax[0], color="green")
# Pass the vector as `x=`: a bare positional argument is interpreted differently
# across seaborn versions (x in older releases, `data` in newer ones).
sns.boxplot(x=train.target, ax=ax[1], color="gold", saturation=0.6)
plt.show()

## Correlation checking between features and target variable

In [None]:
# Pairwise correlations of all columns. The boolean mask hides the upper
# triangle (diagonal included) so each pair is shown only once.
corr = train.corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(data=corr, mask=mask, annot=True, cmap="RdPu", ax=ax)

As we can see from the correlation matrix, some of the features are correlated with each other. We need to do something about them :(

# Feature engineering

**Let's have a look under the hood of our ML predictors, shall we?**

Using mutual_info_regression to capture even non-linear relationships between the target and the predictor variables

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Score every non-target column against the target with mutual information.
# k="all" keeps every column, so the selector is used only for its scores.
predictors = train.drop(columns=['target'])
fs = SelectKBest(score_func=mutual_info_regression, k="all")
X_n = fs.fit(predictors, train.target).transform(predictors)

In [None]:
# Pair each mutual-information score with the column it was computed for.
# The names come from the same "everything except target" selection that was
# passed to the selector, instead of from train.columns (which also contains
# `target` and only lined up with the scores because `target` is the last column).
# A length mismatch now raises instead of silently misaligning names and scores.
scored_columns = train.drop(['target'], axis=1).columns
score = pd.DataFrame({"feature": scored_columns, "scores": fs.scores_})
score = score.sort_values("scores", ascending=False)
sns.barplot(x=score.scores, y=score.feature)
plt.title("Importance of features")

### Let's have a look at the variance of each feature

In [None]:
# One box plot per column (id, cont1..cont14) on a 5x3 grid, filled row by row.
fig, axes = plt.subplots(5, 3, figsize=(30, 30))
box_columns = ["id"] + [f"cont{i}" for i in range(1, 15)]
for column, axis in zip(box_columns, axes.ravel()):
    # `x=` keeps the orientation explicit; a bare positional vector is
    # interpreted differently across seaborn versions.
    sns.boxplot(x=train[column], ax=axis, color="yellow", saturation=0.75)

### Regression plots:

In [None]:
# Regression plot of the target against each column (id, cont1..cont14) on a
# 5x3 grid, filled row by row.
fig, axes = plt.subplots(5, 3, figsize=(30, 30))
reg_columns = ["id"] + [f"cont{i}" for i in range(1, 15)]
for column, axis in zip(reg_columns, axes.ravel()):
    # x/y are passed by keyword: newer seaborn releases accept only `data`
    # positionally, so two positional vectors raise a TypeError there.
    sns.regplot(x=train[column], y=train.target, ax=axis, color="purple")

### Let's have a closer look at more interesting features
1. cont2
2. cont3
3. cont9
4. cont14

In [None]:
# cont2 against the target, with a vertical guide at every candidate bin edge.
plt.figure(figsize=(23, 8))
# x/y by keyword: newer seaborn releases reject two positional vectors.
sns.scatterplot(x=train.cont2, y=train.target, color="black")
cont2_guides = [
    (0.22, 'purple'),
    (0.35, 'violet'),
    (0.415, 'blue'),
    # Drawn at 0.489 so the guide matches the cont2 bin edge actually used when
    # the feature is binned (this line was previously drawn at 0.481).
    (0.489, 'green'),
    (0.549, 'yellow'),
    (0.612, 'orange'),
    (0.673, 'red'),
    (0.727, 'pink'),
    (0.747, 'black'),
]
for edge, guide_color in cont2_guides:
    plt.axvline(edge, color=guide_color)
plt.title("Binning cont2")

In [None]:
# cont3 against the target, with a vertical guide at every candidate bin edge.
plt.figure(figsize=(23, 8))
# x/y by keyword: newer seaborn releases reject two positional vectors.
sns.scatterplot(x=train.cont3, y=train.target)
for edge, guide_color in [(0.386, "black"), (0.78, "white")]:
    plt.axvline(edge, color=guide_color)
plt.title("Binning cont3")

In [None]:
# cont9 against the target, with a vertical guide at every candidate bin edge.
plt.figure(figsize=(23, 8))
# x/y by keyword: newer seaborn releases reject two positional vectors.
sns.scatterplot(x=train.cont9, y=train.target, color="green")
cont9_guides = [(0.108, "red"), (0.1165, "pink"), (0.559, "magenta"), (0.84, "orange")]
for edge, guide_color in cont9_guides:
    plt.axvline(edge, color=guide_color)
plt.title("Binning cont9")

In [None]:
# cont14 against the target, with a vertical guide at every candidate bin edge.
plt.figure(figsize=(23, 8))
# x/y by keyword: newer seaborn releases reject two positional vectors.
sns.scatterplot(x=train.cont14, y=train.target, color="blue")
cont14_guides = [(0.34, "yellow"), (0.525, "orange"), (0.66, "red"), (0.78, "pink")]
for edge, guide_color in cont14_guides:
    plt.axvline(edge, color=guide_color)
plt.title("Binning cont14")

Binning the data for features:
1. cont2 - [0.22, 0.35, 0.415, 0.489, 0.549, 0.612, 0.673, 0.727, 0.747]
2. cont3 - [0.386, 0.78]
3. cont9 - [0.108, 0.1165, 0.559, 0.84]
4. cont14 - [0.34, 0.525, 0.66, 0.78]

In [None]:
# Bin edges per feature; the first and last entries are the outer limits.
lims2 = [-0.1, 0.22, 0.35, 0.415, 0.489, 0.549, 0.612, 0.673, 0.727, 0.747, 0.9]
lims3 = [0, 0.386, 0.78, 1.5]
lims9 = [-0.2, 0.108, 0.1165, 0.559, 0.84, 5]
lims14 = [0, 0.34, 0.525, 0.66, 0.78, 0.9]

# (new column, source column, edges), in the order the new columns are added.
# Each interval gets an integer label 0..n-1, where n = number of edges - 1.
train_bin_plan = (
    ("c14", "cont14", lims14),
    ("c2", "cont2", lims2),
    ("c3", "cont3", lims3),
    ("c9", "cont9", lims9),
)
for new_col, source_col, edges in train_bin_plan:
    train[new_col] = pd.cut(
        train[source_col],
        bins=edges,
        labels=np.arange(0, len(edges) - 1),
        include_lowest=True,
    )

Now converting each binned (categorical) feature into one-hot indicator columns, one binary column per bin

In [None]:
# Expand each binned column into indicator columns named "<bin column>-<label>".
for binned, n_bins in (("c14", 5), ("c2", 10), ("c3", 3), ("c9", 5)):
    indicator_names = [f"{binned}-{k}" for k in range(n_bins)]
    train[indicator_names] = pd.get_dummies(train[binned])

Dropping the categorical bins which we had created

In [None]:
# The integer bin labels are no longer needed once the indicator columns exist.
train.drop(columns=["c2", "c3", "c9", "c14"], inplace=True)

## Correlation of all features with target

In [None]:
# Correlation of every non-target column with the target, largest value first,
# rendered as a colour-graded table.
target_corr = train.drop(columns=["target"]).corrwith(train.target)
corr_table = target_corr.to_frame().sort_values(0, ascending=False)
corr_table.style.background_gradient(cmap="RdPu")

# Trial models

### Creating the predictor and target sets

In [None]:
# Predictors are every column except the label and the row id; y is the label.
x = train.drop(columns=["target", "id"])
y = train["target"]

### Splitting the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a third of the rows for evaluation. The fixed random_state makes the
# split, and therefore every score reported below, reproducible across runs.
x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.33, random_state=42)

We will try with the following algorithms and use GridSearchCV to choose the parameters and the model. Please note that I have generally only mentioned the hyperparameter that I finally used for the model because my laptop is slow :(
1. XGBoost
2. CatBoost
3. KNeighborsRegressor
4. DecisionTreeRegressor
5. ExtraTreeRegressor
6. LightGBM


In [None]:
from sklearn.model_selection import GridSearchCV

### XGBoost

In [None]:
import xgboost as xgb

# `verbosity` is XGBRegressor's logging parameter; the previous `verbose=0` was
# not a recognised parameter and was just forwarded to the booster.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', verbosity=0)
xg_grid = {
    'colsample_bytree': [0.1],
    'learning_rate': [0.1, 0.01],
    'max_depth': [16],
    'alpha': [5],
    'n_estimators': [50],
}
XG = GridSearchCV(xg_reg, scoring='neg_mean_squared_error', cv=5, param_grid=xg_grid)
XG.fit(x_tr, y_tr)
# Hold-out score under the search's scorer, i.e. negative mean squared error.
print(XG.score(x_te, y_te))

### CatBoost

In [None]:
import catboost as ctb

# 5-fold grid search over a single CatBoost configuration, then the hold-out
# score under the search's scorer (negative mean squared error).
ctb_reg = ctb.CatBoostRegressor(verbose=0)
ctb_grid = {'learning_rate': [0.1], 'max_depth': [12]}
CB = GridSearchCV(
    estimator=ctb_reg,
    param_grid=ctb_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
CB.fit(x_tr, y_tr)
print(CB.score(x_te, y_te))

### KNeighborsRegressor

In [None]:
import sklearn.neighbors as knn

# 5-fold grid search comparing uniform and distance weighting at k=5, then the
# hold-out score under the search's scorer (negative mean squared error).
knn_reg = knn.KNeighborsRegressor(n_neighbors=5, algorithm='kd_tree')
knn_grid = {'n_neighbors': [5], 'weights': ['uniform', 'distance']}
KNN = GridSearchCV(
    estimator=knn_reg,
    param_grid=knn_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
KNN.fit(x_tr, y_tr)
print(KNN.score(x_te, y_te))

### Decision trees

In [None]:
import sklearn.tree as t

# The criterion is left at its default, which is the squared-error criterion in
# every scikit-learn version. The explicit 'mse' spelling is rejected by newer
# releases, where it was renamed to 'squared_error'.
dt_reg = t.DecisionTreeRegressor()
DT = GridSearchCV(dt_reg, scoring='neg_mean_squared_error', cv=5, param_grid={'max_depth': [16]})
DT.fit(x_tr, y_tr)
# Hold-out score under the search's scorer, i.e. negative mean squared error.
print(DT.score(x_te, y_te))

### ExtraTreeRegressor

In [None]:
import sklearn.tree as t

# The criterion is left at its default, which is the squared-error criterion in
# every scikit-learn version. The explicit 'mse' spelling is rejected by newer
# releases, where it was renamed to 'squared_error'.
# NOTE(review): sklearn.tree.ExtraTreeRegressor is a single randomized tree, not
# the sklearn.ensemble.ExtraTreesRegressor ensemble - confirm which was intended.
ext_reg = t.ExtraTreeRegressor()
ET = GridSearchCV(ext_reg, scoring='neg_mean_squared_error', cv=5, param_grid={'max_depth': [16]})
ET.fit(x_tr, y_tr)
# Hold-out score under the search's scorer, i.e. negative mean squared error.
print(ET.score(x_te, y_te))

### LightGBM

In [None]:
import lightgbm as lgb

# The estimator gets its own name. Assigning it to `lgb` overwrote the module
# alias, so running this cell a second time failed on `lgb.LGBMRegressor`.
lgb_reg = lgb.LGBMRegressor()
lgb_grid = {
    'max_depth': [8, 9, 10, 11, 12, 13, 14, 15, 16],
    'n_estimators': [1000, 5000],
    'learning_rate': [0.01],
}
L = GridSearchCV(lgb_reg, scoring='neg_mean_squared_error', cv=5, param_grid=lgb_grid)
L.fit(x_tr, y_tr)
# Hold-out score under the search's scorer, i.e. negative mean squared error.
L.score(x_te, y_te)

## LightGBM offers the best result

# Final model

Applying all the transformations to test set:

In [None]:
# Same bin edges as used for the training data.
lims2 = [-0.1, 0.22, 0.35, 0.415, 0.489, 0.549, 0.612, 0.673, 0.727, 0.747, 0.9]
lims3 = [0, 0.386, 0.78, 1.5]
lims9 = [-0.2, 0.108, 0.1165, 0.559, 0.84, 5]
lims14 = [0, 0.34, 0.525, 0.66, 0.78, 0.9]

# Step 1: integer bin labels, added in the order c14, c2, c3, c9.
test_bin_plan = (
    ("c14", "cont14", lims14),
    ("c2", "cont2", lims2),
    ("c3", "cont3", lims3),
    ("c9", "cont9", lims9),
)
for new_col, source_col, edges in test_bin_plan:
    test[new_col] = pd.cut(
        test[source_col],
        bins=edges,
        labels=np.arange(0, len(edges) - 1),
        include_lowest=True,
    )

# Step 2: indicator columns named "<bin column>-<label>", in the same order.
for binned, n_bins in (("c14", 5), ("c2", 10), ("c3", 3), ("c9", 5)):
    indicator_names = [f"{binned}-{k}" for k in range(n_bins)]
    test[indicator_names] = pd.get_dummies(test[binned])

# Keep the ids for the submission file before dropping them from the features.
idzz = test.id
test.drop(columns=["c2", "c3", "c9", "c14", "id"], inplace=True)

In [None]:
from sklearn.base import clone

# Refit only the configuration the grid search selected, on all training rows.
# Calling fit on the GridSearchCV object itself would repeat the entire search
# (every candidate times every fold) before producing a model.
model = clone(L.best_estimator_)
model.fit(x, y)
# Select the test columns by name, in the training order, so the prediction does
# not depend on the two frames happening to share a column order.
yhat = model.predict(test[x.columns])

In [None]:
# Two-column submission table: the saved test ids and the predictions,
# written to CSV without the index.
output = pd.DataFrame({"Id": idzz}).assign(target=yhat)
output.to_csv('submission.csv', index=False)