In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

Hello Kagglers!

In this notebook I will try to find out which regression algorithms can improve the score for this particular dataset, and with it my position in the Kaggle competition. I found this competition challenging: the dataset is a bit tricky, and I think there isn't much left to improve because it has already been preprocessed for us (correct me if I'm wrong). The only thing that comes to mind is to choose the right algorithm and find the right hyperparameters.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from scipy.stats import skew

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [None]:
sample_sub = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv')

In [None]:
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/train.csv')
train_df

In [None]:
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2021/test.csv")
test_df

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df.describe().T

#### Take a sample of a dataset

In [None]:
# Work on a 40% subsample to keep run time manageable.
# A fixed seed makes every downstream number reproducible under Restart & Run All.
train_df = train_df.sample(frac=0.4, random_state=42)

### Outliers

In [None]:
# Skewness and kurtosis of every column except the row identifier,
# ordered from most to least positively skewed.
columns_without_id = train_df.drop(columns="id")

outliers_df = (
    pd.DataFrame({"skewness": columns_without_id.skew(),
                  "kurtosis": columns_without_id.kurtosis()})
    .sort_values(by='skewness', ascending=False)
)
outliers_df

In [None]:
fig, axes = plt.subplots(figsize=(14, 4), nrows=1, ncols=2)

# Each panel is sorted by its own statistic, so each panel must also use its
# own label order; reusing the skewness-sorted index for kurtosis would put
# the wrong feature name under each bar.
skew_sorted = outliers_df['skewness'].sort_values(ascending=False)
kurt_sorted = outliers_df['kurtosis'].sort_values(ascending=False)

for ax, stat, title in zip(axes, (skew_sorted, kurt_sorted), ("Skewness", "Kurtosis")):
    sns.barplot(x=stat.index, y=stat.values, palette="mako_r", ax=ax)
    ax.set_title(title)
    ax.set_xticklabels(stat.index, rotation=45)


Kurtosis is a statistical measure that defines how heavily the tails of a distribution differ from the tails of a normal distribution. In other words, kurtosis identifies whether the tails of a given distribution contain extreme values.

Skewness is a measure of the asymmetry of the probability distribution of a real-valued random variable about its mean. The skewness value can be positive, zero, negative, or undefined.

Skewness essentially measures the relative size of the two tails. Kurtosis is a measure of the combined sizes of the two tails.

In [None]:
sns.distplot(train_df['target'], kde=True, color='r')

In [None]:
sns.boxplot(x=train_df['target'])

In [None]:
ser = pd.Series(train_df['target'])
ser.describe()

In [None]:
# Tukey fences for the target: anything beyond 1.5 * IQR from the quartiles
# is treated as an outlier.
q1, q3 = np.quantile(train_df['target'], [0.25, 0.75])
iqr = q3 - q1

lower_outlier, upper_outlier = q1 - (1.5 * iqr), q3 + (1.5 * iqr)

In [None]:
# Keep rows whose target lies inside BOTH fences; previously only the lower
# fence was applied and upper_outlier was computed but never used.
train_df_without_outliers = train_df[
    train_df['target'].between(lower_outlier, upper_outlier)
].copy()

In [None]:
len(train_df_without_outliers)

In [None]:
sns.boxplot(x=train_df_without_outliers['target'])

In [None]:
sns.distplot(train_df_without_outliers['target'], kde=True)

Usually, at this point we would think about whether to drop outliers, but I already tried that in a previous run and it had no effect on the results.

### Correlation

In [None]:
fig, ax = plt.subplots(figsize=(12,8), dpi=150)
corr_matrix = train_df.drop('id',axis=1).corr()

sns.heatmap(corr_matrix, mask=corr_matrix < 0.8, annot=True, ax=ax, cmap='coolwarm')

In [None]:
corr_matrix['target'].sort_values(ascending=False)

In [None]:
# Drop cont12 from both frames. Reassigning (instead of inplace=True) together
# with errors="ignore" keeps the cell idempotent: re-running it no longer
# raises KeyError once the column is gone.
train_df = train_df.drop(columns="cont12", errors="ignore")
test_df = test_df.drop(columns="cont12", errors="ignore")

In [None]:
# Correlation of every feature with the target, strongest first.
# The first entry after sorting is skipped (target's correlation with itself),
# and the row identifier is removed.
corr_with_target = (
    train_df.corr()['target']
    .sort_values(ascending=False)
    .iloc[1:]
    .drop('id')
)

In [None]:
import matplotlib as mpl

In [None]:
colors = list(reversed(px.colors.qualitative.Dark24))
#colors = list(reversed(['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']))

fig = go.Figure(go.Bar(
    x = corr_with_target.values,
    y = corr_with_target.index,
    text = corr_with_target.values,
    textposition ="auto",
    texttemplate = "%{value:,s}",
    marker_color = colors,
    orientation = "h",
))
fig.update_traces(
    #marker_line_color = "black",
    marker_line_width = 1,
    opacity = 0.8,
)
fig.update_layout(
    title = "Features Correlation to the Target Column"
)
fig.show()

As always, we should check for multicollinearity. In this case we can try regularization methods such as Ridge, Lasso, or ElasticNet.

#### Spearman’s Correlation

In [None]:
fig, ax = plt.subplots(figsize=(12,8), dpi=150)
sns.heatmap(train_df.drop('id',axis=1).corr(method='spearman'), annot=True, ax=ax, cmap='coolwarm')

A common approach for highly correlated features is dimensionality reduction.

In [None]:
# Feature columns only (no identifier, no target)
num_cols = train_df.columns.drop(["id", "target"])

# Features whose skewness exceeds 0.5, most skewed first
skewed_feat = (
    train_df[num_cols]
    .skew()
    .sort_values(ascending=False)
    .loc[lambda s: s > 0.5]
)
skewed_index = skewed_feat.index

In [None]:
# Remove rows that fall outside the Tukey fences (1.5 * IQR) of ANY skewed feature.
# The mask is accumulated across columns; previously clean_train_df was
# overwritten on every assignment, so only the last column's lower fence was
# applied, and the name was undefined when no feature was skewed.
keep_mask = pd.Series(True, index=train_df.index)

for col in skewed_index:
    col_q1, col_q3 = np.quantile(train_df[col], [0.25, 0.75])
    col_iqr = col_q3 - col_q1
    lower_limit = col_q1 - (1.5 * col_iqr)
    upper_limit = col_q3 + (1.5 * col_iqr)

    # between() is inclusive on both ends, matching the original >= / <= checks
    keep_mask &= train_df[col].between(lower_limit, upper_limit)

clean_train_df = train_df[keep_mask].copy()

### Normalize data

In [None]:
# Feature matrix and target are taken from the outlier-filtered training rows
X = clean_train_df.drop(['target','id'], axis=1)
y = clean_train_df['target']
# Test features as a bare ndarray (column labels are lost by .values);
# the ids are kept separately for building submission files
test = test_df.drop("id", axis=1).values
id_col = test_df['id'].values

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to the test features. Both results are wrapped back into
# DataFrames (fresh 0..n-1 index) so the column names survive.
sc = StandardScaler()

scaled_train = pd.DataFrame(sc.fit_transform(X), columns=X.columns)
scaled_test = pd.DataFrame(sc.transform(test), columns=test_df.columns.drop("id"))

In [None]:
from sklearn.metrics import r2_score

In [None]:
def evaluateModel(model, train_X=None, train_y=None, eval_X=None, eval_y=None):
    """
    Fit `model` and report its hold-out performance.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    train_X, train_y : optional
        Data to fit on. Default to the notebook-level ``X_train`` / ``y_train``.
    eval_X, eval_y : optional
        Data to score on. Default to the notebook-level ``X_test`` / ``y_test``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2, y_pred, model)`` - mean absolute error, root mean
        squared error, R-squared, the predictions on the evaluation features,
        and the fitted model (the same object that was passed in).

    The three metrics are also printed.
    """
    # Resolve the defaults at call time so evaluateModel(model) keeps using
    # whatever split the notebook currently holds.
    train_X = X_train if train_X is None else train_X
    train_y = y_train if train_y is None else train_y
    eval_X = X_test if eval_X is None else eval_X
    eval_y = y_test if eval_y is None else eval_y

    model.fit(train_X, train_y)

    y_pred = model.predict(eval_X)

    mae = mean_absolute_error(eval_y, y_pred)
    rmse = np.sqrt(mean_squared_error(eval_y, y_pred))
    score_r2 = r2_score(eval_y, y_pred)

    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    print(f"R-square: {score_r2}")

    return mae, rmse, score_r2, y_pred, model

### Split the data

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(scaled_train, y, 
                                                    test_size=0.3, 
                                                    random_state=42)

## Linear regression

Linear Regression can not work on all datasets. For a linear regression algorithm to work properly, it has to pass at least the following five assumptions:

1. Linear relationship - the relation between independent and dependent features should be linear. Scatter plot is a good way to visualize it.
2. Multivariate normality - each variable separately should follow a bell-shaped curve. This can be tested by plotting a histogram.
3. No Multicollinearity - Multicollinearity happens when the independent variables are highly correlated with each other. Can be tested with correlation matrix.
4. No Autocorrelation - Autocorrelation means a single column data values are related to each other. Test it with scatterplot.
5. Homoscedasticity - This means “same variance”. In other words, the residuals have equal spread along the regression line. Homoscedasticity can also be tested using a scatter plot.

 ### ElasticNet CV

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
elastic_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], tol=0.01)

In [None]:
# Consistent "el_" prefix for all ElasticNetCV results (was a mix of el_/le_/lr_)
el_cv_mae, el_cv_rmse, el_cv_r2, el_y_pred, elastic_model = evaluateModel(elastic_model)

For l1_ratio = 0 the penalty is an L2 penalty (Ridge). For l1_ratio = 1 it is an L1 penalty (Lasso). It looks like Lasso will be the better model to choose.

### Lasso CV

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
lassoCV_model = LassoCV(eps=0.01, n_alphas=100,cv=10)

In [None]:
la_cv_mae, la_cv_rmse, la_cv_r2, la_y_pred, lassoCV_model = evaluateModel(lassoCV_model)

In [None]:
lassoCV_model.get_params()

In [None]:
# Mean of target column
train_df['target'].mean()

### Residuals

In [None]:
residuals = pd.Series(y_test - la_y_pred,name='residuals')

In [None]:
residuals

In [None]:
sns.scatterplot(x=y_test, y=residuals)
plt.axhline(y=0, color='red', ls='--')

In [None]:
sns.distplot(residuals, bins=40, kde=True)

In [None]:
import scipy as sp

In [None]:
fig, ax = plt.subplots(figsize=(4,3), dpi=120)

_ = sp.stats.probplot(residuals, plot=ax)

#### Feature importance

In [None]:
lassoCV_model.coef_

In [None]:
# LassoCV coefficients, largest first. The integer index still records each
# coefficient's original column position.
coef_ser = pd.Series(lassoCV_model.coef_).sort_values(ascending=False)

In [None]:
colors = list(reversed(px.colors.qualitative.Dark24))

# coef_ser is sorted by value, but its integer index still holds each
# coefficient's original column position. Use it to look up the matching
# feature names; plotting against scaled_train.columns directly would pair the
# sorted values with unsorted names.
feature_labels = scaled_train.columns[coef_ser.index]

fig = go.Figure(go.Bar(
    x = feature_labels,
    y = coef_ser.values,
    text = coef_ser.values,
    textposition = 'auto',
    texttemplate = '%{value:,s}',
    marker_color = colors,
    orientation = 'v',
))
fig.update_traces(
    marker_line_width = 1,
    opacity = 0.8,
)
fig.update_layout(
    title = "Feature importances via coefficients in LassoCV"
)
fig.show()

In [None]:
# Let's check if our model is the best model

### Ridge CV

In [None]:
from sklearn.linear_model import RidgeCV

In [None]:
ridgeCV_model = RidgeCV(alphas=(0.1, 1.0, 10.0),scoring='neg_mean_absolute_error', cv=10)

In [None]:
rid_cv_mae, rid_cv_rmse, rid_cv_r2, rid_y_pred, ridgeCV_model = evaluateModel(ridgeCV_model)

In [None]:
ridgeCV_model.alpha_

In [None]:
ridgeCV_model.coef_

#### Residuals

In [None]:
residuals_ridge = pd.Series(y_test - rid_y_pred, name='residuals')
residuals_ridge

In [None]:
sns.distplot(residuals_ridge, bins=40, kde=True)

In [None]:
sns.scatterplot(x=y_test, y=residuals_ridge)
plt.axhline(y=0, color='red',ls='--')

In [None]:
fig, ax = plt.subplots(figsize=(4,3), dpi=120)

_ = sp.stats.probplot(residuals_ridge, plot=ax)

#### Feature importance

In [None]:
# RidgeCV coefficients, largest first. The integer index records each
# coefficient's original column position.
coef_ser = pd.Series(ridgeCV_model.coef_)
coef_ser = coef_ser.sort_values(ascending=False)

# Look up the feature name for each sorted coefficient; plotting against
# scaled_train.columns directly would pair sorted values with unsorted names.
feature_labels = scaled_train.columns[coef_ser.index]

colors = list(reversed(px.colors.qualitative.Dark24))

fig = go.Figure(go.Bar(
    x = feature_labels,
    y = coef_ser.values,
    text = coef_ser.values,
    textposition = 'auto',
    texttemplate = '%{value:,s}',
    marker_color = colors,
    orientation = 'v',
))
fig.update_traces(
    marker_line_width = 1,
    opacity = 0.8,
)
fig.update_layout(
    title = "Feature importances via coefficients in RidgeCV"
)
fig.show()

### SVM model

In [None]:
from sklearn.svm import LinearSVR

svr_model = LinearSVR(max_iter=1000000)

svr_mae, svr_rmse, svr_r2, svr_pred, svr_model = evaluateModel(svr_model)

#### Residuals

In [None]:
residuals_svr = pd.Series(y_test - svr_pred,name=' LinearSVC residuals')
residuals_svr

In [None]:
sns.distplot(residuals_svr, bins=40,kde=True)

In [None]:
sns.scatterplot(x=y_test, y=residuals_svr)
plt.axhline(y=0, color='red',ls='--')
plt.xlabel("y_actual")

It seems that this dataset is not well suited to linear regression (please correct me if I am wrong). In other words, if the residual plot shows a clear pattern, linear regression is probably not a good choice.

There is a constant error between the residuals and the actual data, which leads us to a very sophisticated term, homoscedasticity — a word I still struggle to pronounce :).

In [None]:
fig, ax = plt.subplots(figsize=(4,3), dpi=120)

_ = sp.stats.probplot(residuals_svr, plot=ax)

plt.title("Probability plot")

There is an issue with the dataset, because the residuals deviate from the straight line in the probability plot.

#### Feature importance

In [None]:
svr_model.coef_

In [None]:
# LinearSVR coefficients, largest first. The integer index records each
# coefficient's original column position.
coef_ser = pd.Series(svr_model.coef_)
coef_ser = coef_ser.sort_values(ascending=False)

# Look up the feature name for each sorted coefficient; plotting against
# scaled_train.columns directly would pair sorted values with unsorted names.
feature_labels = scaled_train.columns[coef_ser.index]

colors = list(reversed(px.colors.qualitative.Dark24))

fig = go.Figure(go.Bar(
    x = feature_labels,
    y = coef_ser.values,
    text = coef_ser.values,
    textposition = 'auto',
    texttemplate = '%{value:,s}',
    marker_color = colors,
    orientation = 'v',
))
fig.update_traces(
    marker_line_width = 1,
    opacity = 0.8,
)
fig.update_layout(
    title = "Feature importances via coefficients in LinearSVR"
)
fig.show()

### Training final models

In [None]:
from sklearn.linear_model import Lasso, Ridge

In [None]:
# lasso_model = Lasso(alpha=lassoCV_model.alpha_).fit(scaled_train, y)

In [None]:
# ridge_model = Ridge(alpha=ridgeCV_model.alpha_).fit(scaled_train, y)

In [None]:
# linearSVR_model = LinearSVR(max_iter=1000000).fit(scaled_train, y)

## Make predictions and save it.

In [None]:
#lasso_sub = pd.DataFrame(data={'id': id_col,
                               #'target':lassoCV_model.predict(scaled_test)})

#ridge_sub = pd.DataFrame(data={'id':id_col,
                               #'target':ridge_model.predict(scaled_test)})

#linearSVR_sub = pd.DataFrame(data={'id':id_col,
                                   #'target': linearSVR_model.predict(scaled_test)})


#print(len(lasso_sub['id']) == len(sample_sub['id']))
#print(len(ridge_sub['id']) == len(sample_sub['id']))
#print(len(linearSVR_sub['id']) == len(sample_sub['id']))

In [None]:
# lasso_sub.to_csv("submission_lasso.csv", index=False)
# ridge_sub.to_csv("submission_ridge.csv", index=False)
# linearSVR_sub.to_csv("submission_linearSVR.csv")

### Search for the best hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

lasso_model = Lasso(max_iter=1000000)

param_grid = {'alpha': [0.005, 0.02, 0.03, 0.05, 0.06, 0.1, 0.5, 1, 10, 100]}

lasso_grid = GridSearchCV(lasso_model,param_grid, cv=10, scoring='neg_mean_squared_error')
lasso_grid.fit(X_train, y_train)

In [None]:
lasso_grid.best_params_

In [None]:
abs(lasso_grid.best_score_)

In [None]:
lasso_y_grid = lasso_grid.predict(X_test)

lasso_grid_mae = mean_absolute_error(y_test, lasso_y_grid)

lasso_grid_rmse = np.sqrt(mean_squared_error(y_test, lasso_y_grid))

In [None]:
lasso_grid_mae

In [None]:
lasso_grid_rmse

### Best Lasso Submission

In [None]:
# Use the alpha actually selected by the grid search above rather than a
# value copied from an earlier run.
best_lasso_model = Lasso(alpha=lasso_grid.best_params_['alpha'])

In [None]:
best_lasso_model.fit(X_train, y_train)

In [None]:
best_lasso_y_pred = best_lasso_model.predict(X_test)

In [None]:
best_lasso_mae = mean_absolute_error(y_test, best_lasso_y_pred)
best_lasso_rmse = np.sqrt(mean_squared_error(y_test, best_lasso_y_pred))

In [None]:
best_lasso_mae

In [None]:
best_lasso_rmse

#### Submit to Kaggle

In [None]:
# Refit on ALL scaled training rows with the alpha selected by the grid search
# (instead of a value copied from an earlier run), then write the submission.
final_best_lasso = Lasso(alpha=lasso_grid.best_params_['alpha'])

final_best_lasso.fit(scaled_train, y)

best_lasso_sub = pd.DataFrame({'id': id_col,'target': final_best_lasso.predict(scaled_test)})

best_lasso_sub.to_csv("submission_best_lasso.csv", index=False)

### GridSearch CV for Ridge Regression

In [None]:
param_grid = {'alpha':[0.01, 0.05, 0.1, 1.0, 10.0],
              'solver':['auto', 'svd', 'cholesky', 'lsqr', 'saga']}

best_ridge_model = Ridge(max_iter=1000000)

grid_ridge = GridSearchCV(best_ridge_model, param_grid, cv=10, 
                          scoring='neg_mean_squared_error', verbose=0)

grid_ridge.fit(X_train, y_train)

In [None]:
grid_ridge.best_params_

In [None]:
abs(grid_ridge.best_score_)

In [None]:
# Make predictions

In [None]:
ridge_grid_pred = grid_ridge.predict(X_test)

ridge_grid_mae = mean_absolute_error(y_test, ridge_grid_pred)
ridge_grid_rmse = np.sqrt(mean_squared_error(y_test, ridge_grid_pred))

In [None]:
ridge_grid_mae

In [None]:
ridge_grid_rmse

In [None]:
# No Submmission here

### Polynomial Regression

Running this code in a Kaggle notebook could use most of your allocated memory and the notebook may stop responding, so if you want to run it make sure you have enough memory. This algorithm didn't improve the root mean squared error when I ran it; besides, there is no chance of using the elbow method to find out whether any degree can yield better results.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [None]:
# poly_converter = PolynomialFeatures(degree=3, include_bias=False)
# poly_features = poly_converter.fit_transform(X)
    
# X_train, X_test, y_train, y_test = train_test_split(poly_features,
                                                    # y,
                                                    # test_size=0.3,
                                                    # random_state=42)

# poly_reg_model = LinearRegression()
# poly_reg_model.fit(X_train, y_train)

# poly_pred = poly_reg_model.predict(X_test)

# poly_mae = mean_absolute_error(y_test, poly_pred)
# poly_rmse = np.sqrt(mean_squared_error(y_test, poly_pred))


In [None]:
# poly_mae

In [None]:
# poly_rmse

#### Submit to Kaggle

In [None]:
# full_poly_reg = LinearRegression()
# full_poly_reg.fit(poly_features, y)

# poly_test_features = poly_converter.transform(test)

# poly_sub = pd.DataFrame({'id': id_col,'target': full_poly_reg.predict(poly_test_features)})

# poly_sub.to_csv("submission_poly.csv", index=False)

### GridSearch CV for SVR

This takes ages on my computer to find the right hyperparameters. Searching for the best parameters can be exhausting and in some cases can crash. So, purely because of the time and memory needed for the grid search, I created a smaller dataset to reduce computing time. I will also search only over C values in this case.

In [None]:
#from sklearn.svm import SVR

#svr_model = SVR()

#param_grid = {'C':[0.001, 0.1, 1, 10],
              #'max_iter':[1000, 10000]}

#grid_svr = GridSearchCV(svr_model, param_grid, cv=2,verbose=1)

#grid_svr.fit(X_train, y_train)

#grid_svr_pred = grid_svr.predict(X_test)

#grid_svr_mae = mean_absolute_error(y_test, grid_svr_pred)

#grid_svr_rmse = np.sqrt(mean_squared_error(y_test, grid_svr_pred))

In [None]:
#grid_svr_mae

In [None]:
#grid_svr_rmse

### Gradient Descent Algorithm for Regression 

I found out that linear regression algorithms are not suited for this dataset, and there is little improvement in rmse for these models. I also find out that algorithms using loss function are performing better, therefore I try XGBoost. The algorithm is an implementation of the gradient boosting ensemble algorithm for classification and regression.

#### SGDRegressor

In [None]:
from sklearn.linear_model import SGDRegressor

sgdr_model = SGDRegressor()

In [None]:
sgdr_mae, sgdr_rmse, sgdr_2r, sgdr_y_pred, sgdr_model = evaluateModel(sgdr_model)

#### KNeighborsRegressor

Some of the algorithms below run too long in a Kaggle notebook, so I am not going to run them again (no more time to do it).

In [None]:
from sklearn.neighbors import KNeighborsRegressor
knn_model = KNeighborsRegressor(n_neighbors=50)

In [None]:
knn_mae, knn_rmse, knn_r2, knn_y_pred, knn_model = evaluateModel(knn_model)

#### Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbr_model = GradientBoostingRegressor(n_estimators=200)

In [None]:
gbr_mae, gbr_rmse, gbr_r2, gbr_y_pred, gbr_model = evaluateModel(gbr_model)

This looks promising; let's tune the hyperparameters and check whether that improves our results.

#### GridSearch CV for Gradient Boosting Regressor

In [None]:
#param_grid = {"n_estimators":[300,1000], # previously [10,100,200,300]
              #"learning_rate":[0.01, 0.1, 1.0],
              #"max_features":["auto","sqrt"]}

#gr_boost_reg = GradientBoostingRegressor()

#grid_gbr_model = GridSearchCV(gr_boost_reg, param_grid,cv=2)

#grid_gbr_model.fit(X_train, y_train)

In [None]:
# grid_gbr_model.best_params_

In [None]:
# grid_gbr_model.best_score_

In [None]:
#gbr_cv_mae, gbr_cv_rmse, gbr_cv_r2, gbr_y_pred, grid_gbr_model = evaluateModel(grid_gbr)

#### Submit to Kaggle

In [None]:
# Squared-error loss is GradientBoostingRegressor's default. The explicit
# loss="ls" spelling was removed in recent scikit-learn releases (renamed to
# "squared_error"), so leaving the argument out works on old and new versions.
best_gbr = GradientBoostingRegressor(n_estimators=1000,
                                     learning_rate=0.1,
                                     max_features='sqrt')

gbr_cv_mae, gbr_cv_rmse, gbr_cv_r2, gbr_y_pred, grid_gbr_model = evaluateModel(best_gbr)


# full_data_gbr.fit(scaled_train, y)

In [None]:
# gbr_sub = pd.DataFrame({"id":id_col,
                        # "target":full_data_gbr.predict(scaled_test)})
    
# gbr_sub.to_csv("submission_gbr.csv", index=False)

Submission scored 0.70774, which gave position 727 of 1049 on the scoreboard on 19.01.2020. Not bad. Let's see if we can tune it better with slightly different hyperparameters.

1. MAE: 0.5956168406066421
2. RMSE: 0.7110476866137042


Hyperparameters found by GridSearch CV:

{'learning_rate': 0.1, 'loss': 'huber', 'n_estimators': 300}

#### GradientBoostingRegressor with best hyperparameters found with GridSearchCV

Submission scored 0.70565, which is an improvement on the previous score of 0.70774. More tuning yielded a small improvement and placed 706th on the scoreboard.

Hyperparameters:

{'learning_rate': 0.1,
 'loss': 'ls',
 'max_features': 'sqrt',
 'n_estimators': 1000}
 
Score:

1. MAE: 0.5937034963056688
2. RMSE: 0.7088659238713398

Good practice is to check the bias-variance trade-off by tuning, in this case, the "n_estimators" hyperparameter and keeping a record of the RMSE for both the test and train sets. This algorithm is fairly robust to overfitting, so a large number usually results in better performance.

### Random Forest Regressor

In [None]:
# from sklearn.ensemble import RandomForestRegressor

# rfc_model = RandomForestRegressor()

# rfc_model.fit(X_train, y_train)

# rfc_mae, rfc_rmse, rfc_r2, rfc_y_pred, rfc_model = evaluateModel(rfc_model)

In [None]:
# rfc_model.feature_importances_

In [None]:
#coef_ser = pd.Series(rfc_model.feature_importances_)
#coef_ser = coef_ser.sort_values(ascending=False)

#colors = list(reversed(px.colors.qualitative.Dark24))

#fig = go.Figure(go.Bar(
    #x = scaled_train.columns,
    #y = coef_ser.values,
    #text = coef_ser.values,
    #textposition = 'auto',
    #texttemplate = '%{value:,s}',
    #marker_color = colors,
    #orientation = 'v',
#))
#fig.update_traces(
   # marker_line_width = 1,
    #opacity = 0.8,
#)
#fig.update_layout(
    #title = "Feature importances via coefficients in LassoCV"
#)
#fig.show()

#### GridSearch CV for RandomForestRegressor

In [None]:
# param_grid = {"n_estimators":[300, 500, 1000],
              # "max_depth":[2,3,5],
              # "max_features":["auto","sqrt"]}

# rfr_model = RandomForestRegressor()

# rfr_grid_model = GridSearchCV(rfr_model,param_grid, cv=2)

# rfr_grid_mae, rfr_grid_rmse, rfr_grid_r2, rfr_grid_y, rfr_grid_model  = evaluateModel(grid_rfr)

In [None]:
# No improvements with this algorithm.

### Artificial Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam

In [None]:
ann_model = Sequential()


# first hidden layer (the input shape is inferred on the first call to fit)
ann_model.add(Dense(78, activation='relu', dtype='float32'))

# hidden layer
ann_model.add(Dense(78, activation='relu', dtype='float32'))
ann_model.add(Dropout(0.5))

# hidden layer
ann_model.add(Dense(78, activation='relu', dtype='float32'))
ann_model.add(Dropout(0.5))

# hidden layer
ann_model.add(Dense(38, activation='relu',dtype='float32'))
ann_model.add(Dropout(0.5))

# output layer: a single linear unit, the usual choice for regression.
# A ReLU here can get stuck outputting 0 (zero gradient) and never recover.
ann_model.add(Dense(1, dtype='float32'))

ann_model.compile(optimizer='adam', loss='mae')  # rmsprop, adam

In [None]:
ann_model.fit(x=X_train, y=y_train,
              validation_data=(X_test, y_test),
              batch_size=64, epochs=15,verbose=0)

In [None]:
losses = pd.DataFrame(ann_model.history.history)

In [None]:
losses.plot()

#### Evaluate the model

In [None]:
# ann_model_pred = ann_model.predict(X_test)

In [None]:
# Score the network trained above. evaluateModel() is not used here because it
# calls fit() again, which would train the network for an extra epoch (without
# validation data) before measuring it.
ann_y_pred = ann_model.predict(X_test).ravel()  # flatten the (n, 1) output to (n,)

ann_mae = mean_absolute_error(y_test, ann_y_pred)
ann_rmse = np.sqrt(mean_squared_error(y_test, ann_y_pred))
ann_r2 = r2_score(y_test, ann_y_pred)

print(f"MAE: {ann_mae}")
print(f"RMSE: {ann_rmse}")
print(f"R-square: {ann_r2}")

I believe there is still room for improvement through better parameter tuning, but that is not the purpose of this notebook; for now my winner is the Gradient Boosting Regressor. Finally, I would like to try one last algorithm, XGBoost.


XGBoost (Extreme Gradient Boosting) belongs to a family of boosting algorithms and uses the gradient boosting (GBM) framework at its core. It is an optimized distributed gradient boosting library. XGBoost is well known to provide better solutions than other machine learning algorithms. In fact, since its inception, it has become the "state-of-the-art" machine learning algorithm for dealing with structured data.

In [None]:
import xgboost as xgb

In [None]:
data_dmatrix = xgb.DMatrix(data=scaled_train,
                           label=y)

In [None]:
data_dmatrix

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 500)

xgb_mae, xgb_rmse, xgb_r2, xgb_y_pred, xgb_model = evaluateModel(xg_reg)

In [None]:
# Create the axes at the desired size first; setting rcParams after
# plot_importance() has drawn the figure had no effect on it.
fig, ax = plt.subplots(figsize=(5, 5))
xgb.plot_importance(xg_reg, ax=ax)
plt.show()

#### Submit to Kaggle

In [None]:
# xgb_final = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
               #  max_depth = 5, alpha = 10, n_estimators = 500)

# xgb_final.fit(scaled_train.values,y)

In [None]:
# xgb_pred = xgb_final.predict(scaled_test.values)

# xgb_sub = pd.DataFrame({"id":id_col, "target":xgb_pred})
# xgb_sub.to_csv("submission_xgboost.csv", index=False)

This is great. Submission scored 0.70400, which is an improvement of your previous score of 0.70565.
654 position on Kaggle scoreboard on 20.01.2020. Let's see if we can tune our model and get better results.

In [None]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

In [None]:
cv_results.describe().T

#### XGBoost tunning hyperparameters

In [None]:
#param_grid = {'nthread':[1], #when use hyperthread, xgboost may become slower
              #'objective':['reg:squarederror'],
              #'learning_rate': [.03, 0.05, .07], #so called `eta` value
              # 'max_depth': [5, 6, 7],
              #'alpha':[0.1, 1, 10],
              #'colsample_bytree': [0.3, 0.7],
              #'n_estimators': [500, 1000]}

#xgb1 = xgb.XGBRegressor()

#xgb_grid_model = GridSearchCV(xgb1, 
                              #param_grid,
                              #cv=2,
                              #verbose=1)

#xgb_grid_mae , xgb_grid_rmse, xgb_grid_r2, xgb_grid_y, xgb_grid_model = evaluateModel(xgb_grid_model)

In [None]:
# xgb_grid.best_params_

#### Kaggle Submission

In [None]:
xgb_best_model = xgb.XGBRegressor(n_estimators=1000,
                                  learning_rate=0.03, 
                                  colsample_bytree=0.3, 
                                  alpha=10)

xgb_best_mae , xgb_best_rmse, xgb_best_r2, xgb_best_y, xgb_best_model = evaluateModel(xgb_best_model)

In [None]:
xgb_best_model.fit(scaled_train, y)

In [None]:
xgb_best_sub = pd.DataFrame({"id": id_col,
                             "target": xgb_best_model.predict(scaled_test)})

In [None]:
xgb_best_sub.to_csv("submission_xgb_best.csv", index=False)

Submission scored 0.70303, which is an improvement of your previous score of 0.70400. 

Kaggle scoreboard position:  628

In [None]:

from sklearn.ensemble import VotingRegressor

# VotingRegressor clones and refits every estimator it is given, so only the
# hyperparameters carry over. GridSearchCV.estimator is the *untuned* template
# that was passed to the search; best_estimator_ carries the selected
# alpha / solver.
ensemble_model = VotingRegressor(estimators=[
    ("svr_base", svr_model),
    ("lasso_gr", lasso_grid.best_estimator_),
    ("ridge_gr", grid_ridge.best_estimator_),
    ("sgdr_base", sgdr_model),
    ("knn_base", knn_model),
    ("gbr_base", gbr_model),
    ("gbr_best", grid_gbr_model),
    ("xgb_best", xgb_best_model),])

In [None]:
ensemble_model.fit(scaled_train, y)

In [None]:
ensemble_pred = ensemble_model.predict(scaled_test)

ensemble_sub = pd.DataFrame({"id":id_col,
                             "target": ensemble_pred})

ensemble_sub.to_csv("ensemble_tb_series_sub.csv", index=False)

**StackingRegressor**

In [None]:
from sklearn.ensemble import StackingRegressor

In [None]:
reg = StackingRegressor(estimators=[("sgdr_base", sgdr_model),
                                    ("knn_base", knn_model),
                                    ("gbr_best", grid_gbr_model)],
                                    final_estimator = xgb_best_model)


In [None]:
reg.fit(scaled_train, y)

In [None]:
reg_sub = pd.DataFrame({"id":id_col,
                        "target": reg.predict(scaled_test)})

reg_sub.to_csv("stacking_reg_tps_sub.csv", index=False)

### Table with models and their score

In [None]:
# Collect the hold-out RMSE of every model evaluated above.
# The names match the variables actually assigned in the notebook (the
# previous lasso_rmse / ridge_rmse / xgb_grid_rmse were never defined).
# The random-forest cells are commented out, so their scores fall back to NaN
# unless those cells are re-enabled and run.
rmse_score_df = pd.DataFrame({
    "Model": ["LassoCV","RidgeCV","LinearSVR","Lasso GridSearchCV","Ridge GridSearchCV",
              "SGDRegressor","KNeighborsRegressor","GradientBoostingRegressor",
              "GradientBoostingRegressor GridSearchCV","RandomForestRegressor",
              "RandomForestRegressor GridSearchCV","ANN Regression","XGBRegressor",
              "XGBRegressor GridSearchCV"],
    "RMSE":[la_cv_rmse, rid_cv_rmse, svr_rmse,
            lasso_grid_rmse, ridge_grid_rmse,
            sgdr_rmse, knn_rmse, gbr_rmse,
            gbr_cv_rmse,
            globals().get("rfc_rmse", np.nan),
            globals().get("rfr_grid_rmse", np.nan),
            ann_rmse, xgb_rmse, xgb_best_rmse]})

In [None]:
# Best (lowest RMSE) model first, with a clean 0..n-1 index
rmse_score_df = (
    rmse_score_df
    .sort_values(by=['RMSE'], ascending=True)
    .reset_index(drop=True)
)
rmse_score_df

## Conclusion

It seems to me that, for this particular dataset, gradient boosting and decision-tree algorithms perform much better than linear models. I also realize that there might be more improvement if you go deeper into the hyperparameters and tune the chosen model. Another suggestion could be dealing with correlated features. One popular dimensionality-reduction method in ML is Principal Component Analysis (PCA), a technique for finding patterns in high-dimensional data.

I am sure there is more to explore to make predictions which would result with lower RMSE, but it is beyond my scope for now. There is still so much to learn...

After this challenge I found more questions than answers. One main question remains: "What else could I do to improve the model score?" If you have any suggestions, please leave feedback.

Be aware that if you want to run this notebook it will take about 4 hours, even without the models I have commented out.

I hope you find this notebook interesting.

In [None]:
pip install pytorch-tabnet==3.1.0

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

In [None]:
# Load the data
# X = train_df.drop(columns=['target']).values
# sc = StandardScaler()
# x=sc.fit_transform(x)
# y = train_df['target'].values.reshape(-1, 1)

In [None]:
# Train a TabNet regressor on the scaled hold-out split created earlier.
# The arrays are built here because p_X_train / p_X_val / y_val were never
# defined. TabNet is fed plain NumPy arrays with a 2-D (n_samples, 1) target.
p_X_train = np.asarray(X_train)
p_X_val = np.asarray(X_test)
p_y_train = np.asarray(y_train).reshape(-1, 1)
y_val = np.asarray(y_test).reshape(-1, 1)

re = TabNetRegressor()
re.fit(
    p_X_train, p_y_train,
    eval_set=[(p_X_val, y_val)],
    eval_name=['valid'],  # this is the validation split, not the training data
    eval_metric=['rmsle', 'mae', 'rmse', 'mse'],
    max_epochs=1000,
    patience=50,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)
preds = re.predict(p_X_val)

In [None]:
len(preds)

In [None]:
# The value is the ROOT mean squared error, so label it as such.
# np.sqrt(...) is used instead of squared=False, which newer scikit-learn
# releases no longer accept.
print("Test RMSE:", np.sqrt(mean_squared_error(y_val, preds)))

In [None]:
preds

In [None]:
# Predict on the scaled test features (p_test was never defined) and flatten
# the (n, 1) output into a 1-D column for the submission file.
preds = re.predict(scaled_test.values).reshape(-1)
submission = pd.DataFrame({"id":id_col,
                          "target":preds})
submission.to_csv('submission_pytorch.csv', index = False)

## LGBM model

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
cont_features = [col for col in train_df.columns if col.startswith("cont")]
len(cont_features)

In [None]:
# 5-fold cross-validated LightGBM on the continuous features.
y = train_df["target"]
kf = KFold(n_splits=5, shuffle=True, random_state=1)
oof = np.zeros(len(train_df))   # out-of-fold predictions, filled fold by fold
score_list = []                 # per-fold validation RMSE
fold = 1
test_preds = []                 # one test-set prediction vector per fold/seed

# Hyperparameters shared by every fold; only the seed varies.
base_params = {"objective": "regression",
               "metric": "rmse",
               "verbosity": -1,
               "boosting_type": "gbdt",
               "feature_fraction": 0.5,
               "num_leaves": 200,
               "lambda_l1": 2,
               "lambda_l2": 2,
               "learning_rate": 0.01,
               'min_child_samples': 50,
               "bagging_fraction": 0.7,
               "bagging_freq": 1}

# `test` from the earlier cell is a bare ndarray and cannot be indexed by
# column name, so take the features from test_df. abs() is applied so the test
# data is transformed the same way as the training folds below.
test_features = test_df[cont_features].abs()

for train_index, test_index in kf.split(train_df):
    X_train, X_val = train_df.iloc[train_index], train_df.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    # Apply the same transform to both splits; previously abs() was applied
    # (twice) to the training fold only, so train and validation differed.
    X_train = X_train.abs()
    X_val = X_val.abs()

    y_pred_list = []
    for seed in [1]:
        dtrain = lgbm.Dataset(X_train[cont_features], y_train)
        dvalid = lgbm.Dataset(X_val[cont_features], y_val)
        print(seed)
        params = {**base_params, "seed": seed}
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older LightGBM releases; newer releases appear to expect
        # callbacks instead. Confirm against the installed version.
        model = lgbm.train(params,
                        dtrain,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=100,
                        num_boost_round=100000,
                        early_stopping_rounds=100
                    )

        y_pred_list.append(model.predict(X_val[cont_features]))
        test_preds.append(model.predict(test_features))

    # Average over seeds once all of them have been trained for this fold
    oof[test_index] = np.mean(y_pred_list, axis=0)
    score = np.sqrt(mean_squared_error(y_val, oof[test_index]))
    score_list.append(score)
    print(f"RMSE Fold-{fold} : {score}")
    fold += 1

np.mean(score_list)

In [None]:
y = train["target"]
kf = KFold(n_splits=5, shuffle=True, random_state=1)
oof = np.zeros(len(train))
score_list = []
fold = 1
test_preds = []


for train_index, test_index in kf.split(train):
    X_train, X_val = train.iloc[train_index], train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    

    X_train = X_train.abs()
