In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
input_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Outline of this notebook:

## Exploratory Data Analysis (EDA): 
Exploration of the dataset through visualization and analysis to summarize its main characteristics and give us insight into the dataset.

* [Quick Peak](#Quick-Peak)
* [Quantitative Data & Qualitative Data](#Quantitative-Data-&-Qualitative-Data)
* [Missing Values](#Missing-Value)
* [Multicollinearity](#Multicollinearity)

## [Data Processing](#Data-Processing): 
Manipulate the explored and analyzed data to convert it into meaningful information that can be used by the models or estimators.

* [Imputing Missing Values](#Imputing-Missing-Values)
* [Normalization of Dependant Variable (revenue)](#Normalization-of-Dependant-Variable-(revenue))
* [Adding New Features](#Adding-New-Features)
* [Eliminating Biased Features](#Eliminating-Biased-Features)
* [Create Dummy Variables](#Create-Dummy-Variables)

## [Models Building](#Model-Building):
****Find the optimised parameters for all the models. Evaluate the high-performance models and use them for ensemble regression****

* [Train Test Split](#Train-Test-Split)
* [GridSearchCV Best Parameters for All Models](#GridSearchCV-Best-Parameters-for-All-Models)
* [Evaluate All Optimized Estimators](#Evaluate-All-Optimized-Estimators)
* [Ensemble Models with VotingRegressor](#Ensemble-Models-with-VotingRegressor)
* [Submission](#Submission)

These models are:

1. Ridge
1. Lasso
1. Elastic Net
1. HuberRegressor
1. RandomForestRegressor
1. ExtraTreeRegressor
1. BaggingRegressor
1. XGBRegressor
1. DecisionTreeRegressor
1. AdaBoostRegressor
1. GradientBoostRegressor

In [None]:
### import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #Visualization packages
import missingno as msno
import seaborn as sns
import warnings
warnings.filterwarnings('ignore') ## ignore all warnings

In [None]:
## Load the train and test datasets from the zipped CSV files in the competition input folder
df_train = pd.read_csv("/kaggle/input/restaurant-revenue-prediction/train.csv.zip")
df_test = pd.read_csv("/kaggle/input/restaurant-revenue-prediction/test.csv.zip")

### Quick Peak

In [None]:
## Take a quick peek at the first rows of the train data
df_train.head()

In [None]:
## Shape of the train data as (rows, columns)
df_train.shape

The train data has a total of 137 rows and 43 columns. The "revenue" feature is this dataset's dependent variable; the rest of the features are independent variables, except the "Id" feature (we will remove it later).

In [None]:
## Take a quick peek at the first rows of the test data
df_test.head()

In [None]:
## Shape of the test data as (rows, columns)
df_test.shape

The test data has a total of 100000 rows and 42 columns.

In [None]:
## All features of the train data: column names, dtypes and non-null counts
df_train.info()

### Data Description (Referred from [here](https://www.kaggle.com/c/restaurant-revenue-prediction/data))
* Id : Restaurant id. 
* Open Date : opening date for a restaurant
* City : City that the restaurant is in. Note that there are unicode in the names. 
* City Group: Type of the city. Big cities, or Other. 
* Type: Type of the restaurant. FC: Food Court, IL: Inline, DT: Drive Thru, MB: Mobile
* P1, P2 - P37: There are three categories of these obfuscated data. Demographic data are gathered from third party providers with GIS systems. These include population in any given area, age and gender distribution, development scales. Real estate data mainly relate to the m2 of the location, front facade of the location, car park availability. Commercial data mainly include the existence of points of interest including schools, banks, other QSR operators.
* Revenue: The revenue column indicates a (transformed) revenue of the restaurant in a given year and is the target of predictive analysis. Please note that the values are transformed so they don't mean real dollar values. 

In [None]:
## The "Id" column is only a row identifier, so remove it from the train data
df_train = df_train.drop(columns=["Id"])

### Quantitative Data & Qualitative Data
1. Quantitative data is the type of data ****whose value is measured in the form of numbers or counts, with a unique numerical value associated with each data point****
1. Qualitative data is defined as data that ****approximates and characterizes**** (it describes categories or qualities rather than measured amounts)

In [None]:
## Split the columns by dtype: object-typed columns are qualitative, everything else quantitative.
## The builtin `object` is used because the `np.object` alias no longer exists in recent NumPy
## releases; both compare identically against a pandas dtype.
quantitative_feats = [col for col in df_train if df_train[col].dtype != object]
qualitative_feats = [col for col in df_train if df_train[col].dtype == object]

print("Quantitative_features: {}".format(quantitative_feats))
print("Qualtative_features: {}".format(qualitative_feats))

There are a total of 38 quantitative features and 4 qualitative features.

In [None]:
## Take the dependent variable out of the quantitative features.
## Filtering (instead of list.remove) keeps the cell safe to re-run: a second run is a no-op
## rather than a ValueError.
quantitative_feats = [feat for feat in quantitative_feats if feat != "revenue"]

## Take "Open Date" (temporal variable) out of the qualitative features
qualitative_feats = [feat for feat in qualitative_feats if feat != "Open Date"]

### Temporal variable
The Open Date feature presents its data in the form MM/DD/YYYY (for example: 01/22/2011); we extract the months and years using the code below.

In [None]:
## Dependent variable: keep the raw revenue column handy for the plots below
revenue = df_train["revenue"]

In [None]:
## months extraction: the month is the first "/"-separated part of "Open Date"
month = pd.DataFrame({
    "months": df_train["Open Date"].str.split("/").str[0].astype(int),
    "revenue": revenue,
})

## median revenue per opening month
revenue_month = month.groupby("months")["revenue"].median()


### Visualization
fig, ax = plt.subplots(2,1,sharey=False,figsize=(16,9),constrained_layout=True)

## Number of restaurants per opening month.
## Data is passed by keyword because newer seaborn releases reject positional x/y arguments.
bar_1 = sns.countplot(x=month["months"],ax=ax[0],color="green")
for patch in bar_1.patches: # label each rectangle with its height
    height = patch.get_height()
    ax[0].text(patch.get_x()+patch.get_width()/2,height*1.02,height,ha="center",fontsize=9)
ax[0].set_title("Revenue Count",fontsize=14)

## Median revenue per opening month
bar_2 = sns.barplot(x=revenue_month.index,y=revenue_month.values,ax=ax[1],color="red")
for patch in bar_2.patches: # label each rectangle with its height
    height = patch.get_height()
    ax[1].text(patch.get_x()+patch.get_width()/2,height*1.02,height,ha="center",fontsize=10)
ax[1].set_title("Median Revenue",fontsize=14)

From the median revenue chart, we can see that September month has the highest revenue while July month has the lowest revenue. 

In [None]:
## years extraction: the year is the last "/"-separated part of "Open Date"
year = pd.DataFrame({
    "years": df_train["Open Date"].str.split("/").str[-1].astype(int),
    "revenue": revenue,
})

## median revenue per opening year
revenue_year = year.groupby("years")["revenue"].median()

### Visualization
fig, ax = plt.subplots(2,1,sharey=False,figsize=(16,9),constrained_layout=True)

## Number of restaurants per opening year.
## Data is passed by keyword because newer seaborn releases reject positional x/y arguments.
bar_1 = sns.countplot(x=year["years"],ax=ax[0],color="green")
for patch in bar_1.patches: # label each rectangle with its height
    height = patch.get_height()
    ax[0].text(patch.get_x()+patch.get_width()/2,height*1.02,height,ha="center",fontsize=10)
ax[0].set_title("Revenue Count",fontsize=14)
ax[0].set_ylabel("Number of Opening")

## Median revenue per opening year
bar_2 = sns.barplot(x=revenue_year.index,y=revenue_year.values,ax=ax[1],color="red")
for patch in bar_2.patches: # label each rectangle with its height
    height = patch.get_height()
    ax[1].text(patch.get_x()+patch.get_width()/2,height*1.02,height,ha="center",fontsize=11)
ax[1].set_title("Median Revenue",fontsize=14)
ax[1].set_ylabel("Revenue")

From the median revenue chart, we can see that year 2000 has the highest revenue while year 2013 has the lowest revenue. 

Two types of Quantitative data:

1. Discrete data: Numerical data that has specific values. A great example would be the number of dogs. The number of dogs  are counted as 1 dog, 2 dogs, 3 dogs. There is no such thing as 0.5 dog.
1. Continuous data: Numerical data that can take on any values. A great example would be the the height of a person. Donald is 6 foot or 182.88cm tall. 

In [None]:
## Discrete data: quantitative features with at most 25 distinct values
unique_counts = df_train[quantitative_feats].nunique()
discrete_feats = unique_counts[unique_counts <= 25].index.tolist()

discrete_feats

In [None]:
## Visualization: one scatter plot of revenue against each discrete feature.
## x/y are passed by keyword because newer seaborn releases reject positional data arguments.
for feat in discrete_feats:
    fig, ax = plt.subplots(constrained_layout=True)
    sns.scatterplot(x=df_train[feat],y=df_train["revenue"],ax=ax)

There's no obvious correlation between the discrete features with the revenue.

In [None]:
## Continuous data: quantitative features with more than 25 distinct values
distinct_per_feat = df_train[quantitative_feats].nunique()
continuous_feats = distinct_per_feat[distinct_per_feat > 25].index.tolist()

continuous_feats

It seems like this dataset doesn't contain continuous features.

### Categorical Data

In [None]:
## For every qualitative feature draw two stacked bar charts:
## the number of restaurants per category (top) and the median revenue per category (bottom).
for feat in qualitative_feats:
    groupby_feat = df_train.groupby(feat)["revenue"].median()

    ## Visualization
    fig, ax = plt.subplots(2,1,figsize=(20,9),constrained_layout=True)

    ## Data is passed by keyword because newer seaborn releases reject positional x/y arguments
    bar_1 = sns.countplot(x=df_train[feat],ax=ax[0])
    for bar in bar_1.patches: # label each rectangle with its height
        height = bar.get_height()
        ax[0].text(bar.get_x()+bar.get_width()/2,height*1.02,height,ha="center")
    ax[0].set_title("Revenue Count ({})".format(feat))
    ax[0].set_ylabel("Count")
    ax[0].set_xticklabels(ax[0].get_xticklabels(),rotation=90)

    bar_2 = sns.barplot(x=groupby_feat.index,y=groupby_feat.values,ax=ax[1])
    for bar in bar_2.patches: # label each rectangle with its height
        height = bar.get_height()
        ax[1].text(bar.get_x()+bar.get_width()/2,height*1.02,height,ha="center")
    ax[1].set_title("Revenue based on {}".format(feat))
    ax[1].set_ylabel("Revenue")
    ax[1].set_xticklabels(ax[1].get_xticklabels(),rotation=90)
    


In the city variable visualization: 
* Elazig city has the highest median revenue while Kirklareli city has the lowest median revenue. 
* Istanbul and Ankara have 50 and 19 restaurant openings respectively, the rest of the cities have less than ten restaurant openings.

In the city group variable:
* Big cities have more restaurant openings compared to other city group.
* Big cities have higher revenue median compared to other city group.

In the type variable:
* Food court is the most popular restaurant type, followed by inline, with drive thru being the least popular.
* Inline restaurants have the highest median revenue compared to both drive thru and food court.

In [None]:
## Attach each restaurant's city to the opening-year / revenue frame.
## Plain column assignment keeps the cell re-runnable: concatenating again on a second run
## would add a duplicate "City" column and break the equality filter below.
year["City"] = df_train["City"]

## Select cities that has equal or more than 5 count of revenue
cities = ["İstanbul","Ankara","Bursa","İzmir","Samsun"]


## Visualization: one revenue-over-opening-year line chart per selected city
fig, ax = plt.subplots(5,1,figsize=(25,35),constrained_layout=True)
for num,city in enumerate(cities):
    df_city = year.loc[year["City"] == city, :]
    df_city = df_city.sort_values("years")

    ## Line plot (x/y by keyword: newer seaborn releases reject positional data arguments)
    sns.lineplot(x=df_city["years"],y=df_city["revenue"],ax=ax[num],linestyle="-")
    ax[num].set_title(city)
    for label in ax[num].get_xticklabels():
        label.set_rotation(90)

### Missing Value
Check for the presence of missing values in the dataset.

In [None]:
## Missing values: for each column of the train data, True if it contains at least one null
print(df_train.isnull().any())

Great News ! Looks like we don't have presence of missing value. 

### Multicollinearity
A heatmap is an excellent way to check the correlation between the independent variables.

In [None]:
## Correlation matrix of the numeric columns only, computed once and reused for mask and plot.
## Selecting the numeric columns explicitly avoids the error recent pandas raises when
## DataFrame.corr() meets string columns such as "City".
corr = df_train.select_dtypes(include="number").corr()

## Create a lower triangle heatmap: hide the upper triangle (diagonal included)
mask = np.zeros_like(corr,dtype=bool)
mask[np.triu_indices_from(mask)] = True

## Heatmap visualization
plt.figure(figsize=(30,20))
sns.heatmap(corr,
            annot=True,
            fmt=".3f",
            annot_kws = {"size":10},
            cmap=sns.cubehelix_palette(),
            mask=mask)

Multicollinearity is a problem because it undermines the statistical significance of an independent variable. As we can see from the heatmap, many independent variables are strongly correlated with each other. The code below checks the VIF (Variance Inflation Factor) of each feature.

Variance Inflation Factor: the VIF score of an independent variable represents how well the variable is explained by the other independent variables.
For a more detailed explanation of VIF, click [here](https://www.analyticsvidhya.com/blog/2020/03/what-is-multicollinearity/)

In [None]:
# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X, add_constant=False):
    """Compute the variance inflation factor (VIF) of every column of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature columns to evaluate.
    add_constant : bool, default False
        When True, a column of ones is prepended to the design matrix before the
        VIFs are computed, so each auxiliary regression includes an intercept.
        The default keeps the original behaviour (X is used exactly as given).
        NOTE(review): statsmodels does not appear to add an intercept on its own,
        so without one the scores can be inflated — confirm against its docs.

    Returns
    -------
    pandas.DataFrame
        One row per column of X with the columns "variables" and "VIF".
    """
    exog = X.values
    offset = 0
    if add_constant:
        # Column 0 becomes the intercept, so feature i moves to position i + 1
        exog = np.column_stack([np.ones(len(X)), exog])
        offset = 1

    return pd.DataFrame({
        "variables": X.columns,
        "VIF": [variance_inflation_factor(exog, i + offset) for i in range(X.shape[1])],
    })

## Calculate the VIF of the quantitative features
vif = calc_vif(df_train[quantitative_feats])

## A VIF above 5 or 10 indicates high multicollinearity between a feature and the others;
## the threshold of 10 is used here, and the result is listed from highest to lowest VIF
exceeds_threshold = vif["VIF"].gt(10)
high_vif = vif.loc[exceeds_threshold].sort_values(by="VIF", ascending=False)

high_vif

In [None]:
## Share of high-multicollinearity features in the dataset, printed as a fraction (not multiplied by 100).
## NOTE(review): the denominator is every column of df_train, while high_vif was computed from
## df_train[quantitative_feats] only — confirm this is the intended denominator.
print(len(high_vif)/df_train.shape[1])

85.71% of the features show high multicollinearity!

Fixing the multicollinearity of these features is not possible because the data description of the P1, P2 ... P37 features is insufficient.

## Data Processing
Combine both train and test together for data processing

In [None]:
## Dependent variable: keep it aside and remove it from the train features
y = df_train["revenue"]
df_train = df_train.drop(columns=["revenue"])

## Stack train on top of test, then drop the "Id" column that comes with the test data
df_all = pd.concat([df_train, df_test], axis=0).drop(columns=["Id"])

## check on the shape
df_all.shape

### Imputing Missing Values
Check the combined dataset for missing values, since so far we have only checked the train dataset.

In [None]:
## Missing value
def missing_value(df):
    """Summarise the missing values of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, ordered from most
        to fewest missing, with the columns "Total" (number of missing values)
        and "Percentage" (share of rows that are missing, in percent).
        The frame is empty when nothing is missing.
    """
    ## number of missing values per column, counted once and reused below
    number = df.isnull().sum()
    # stable sort so that columns with equal counts keep their original order
    number = number[number > 0].sort_values(ascending=False, kind="mergesort")

    ## percentage of missing values, derived from the same filtered and ordered counts
    ## so both columns are guaranteed to line up
    percentage = number * 100 / df.shape[0]

    return pd.concat([number,percentage],axis=1,keys=["Total","Percentage"])

## Missing-value summary of the combined train + test data
missing_value(df_all)

Great! It looks like there are no missing values in the test dataset either.

## Normalization of Dependant Variable (revenue)

In [None]:
## import packages
import matplotlib.gridspec as gridspec
import scipy.stats as stats

## Visualization: a 4x3 grid with the histogram across the top row, the probability plot
## in the bottom-left and the boxplot in the bottom-right (the second row stays empty)
fig = plt.figure(figsize=(12,8),constrained_layout=True)
grid = gridspec.GridSpec(ncols=3,nrows=4,figure=fig)
ax1 = fig.add_subplot(grid[0,:])
ax2 = fig.add_subplot(grid[2:,:2])
ax3 = fig.add_subplot(grid[2:,2])

# Histogram of the raw revenue
sns.distplot(y,ax=ax1)
ax1.set_title("Histrogram of revenue",fontsize=10)

# Probability plot of the revenue drawn by scipy
stats.probplot(y,plot=ax2)
ax2.set_title("QQplot of revenue")

# Vertical boxplot to expose outliers
sns.boxplot(y,ax=ax3,orient="v")
ax3.set_title("Boxplot of revenue")

plt.show()


1. ****Skewness****: Defined as the degree of distortion from the symmetrical bell curve or the normal curve.
1. ****Kurtosis****: Defined as the measure of the extreme values (also known as outliers) present in the distribution.

In [None]:
## Report the kurtosis and the skewness of revenue
for label, value in (("Kurtosis", y.kurt()), ("Skewness", y.skew())):
    print("{}: {}".format(label, value))

The three graphs above show us:

* The revenue is drawn from a normal distribution
* The revenue is right skewed / positively skewed, which indicates that most of the restaurants earn less.
* There are some outliers in revenue

As indicated in the three charts above, revenue is positively skewed. Revenue is drawn from a leptokurtic distribution (a distribution with heavier tails and a greater profusion of outliers).

Code below normalize the dependant variable.

In [None]:
## Normalization: log(1 + revenue) to reduce the right skew.
## The transform is applied to the untouched `revenue` series rather than to `y` itself,
## so re-running this cell cannot stack one log transform on top of another.
y = np.log1p(revenue)

## Visualization
fig, ax = plt.subplots(1,2,constrained_layout=True,figsize=(12,8))

## Histogram of the transformed revenue
sns.distplot(y,ax=ax[0])
ax[0].set_title("Histrogram of Normalized revenue",fontsize=10)

## Probability plot of the transformed revenue
stats.probplot(y,plot=ax[1])
ax[1].set_title("Proability Plot of Normalized revenue",fontsize=10)

plt.show()

In [None]:
## Report the kurtosis and the skewness of the normalized revenue
for label, value in (("Kurtosis", y.kurt()), ("Skewness", y.skew())):
    print("{}: {}".format(label, value))

****Great!**** Our revenue feature has been normalized (log-transformed).

## Adding New Features

In [None]:
## Break "Open Date" into its "/"-separated parts once, then derive the three features
date_parts = df_all["Open Date"].str.split("/")

## OpenMonth: first part
df_all["OpenMonth"] = date_parts.str[0].astype(int)

## OpenYear: last part
df_all["OpenYear"] = date_parts.str[-1].astype(int)

## Open Day: second part
df_all["Open Day"] = date_parts.str[1].astype(int)

## The raw date column is no longer needed
df_all = df_all.drop(columns=["Open Date"])

## Eliminating Biased Features

In [None]:
## Bias feature detector: a feature is flagged when its most frequent value
## accounts for more than 99.94% of all rows.
## NOTE(review): the flagged features are only listed here, not dropped from df_all —
## confirm whether removing them was intended.
bias_feat = [
    feat
    for feat in df_all.columns
    if df_all[feat].value_counts().iloc[0] * 100 / len(df_all) > 99.94
]

bias_feat

## Create Dummy Variables

In [None]:
## Dummy variables: one-hot encode the categorical columns, then reset the index,
## which still carries the separate train/test row labels from the concatenation
df_all = pd.get_dummies(df_all).reset_index(drop=True)

In [None]:
## Split the combined data back into train and test: the first len(y) rows are the
## train rows, everything after them is the test data
n = len(y)
df_train, df_test = df_all.iloc[:n], df_all.iloc[n:]

## Check on their shapes
for label, frame in (("train", df_train), ("test", df_test)):
    print("Shape of {} dataset: {}".format(label, frame.shape))

## Model Building
### Train Test Split
The dataset is split into X_train, X_test, y_train and y_test.

In [None]:
## import necessary package
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Hold out 33% of the labelled rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df_train, y, test_size=0.33, random_state=42
)


## Check on the dataset shape
print("Shapes: ", *(part.shape for part in (X_train, X_test, y_train, y_test)))

## GridSearchCV Best Parameters for All Models

In [None]:
## Create an empty list that collects one [name, best estimator] pair per tuned model.
## Each model cell below appends to it, so re-run this cell first before re-running any of them.
best_estimators = []

In [None]:
## import necessary packages
from sklearn.model_selection import GridSearchCV

## models packages
from sklearn.linear_model import Lasso, Ridge, ElasticNet, HuberRegressor, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, VotingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgbm

### Ridge 

In [None]:
## Parameters
## "normalize" is deliberately not searched: the argument was deprecated in scikit-learn 1.0
## and removed in 1.2, where the grid search rejects it as an invalid parameter.
## Feature scaling is handled by the StandardScaler pipeline step used for evaluation.
params = {
    "alpha" : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20],
    "fit_intercept" : [True, False],
    "solver" : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    "tol" : [0.0001, 0.001, 0.01, 0.1],
    "random_state" : [42]
}

## Ridge: exhaustive search scored by R^2 with 7-fold cross validation
ridge = Ridge()
ridge_grid = GridSearchCV(ridge, params, scoring='r2', cv=7, n_jobs=-1)
ridge_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(ridge_grid.best_params_))
print("Best score: {}".format(ridge_grid.best_score_))

## Append to list
best_estimators.append(["Ridge",ridge_grid.best_estimator_])

### Lasso

In [None]:
## Parameters
## "normalize" is deliberately not searched: the argument was deprecated in scikit-learn 1.0
## and removed in 1.2, where the grid search rejects it as an invalid parameter.
## Feature scaling is handled by the StandardScaler pipeline step used for evaluation.
params = {
    'alpha' : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20],
    'fit_intercept' : [True, False],
    'tol' : [0.0001, 0.001, 0.01, 0.1],
    "random_state" : [42]
}

## Lasso: exhaustive search scored by R^2 with 7-fold cross validation
lasso = Lasso()
lasso_grid = GridSearchCV(lasso, params, scoring='r2', cv=7, n_jobs=-1)
lasso_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(lasso_grid.best_params_))
print("Best score: {}".format(lasso_grid.best_score_))

## Append to list
best_estimators.append(["Lasso",lasso_grid.best_estimator_])

### Elastic Net

In [None]:
## Parameters
## "normalize" is deliberately not searched: the argument was deprecated in scikit-learn 1.0
## and removed in 1.2, where the grid search rejects it as an invalid parameter.
## Feature scaling is handled by the StandardScaler pipeline step used for evaluation.
params = {
    "alpha" : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20],
    "fit_intercept" : [True, False],
    "tol" : [0.0001, 0.001, 0.01, 0.1],
    "random_state" : [42]
}

## Elastic Net: exhaustive search scored by R^2 with 7-fold cross validation
EL = ElasticNet()
EL_grid = GridSearchCV(EL, params, scoring='r2', cv=7, n_jobs=-1)
EL_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(EL_grid.best_params_))
print("Best score: {}".format(EL_grid.best_score_))

## Append to list
best_estimators.append(["ElasticNet",EL_grid.best_estimator_])

### HuberRegressor

In [None]:
## Search space for the HuberRegressor
params = dict(
    alpha=[.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20],
    fit_intercept=[True, False],
    tol=[0.0001, 0.001, 0.01, 0.1],
    max_iter=[100, 300, 500],
)

## HuberRegressor: exhaustive search scored by R^2 with 7-fold cross validation
Huber_R = HuberRegressor()
Huber_R_grid = GridSearchCV(estimator=Huber_R, param_grid=params, scoring='r2', cv=7, n_jobs=-1)
Huber_R_grid.fit(X_train, y_train)

## Report the winning combination and its score
for template, value in (
    ("Best parameters:  {}:", Huber_R_grid.best_params_),
    ("Best score: {}", Huber_R_grid.best_score_),
):
    print(template.format(value))

## Keep the refitted best model for the evaluation step
best_estimators.append(["HuberRegressor", Huber_R_grid.best_estimator_])

### RandomForestRegressor

In [None]:
## Parameters
## - max_depth uses the real None (unlimited depth); the string "None" is not a valid depth
##   and makes every fit that uses it fail.
## - max_features uses 1.0 (all features) instead of "auto", which meant the same thing for
##   regressors but was removed in scikit-learn 1.3.
params = {
    "max_depth": [None,10, 30, 50, 75, 100],
    "max_features": [1.0,0.3, 0.6],
    "min_samples_leaf": [1,3,5,7],
    "min_samples_split": [2, 4, 8, 12],
    "n_estimators": [30, 50, 100, 200],
    "random_state" : [42]
}

## RandomForestRegressor: exhaustive search scored by R^2 with 7-fold cross validation
RFR = RandomForestRegressor()
RFR_grid = GridSearchCV(RFR, params, scoring='r2', cv=7, n_jobs=-1)
RFR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(RFR_grid.best_params_))
print("Best score: {}".format(RFR_grid.best_score_))

## Append to list
best_estimators.append(["RandomForestR",RFR_grid.best_estimator_])

### ExtraTreesRegressor

In [None]:
## Parameters
## - max_depth uses the real None (unlimited depth); the string "None" is not a valid depth
##   and makes every fit that uses it fail.
## - max_features uses 1.0 (all features) instead of "auto", which meant the same thing for
##   regressors but was removed in scikit-learn 1.3.
params = {
    "max_depth": [None,10, 30, 50, 75, 100],
    "max_features": [1.0,.3, .4, .5, .6],
    "min_samples_leaf": [1,3,5,7],
    "min_samples_split": [2, 4, 8, 12],
    "n_estimators": [30, 50, 100, 200],
    "random_state" : [42]
}

## ExtraTreesRegressor: exhaustive search scored by R^2 with 7-fold cross validation
ETR = ExtraTreesRegressor()
ETR_grid = GridSearchCV(ETR, params, scoring='r2', cv=7, n_jobs=-1)
ETR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(ETR_grid.best_params_))
print("Best score: {}".format(ETR_grid.best_score_))

## Append to list
best_estimators.append(["ExtraTreesR",ETR_grid.best_estimator_])

### BaggingRegressor

In [None]:
## Search space for the BaggingRegressor
params = dict(
    max_features=[0.2, 0.4, 0.6, 1.0],
    n_estimators=[5, 10, 15, 20],
    random_state=[42],
)

## BaggingRegressor: exhaustive search scored by R^2 with 7-fold cross validation
BR = BaggingRegressor()
BR_grid = GridSearchCV(estimator=BR, param_grid=params, scoring='r2', cv=7, n_jobs=-1)
BR_grid.fit(X_train, y_train)

## Report the winning combination and its score
for template, value in (
    ("Best parameters:  {}:", BR_grid.best_params_),
    ("Best score: {}", BR_grid.best_score_),
):
    print(template.format(value))

## Keep the refitted best model for the evaluation step
best_estimators.append(["BaggingRegressorR", BR_grid.best_estimator_])

### XGBoost Regressor

In [None]:
## Search space for the XGBRegressor
params = dict(
    learning_rate=[.1, .5, .7, .9, .95, .99, 1],
    colsample_bytree=[.3, .4, .5, .6],
    max_depth=[2, 4],
    alpha=[1, 3, 5],
    subsample=[.5],
    n_estimators=[30, 70, 100, 200],
    random_state=[42],
)

## XGBoost Regressor: exhaustive search scored by R^2 with 7-fold cross validation
XGBR = XGBRegressor()
XGBR_grid = GridSearchCV(estimator=XGBR, param_grid=params, scoring='r2', cv=7, n_jobs=-1)
XGBR_grid.fit(X_train, y_train)

## Report the winning combination and its score
for template, value in (
    ("Best parameters:  {}:", XGBR_grid.best_params_),
    ("Best score: {}", XGBR_grid.best_score_),
):
    print(template.format(value))

## Keep the refitted best model for the evaluation step
best_estimators.append(["XGBoostR", XGBR_grid.best_estimator_])

### DecisionTreeRegressor

In [None]:
## parameters
## - max_depth uses the real None (unlimited depth); the string "None" is not a valid depth
##   and makes every fit that uses it fail.
## - max_features uses None (all features) instead of "auto", which meant the same thing for
##   a regression tree but was removed in scikit-learn 1.3.
params = {
    "max_depth": [None,10, 40, 80],
    "max_features": [None,"sqrt","log2"],
    "min_samples_leaf": [1,3,5,7],
    "min_samples_split": [2, 6, 12],
    "random_state" : [42],
    "splitter" : ["best","random"]
}

## DecisionTreeRegressor: exhaustive search scored by R^2 with 7-fold cross validation
DTR =  DecisionTreeRegressor()
DTR_grid = GridSearchCV(DTR, params, scoring='r2', cv=7, n_jobs=-1)
DTR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(DTR_grid.best_params_))
print("Best score: {}".format(DTR_grid.best_score_))

## Append to list
best_estimators.append(["DecisionTreeR",DTR_grid.best_estimator_])

### AdaBoostRegressor

In [None]:
## Search space for the AdaBoostRegressor
params = dict(
    n_estimators=[10, 30, 50, 100],
    learning_rate=[.01, 0.1, 0.5, 0.9, 0.95, 1],
    random_state=[42],
)

## AdaBoostRegressor: exhaustive search scored by R^2 with 7-fold cross validation
AdaBoostR = AdaBoostRegressor()
AdaBoostR_grid = GridSearchCV(estimator=AdaBoostR, param_grid=params, scoring='r2', cv=7, n_jobs=-1)
AdaBoostR_grid.fit(X_train, y_train)

## Report the winning combination and its score
for template, value in (
    ("Best parameters:  {}:", AdaBoostR_grid.best_params_),
    ("Best score: {}", AdaBoostR_grid.best_score_),
):
    print(template.format(value))

## Keep the refitted best model for the evaluation step
best_estimators.append(["AdaBoostR", AdaBoostR_grid.best_estimator_])

### GradientBoostRegressor

In [None]:
## Parameters
## - max_features uses None (all features) instead of "auto", which meant the same thing for
##   a regressor but was removed in scikit-learn 1.3.
## - "tol" is not searched: it is only consulted when early stopping (n_iter_no_change) is
##   enabled, which this search never does, so varying it only multiplied the number of
##   identical fits by four.
params = {
    "max_depth": [2, 3, 6, 10],
    "max_features": [None,0.3, 0.6],
    "min_samples_leaf": [1,3],
    "min_samples_split": [2, 5],
    "n_estimators": [30, 50, 100, 200],
    "random_state" : [42]
}

## GradientBoostingRegressor: exhaustive search scored by R^2 with 7-fold cross validation
GBR = GradientBoostingRegressor()
GBR_grid = GridSearchCV(GBR, params, scoring='r2', cv=7, n_jobs=-1)
GBR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(GBR_grid.best_params_))
print("Best score: {}".format(GBR_grid.best_score_))

## Append to list
best_estimators.append(["GradientBoostR",GBR_grid.best_estimator_])

## Evaluate All Optimized Estimators


In [None]:
##import necessary packages
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Wrap every tuned estimator in a pipeline that standardizes the features first;
## each entry is a ["Scaled_<name>", pipeline] pair
pipelines = [
    ["Scaled_" + name, Pipeline([("Scaler", StandardScaler()), (name, model)])]
    for name, model in best_estimators
]

In [None]:
## import packages
from sklearn.model_selection import KFold, cross_val_score

## 10 contiguous folds. KFold without shuffling is already deterministic, and passing
## random_state together with shuffle=False raises ValueError on scikit-learn >= 0.24,
## so random_state is omitted (the splits are unchanged).
kfold = KFold(n_splits=10)

## Cross-validate every scaled pipeline; scores are negative RMSE (closer to 0 is better)
records = []
for name, model in pipelines:
    cv = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=-1, scoring="neg_root_mean_squared_error")
    records.append({
        "model": name,
        "cv": round(cv.mean(), 3),
        "std": "+/- {}".format(round(cv.std(), 4)),
    })

## Build the score table in one go and sort it once, best model first
evaluate = pd.DataFrame(records, columns=["model", "cv", "std"]).sort_values("cv", ascending=False)

evaluate

In [None]:
## Visualization: one bar per model showing its mean cross-validation score
fig, ax = plt.subplots(1,1,sharey=False,figsize=(16,9))

## x/y are passed by keyword: newer seaborn releases no longer accept them positionally.
## The scores are cast to float so the axis is numeric even if the column has object dtype.
bar = sns.barplot(x=evaluate["model"], y=evaluate["cv"].astype(float), ax=ax,
                  palette=sns.cubehelix_palette(evaluate.shape[0]))

## Annotate each bar with its value, slightly beyond the end of the bar
for rec in bar.patches:
    height = rec.get_height()
    ax.text(rec.get_x() + rec.get_width()/2, height-0.02,height,ha="center")
ax.set_title("Cross Validate Score")
ax.set_xticklabels(ax.get_xticklabels(),rotation = 45)

From the chart above, the five models with the best negative RMSE scores are selected. These models will be used in the ensemble VotingRegressor to try to further improve the CV score.

These models are:

* RandomForestRegressor
* AdaBoostRegressor
* ExtraTreesRegressor
* BaggingRegressor
* GradientBoostRegressor

### Ensemble Models with VotingRegressor
A voting regressor is an ensemble meta-estimator that fits several base regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction

Note: VotingRegressor works best with models that differ from one another, for example ExtraTreesRegressor with AdaBoostRegressor. ExtraTreesRegressor and RandomForestRegressor are highly similar, so we just pick the one with the higher CV score (in this case RandomForestRegressor).

So we are going to use just these 4 models:

* RandomForestRegressor
* AdaBoostRegressor
* BaggingRegressor
* GradientBoostRegressor

In [None]:
## Tuned base estimators, keyed by the name each one gets inside a VotingRegressor
voting_members = {
    "RFR": RFR_grid.best_estimator_,
    "AdaBoostR": AdaBoostR_grid.best_estimator_,
    "BaggingR": BR_grid.best_estimator_,
    "GBR": GBR_grid.best_estimator_,
}


def scaled_voting_pipeline(member_names):
    """Build a StandardScaler -> VotingRegressor pipeline.

    Parameters
    ----------
    member_names : list of str
        Keys of `voting_members` to combine, in the order they are handed
        to VotingRegressor.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps "Scaler" and "Votings".
    """
    members = [(key, voting_members[key]) for key in member_names]
    return Pipeline([("Scaler", StandardScaler()),
                     ("Votings", VotingRegressor(members))])


## Every ensemble to evaluate: (label, members of the VotingRegressor)
voting_specs = [
    ## All models
    ("Scaled_all_models",    ["RFR", "AdaBoostR", "BaggingR", "GBR"]),

    ## Combinations of three estimators
    ("Scaled_RFR_BR_GBR",    ["RFR", "BaggingR", "GBR"]),
    ("Scaled_RFR_BR_AdaBR",  ["RFR", "BaggingR", "AdaBoostR"]),
    ## Previously labelled "Scaled_AdaBR_GBR", which hid that RFR is part of this ensemble
    ("Scaled_RFR_AdaBR_GBR", ["RFR", "AdaBoostR", "GBR"]),
    ("Scaled_BR_GBR_AdaBR",  ["BaggingR", "GBR", "AdaBoostR"]),

    ## Combinations of two estimators
    ("Scaled_BR_GBR",        ["BaggingR", "GBR"]),
    ("Scaled_BR_AdaBR",      ["BaggingR", "AdaBoostR"]),
    ("Scaled_BR_RFR",        ["BaggingR", "RFR"]),
    ("Scaled_GBR_AdaBR",     ["GBR", "AdaBoostR"]),
    ("Scaled_GBR_RFR",       ["GBR", "RFR"]),
    ("Scaled_AdaBR_RFR",     ["AdaBoostR", "RFR"]),
]

## List of (label, model) pairs for all candidate models.
## RandomForestRegressor only (current best model) comes first; it is wrapped in the same
## scaler as every ensemble so that the "Scaled_" label is accurate and the comparison is fair.
votings = [
    ("Scaled_RFR", Pipeline([("Scaler", StandardScaler()),
                             ("RFR", voting_members["RFR"])])),
]
votings += [(label, scaled_voting_pipeline(names)) for label, names in voting_specs]

In [None]:
## 10 contiguous folds. KFold without shuffling is already deterministic, and passing
## random_state together with shuffle=False raises ValueError on scikit-learn >= 0.24,
## so random_state is omitted (the splits are unchanged).
kfold = KFold(n_splits=10)

## Cross-validate all the combination models; scores are negative RMSE (closer to 0 is better)
vote_records = []
for name, model in votings:
    cv = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_root_mean_squared_error", n_jobs=-1)
    vote_records.append({
        "model": name,
        "cv": round(cv.mean(), 4),
        "std": "+- {}".format(round(cv.std(), 5)),
    })

## Create the dataframe of cross validation scores, best model first
evaluate_vote = pd.DataFrame(vote_records, columns=["model", "cv", "std"])
evaluate_vote = evaluate_vote.sort_values("cv", ascending=False)
evaluate_vote

In [None]:
## Visualization: one bar per candidate model showing its mean cross-validation score
fig, ax = plt.subplots(1,1,figsize=(16,9))

## x/y are passed by keyword: newer seaborn releases no longer accept them positionally.
## The palette is sized from evaluate_vote (the frame being plotted), not from `evaluate`.
bar = sns.barplot(x=evaluate_vote["model"], y=evaluate_vote["cv"].astype(float), ax=ax,
                  palette=sns.cubehelix_palette(evaluate_vote.shape[0]))

## Annotate each bar with its value, slightly beyond the end of the bar
for rec in bar.patches:
    height = rec.get_height()
    ax.text(rec.get_x() + rec.get_width() /2, height *1.02, height, ha="center")
ax.set_title("Cross Validate Score",fontsize=14)
ax.set_xticklabels(evaluate_vote["model"].to_list(),rotation=45)

It seems that RandomForestRegressor on the standardized dataset performs best compared with all of the ensemble estimators. We will use the RandomForestRegressor estimator to predict the test dataset for our submission.

## Submission

In [None]:
## Best Model : Scaled_RFR (standardise the features, then the tuned random forest)
best_model = Pipeline([("Scaler",StandardScaler()),
                       ("RFR",RFR_grid.best_estimator_)
                      ])
## Fit the model
best_model = best_model.fit(df_train,y) # fit the model with all the train dataset

## Submission
submission = pd.read_csv("/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv")
## expm1 maps the predictions back to the revenue scale
## (assumes the target was log1p-transformed earlier in the notebook - TODO confirm)
submission.iloc[:,1] = np.floor(np.expm1(best_model.predict(df_test)))
## Write the file with a .csv extension so it is recognised as a submission file
submission.to_csv('submission.csv', index=False)
submission

### It's been my pleasure to share my notebook with you.
### If this notebook helped you in any way, please hit that "upvote" button. Thank you!