In [None]:
import random
# Seed Python's built-in `random` module only.
# NOTE(review): this does not seed NumPy's global generator; estimators further
# down that pass no `random_state` may therefore still vary between runs — confirm
# whether an additional NumPy seed is wanted.
random.seed(123)

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
import operator as op

import seaborn as sns
# Apply seaborn's default theme and set the default figure size to 12 x 8.
sns.set(rc={'figure.figsize': (12,8)})

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Data Importing 

In [None]:
# Load the training set from the zipped CSV and preview the first rows.
train_csv_path = '/kaggle/input/restaurant-revenue-prediction/train.csv.zip'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# df_test = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip')
# df_test.head()

In [None]:
# General info on the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Descriptive statistics of the training frame.
df.describe()

# Exploratory Data Analysis

In [None]:
# Label-encode each categorical column into an integer-coded "<name>_cat" column.
# The original text columns are kept in the frame (they are not dropped here).
cat_vars = ['City', 'City Group', 'Type']
for column in cat_vars:
    as_category = df[column].astype('category')
    df[f"{column}_cat"] = as_category.cat.codes
df.head()

In [None]:
# Correlation of every numeric feature with the target variable, drawn as a
# sorted horizontal bar chart.
fig = plt.figure(figsize=(20,16))
# Skip the first column, then keep numeric columns only: the frame still holds
# text columns (City, City Group, Type, Open Date) and DataFrame.corr() on
# recent pandas raises on non-numeric data instead of silently dropping it.
numeric_frame = df[df.columns[1:]].select_dtypes(include='number')
target_corr = numeric_frame.corr()['revenue']
order_corr = target_corr.sort_values()
# Remove the target's self-correlation by label rather than assuming it is the
# last entry after sorting.
feature_corr = order_corr.drop('revenue')
# seaborn >= 0.12 only accepts x / y as keyword arguments. Plain arrays are
# passed so no index alignment is involved, and no global `x` / `y` is left
# behind to shadow the model target `y` defined later in the notebook.
ax = sns.barplot(x=feature_corr.to_numpy(), y=feature_corr.index.to_numpy(), orient='h')
ax.set(xlabel='Correlation with revenue', ylabel='Feature',
       title='Feature correlation with revenue')
plt.show()

In [None]:
# Side-by-side revenue histograms: the left panel is coloured by Type, the
# right panel by City Group.
fig, (ax1, ax2) = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(12, 10),
    sharex='col',
    sharey='row',
    gridspec_kw={'hspace': 0.7, 'wspace': 0.1},
)
fig.suptitle('Distributions of Total Revenue')
fig.subplots_adjust(top=0.85)

for hue_column, axis in (('Type', ax1), ('City Group', ax2)):
    sns.histplot(data=df, x='revenue', bins=25, hue=hue_column, ax=axis)

plt.show()

In [None]:
# Revenue plotted against the restaurant opening date.
# The training file is read a second time so that 'Open Date' is parsed as
# datetimes and used as the index of a separate frame.
timeseries_read_options = {
    'parse_dates': ['Open Date'],
    'index_col': ['Open Date'],
    'na_values': ['999.99'],
}
df_timeseries = pd.read_csv(
    '/kaggle/input/restaurant-revenue-prediction/train.csv.zip',
    **timeseries_read_options,
)
sns.lineplot(x='Open Date', y='revenue', data=df_timeseries)
plt.show()

# Feature Transformation

In [None]:
# Changing the open_date into datetime
# df['Open Date'] = pd.to_datetime(df['Open Date'])

In [None]:
# List the distinct values (and how many there are) of City, City Group and
# Type, to judge whether categorical encoding is needed.
for column in ('City', 'City Group', 'Type'):
    levels = df[column].unique()
    print(levels, len(levels))

In [None]:
def encode_and_bind(df: pd.DataFrame, feature: str) -> pd.DataFrame:
    """One-hot encode ``feature`` and append the indicator columns to ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the column to encode. It is not modified.
    feature : str
        Name of the categorical column to expand.

    Returns
    -------
    pd.DataFrame
        A new frame holding every original column (including ``feature``)
        followed by one ``"<feature>_<value>"`` indicator column per distinct
        value.
    """
    # Prefix each indicator with the source column name. An empty prefix gave
    # anonymous "_<value>" columns that collide (duplicate column names) as
    # soon as two encoded features share a value.
    # A fixed integer dtype keeps the indicators numeric on every pandas
    # version; the default dtype is bool on pandas >= 2.0.
    dummies = pd.get_dummies(df[[feature]], prefix=feature, dtype='uint8')
    return pd.concat([df, dummies], axis=1)
# One-hot encode the three categorical columns in turn, then preview the result.
for feature in ('City', 'City Group', 'Type'):
    df = encode_and_bind(df, feature)
df.head(3)

# Model Building

In [None]:
# Preview the frame that the feature matrix is built from.
df.head()

In [None]:
# Features and Target
# Drop the target and the raw text/date columns; their encoded counterparts
# (the "_cat" codes and one-hot indicators) stay in the frame.
feature_frame = df.drop(['revenue', 'Open Date', 'City', 'City Group', 'Type'], axis=1)
# Also drop the row identifier so it is not fed to the models as a predictor
# (the correlation plot likewise skips the first column).
# NOTE(review): assumes the identifier column is named 'Id' — confirm against
# the CSV header; errors='ignore' makes this a no-op if it is absent.
feature_frame = feature_frame.drop(columns=['Id'], errors='ignore')
# Force a float matrix so that boolean indicator columns cannot turn the
# feature array into an object array.
X = feature_frame.to_numpy(dtype=float)
y = df['revenue'].values
# Hold out one third of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=202)

## 1 - Single Models

In [None]:
# Multiple Linear Regression
# LinearRegression is already imported in the notebook's import cell, so the
# redundant re-import is dropped.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn, while this form works on
# every version.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

In [None]:
# Single Tree Regression
# random_state fixes the tree's internal tie-breaking so reruns give the same score.
reg = DecisionTreeRegressor(max_depth=10, random_state=0)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

In [None]:
# SVR - linear
reg = SVR(kernel='linear', C=100, gamma='auto')
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

In [None]:
# SVR - rbf
reg = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

In [None]:
# SVR - poly (degree 2)
reg = SVR(kernel='poly', C=100, gamma='auto', degree=2, epsilon=.1,
          coef0=1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

## 2 - Ensemble Learning

In [None]:
# Random Forest
# Imported in this cell because the notebook's import cell does not bring in
# RandomForestRegressor; without it the cell fails with a NameError.
from sklearn.ensemble import RandomForestRegressor
# The criterion is left at its default (squared error): the "mse" spelling is
# rejected by current scikit-learn. random_state makes the bootstrap sampling,
# and therefore the reported score, reproducible.
reg = RandomForestRegressor(n_estimators=1000, random_state=0)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

In [None]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
reg = GradientBoostingRegressor(random_state=0)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

In [None]:
# Ada Boosting
from sklearn.ensemble import AdaBoostRegressor
reg = AdaBoostRegressor(random_state=0, n_estimators=1000)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

In [None]:
# Bagging with an SVR base learner
from sklearn.ensemble import BaggingRegressor
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator`, and the old name is rejected by current
# scikit-learn. The first positional parameter works under both names.
reg = BaggingRegressor(SVR(), n_estimators=1000, random_state=0)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in current scikit-learn.
rmse = np.sqrt(skm.mean_squared_error(y_test, y_pred))
print("RMSE: {0}".format(rmse))

### What's next? Refactor the modeling code into four functions: a train/test-split version and a cross-validation version, each for the single models and for the ensemble methods.