In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!unzip /kaggle/input/restaurant-revenue-prediction/test.csv.zip -d test

In [None]:
!unzip /kaggle/input/restaurant-revenue-prediction/train.csv.zip -d train

In [None]:
import pandas as pd

# Load the extracted training CSV; its first column is used as the row index.
train_csv_path = 'train/train.csv'
train_data = pd.read_csv(train_csv_path, index_col=0)
train_data.head()

# Exploratory Data Analysis

In [None]:
# Summary statistics of the training frame
train_data.describe()

In [None]:
# (rows, columns) of the training frame
train_data.shape

In [None]:
# Column labels of the training frame
train_data.columns

In [None]:
# Data type of every column
train_data.dtypes

There are mainly 2 categorical variables, namely:
> * City Group  
> * Type (type of the restaurant. FC: Food Court, IL: Inline, DT: Drive Thru, MB: Mobile)

And 37 numerical variables (discrete):
> * P1 to P37

In [None]:
# Number of restaurants per city, most frequent city first
restaurants_per_city = train_data["City"].value_counts()
restaurants_per_city

> Istanbul has Maximum number of restaurants = 50


 Visualizing the data for finding some insights

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Total revenue per city, drawn on a wide figure so every city label fits.
fig, ax = plt.subplots(figsize=(30,10))
revenue_by_city = train_data.groupby("City")["revenue"].sum()
ax.bar(revenue_by_city.index, revenue_by_city)
ax.set_xlabel("City")
ax.set_ylabel("Revenue")
plt.show()

>Istanbul is mainly generating exceptionally high revenue as compared to other cities

In [None]:
# Total revenue for each city group, shown as a bar chart.
revenue_by_city_group = train_data.groupby("City Group")["revenue"].sum()
plt.bar(revenue_by_city_group.index, revenue_by_city_group)
plt.xlabel("City Group")
plt.ylabel("Revenue")
plt.show()

>Restaurants in  Big Cities are generating  higher revenue


In [None]:
# Total revenue for each restaurant type, shown as a bar chart.
revenue_by_type = train_data.groupby("Type")["revenue"].sum()
plt.bar(revenue_by_type.index, revenue_by_type)
plt.xlabel("Type")
plt.ylabel("Revenue")
plt.show()

> * FC type is generating the maximum revenue.
> * IL is closely competing with the FC type.
> * DT is generating an almost insignificant amount of revenue.
> * The MB type is not present at all.

In [None]:
# visualizing remaining features , looking for correlation between them
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()
fig, ax = plt.subplots(figsize=(30,30))
# Correlation is only defined for numeric data. At this point train_data still holds
# text columns (e.g. City, Type), and recent pandas versions raise on them instead of
# silently skipping, so restrict the frame to its numeric columns explicitly.
numeric_columns = train_data.select_dtypes(include="number")
correlation_matrix = numeric_columns.corr()
sns.heatmap(correlation_matrix,annot=True,linewidths=.5,ax=ax)

We can see that there is a high correlation among some features.

Lets find VIF of the features:

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# revenue is the variable to be predicted, so only the P1..P37 columns are scored
features = train_data.loc[:, "P1":"P37"]
feature_matrix = features.values
vif_scores = [
    variance_inflation_factor(feature_matrix, column_position)
    for column_position in range(len(features.columns))
]
# one row per feature, ordered from the lowest to the highest VIF
vif_data = pd.DataFrame({"features": features.columns, "vif": vif_scores}).sort_values(by=["vif"])
vif_data

We can see that there exists high multicollinearity in our data.
* We don't know what is the source of this data in this case. Data in columns P1 - P37 is divided in three categories of ***`obfuscated data.`***
>* Demographic data are gathered from third party providers with GIS systems. These include population in any given area, age and gender distribution, development scales.
>* Real estate data mainly relate to the m2 of the location, front facade of the location, car park availability.
>* Commercial data mainly include the existence of points of interest including schools, banks, other QSR operators.


**Since the data is obfuscated, we can't clearly identify which feature's change brings a change in revenue.**


By observing the heat map, we can say that P2, P6 and P28 have a significant correlation with revenue compared to the others.

In [None]:
# Scatter revenue against each of P2, P6 and P28, one numbered figure per feature.
for figure_number, feature_name in enumerate(["P2", "P6", "P28"], start=1):
    plt.figure(figure_number)
    plt.xlabel(feature_name)
    plt.ylabel("revenue")
    plt.scatter(train_data[feature_name], train_data["revenue"])


# 2. Data Preprocessing 

As we have seen above, "Istanbul" is the city with the maximum number of restaurants. Most of the other cities have significantly fewer restaurants. We can't one-hot encode each and every city, because that would create too many features.

So I will divide the city restaurants into different groups 
> * All cities having more than 3 restaurants will have individual group 
> * All the rest will be put under the group "other".

In [None]:
# True for every city that has more than 3 restaurants, False otherwise
restaurants_per_city = train_data["City"].value_counts()
restaurants_per_city.gt(3)

* İstanbul 
* Ankara   
* İzmir    
* Samsun   
* Bursa    
* Sakarya  
* Antalya  
will be encoded as individual columns , rest will be put in "others group"

In [None]:
# One-hot encode the major cities (those with more than 3 restaurants) and collapse
# every other city into a single "City_Other" indicator column.
major_city_columns = ["City_İstanbul", "City_Ankara", "City_İzmir", "City_Bursa", "City_Samsun", "City_Antalya", "City_Sakarya"]

# dtype=int keeps the indicators numeric. Recent pandas versions default to bool, which
# does not combine cleanly with the numeric P-columns once the frame becomes an array.
city_dummies = pd.get_dummies(train_data[["City"]], prefix = ['City'], dtype=int)

# reindex keeps only the major-city columns in a fixed order and fills an all-zero
# column for any major city that does not occur in this frame.
city_encodings = city_dummies.reindex(columns=major_city_columns, fill_value=0)

# A row is "Other" exactly when none of the major-city indicators is set. This is a
# vectorised replacement for the former row loop, whose chained assignment
# (frame[col][index] = 1) is not guaranteed to write back into the frame.
no_major_city = city_encodings[major_city_columns].sum(axis=1) == 0
city_encodings["City_Other"] = no_major_city.astype(int)
city_encodings

In [None]:
train_data = pd.merge(train_data, city_encodings, left_index = True, right_index = True)

In [None]:
train_data.drop(["City"],axis=1,inplace=True)
train_data.head()

After grouping Cities, lets group City Group feature

In [None]:
# one hot encoding City Groups
# dtype=int keeps the indicators numeric. Recent pandas versions default to bool, which
# does not combine cleanly with the numeric P-columns once the frame becomes an array.
city_group_encodings = pd.get_dummies(train_data[["City Group"]], prefix = ['City Group'], dtype=int)
city_group_encodings

In [None]:
train_data = pd.merge(train_data, city_group_encodings, left_index = True, right_index = True)

In [None]:
train_data.drop(["City Group"], axis=1,inplace=True)

In [None]:
train_data.head()

After grouping Cities, lets group Type feature

> Since there are no MB values and very few DT values, let's combine them into one group called "Other_Type".

In [None]:
# One-hot encode the restaurant type. Type_DT and Type_FC stay as separate indicators;
# every row that is neither DT nor FC is flagged in Type_Other.
# NOTE(review): the preceding markdown describes combining MB and DT instead - confirm
# which grouping is intended; this cell keeps the grouping the code has always used.
# dtype=int keeps the indicators numeric (recent pandas versions default to bool).
type_dummies = pd.get_dummies(train_data[["Type"]], prefix = ['Type'], dtype=int)

# reindex keeps just the two named columns and fills zeros if one is absent.
type_encodings = type_dummies.reindex(columns=["Type_DT", "Type_FC"], fill_value=0)

# Vectorised replacement for the former row loop, whose chained assignment
# (frame[col][index] = 1) is not guaranteed to write back into the frame.
neither_dt_nor_fc = type_encodings[["Type_DT", "Type_FC"]].sum(axis=1) == 0
type_encodings["Type_Other"] = neither_dt_nor_fc.astype(int)
type_encodings

In [None]:
train_data = pd.merge(train_data, type_encodings, left_index = True, right_index = True)

In [None]:
train_data.drop(["Type"],axis=1,inplace=True)

In [None]:
train_data.head()

In [None]:
#removing open date
train_data.drop(["Open Date"],axis=1,inplace=True)

In [None]:
train_data.head()

In [None]:
# treating obfuscated data 
# keeping P2,P6 and P28 and removing rest all unnecesasry features from train_data
train_data.drop(["P1","P3","P4","P5","P7",	"P8",	"P9",	"P10",	"P11",	"P12",	"P13",	"P14",	"P15",	"P16",	"P17",	"P18",	"P19",	"P20",	"P21",	"P22",	"P23",	"P24",	"P25","P26","P27","P29",	"P30",	"P31",	"P32",	"P33",	"P34",	"P35",	"P36",	"P37"],axis=1,inplace=True)

In [None]:
train_data.head()

>from the scatter plots above we had observed there were outliers in the data. Revenue above 1.25 * 10^7  in all the 3 graphs were the outliers. 

In [None]:
train_data[train_data["revenue"] > 12500000].index

In [None]:
train_data.drop(train_data[train_data["revenue"] > 12500000].index, inplace=True)

In [None]:
plt.figure(1)
plt.xlabel("P2")
plt.ylabel("revenue")
plt.scatter(train_data["P2"],train_data["revenue"])
plt.figure(2)
plt.xlabel("P6")
plt.ylabel("revenue")
plt.scatter(train_data["P6"],train_data["revenue"])
plt.figure(3)
plt.xlabel("P28")
plt.ylabel("revenue")
plt.scatter(train_data["P28"],train_data["revenue"])

In [None]:
train_data.head()

# 3. Model Training

In [None]:
# (rows, columns) of the training frame after preprocessing
train_data.shape

dividing 134 rows in 80:20 ratio for X_Train and X_Val
* 80% of 134 -> 107
* 20% of 134 -> 27

In [None]:
Y_Train = train_data["revenue"]
X_Train = train_data.drop(["revenue"],axis=1)

In [None]:
X_Train.head(), X_Train.shape

In [None]:
Y_Train.head(), Y_Train.shape

In [None]:
import tensorflow as tf

In [None]:
X_Train_Tensor = tf.constant(X_Train)
X_Train_Tensor

In [None]:
Y_Train_Tensor = tf.constant(Y_Train)
Y_Train_Tensor

In [None]:
Y_Train_Tensor = tf.reshape(Y_Train_Tensor,shape=(134,1))
Y_Train_Tensor

In [None]:
# Fix TensorFlow's global random seed before the model is built
tf.random.set_seed(42)

In [None]:
# Shape of the training feature tensor
X_Train_Tensor.shape

In [None]:
# Fully connected regression network: seven ReLU hidden layers, each with an L1
# activity regulariser, followed by a single linear output unit.
# The input width is read from the training tensor rather than hard-coded, so the
# model keeps matching the data if the set of feature columns changes.
n_features = X_Train_Tensor.shape[1]
hidden_layer_units = [16, 32, 32, 32, 13, 16, 16]

# The first hidden layer is the only one that declares the input shape.
first_hidden_layer = tf.keras.layers.Dense(
    hidden_layer_units[0],
    input_shape=(n_features,),
    activation='relu',
    activity_regularizer=tf.keras.regularizers.L1(0.01),
)
remaining_hidden_layers = [
    tf.keras.layers.Dense(units, activation='relu', activity_regularizer=tf.keras.regularizers.L1(0.01))
    for units in hidden_layer_units[1:]
]
output_layer = tf.keras.layers.Dense(1, activation=None)

model = tf.keras.Sequential([first_hidden_layer, *remaining_hidden_layers, output_layer])

In [None]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.mse
)

In [None]:
model.summary()

In [None]:
history = model.fit(X_Train_Tensor,Y_Train_Tensor,batch_size=128,epochs=2000,validation_split = 0.2)

In [None]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.head()

In [None]:
def plot_loss(history):
  plt.plot(history.history['loss'], label='loss')
  plt.plot(history.history['val_loss'], label='val_loss')
  plt.xlabel('Epoch')
  plt.ylabel('Error [revenue]')
  plt.legend()
  plt.grid(True)

In [None]:
plot_loss(history)

### Let's visualize how much difference is there between Observed values and Predicted Values. 


In [None]:
Y_Pred = model.predict(X_Train_Tensor)

In [None]:
plt.figure(figsize=(30,10))
plt.plot(Y_Pred)
plt.plot(Y_Train_Tensor)
plt.legend(["Y_Pred","Y_Train_Tensor"])


In [None]:
# Load the extracted test CSV; its first column is used as the row index.
test_csv_path = 'test/test.csv'
test_data = pd.read_csv(test_csv_path, index_col=0)
test_data.head()

# Preprocessing Test data for getting predictions 

In [None]:
# One-hot encode the major cities (those with more than 3 restaurants) and collapse
# every other city into a single "City_Other" indicator column.
major_city_columns = ["City_İstanbul", "City_Ankara", "City_İzmir", "City_Bursa", "City_Samsun", "City_Antalya", "City_Sakarya"]

# dtype=int keeps the indicators numeric. Recent pandas versions default to bool, which
# does not combine cleanly with the numeric P-columns once the frame becomes an array.
city_dummies = pd.get_dummies(test_data[["City"]], prefix = ['City'], dtype=int)

# reindex keeps only the major-city columns in a fixed order and fills an all-zero
# column for any major city that does not occur in this frame (instead of a KeyError).
city_encodings = city_dummies.reindex(columns=major_city_columns, fill_value=0)

# A row is "Other" exactly when none of the major-city indicators is set. This is a
# vectorised replacement for the former row loop, whose chained assignment
# (frame[col][index] = 1) is not guaranteed to write back into the frame.
no_major_city = city_encodings[major_city_columns].sum(axis=1) == 0
city_encodings["City_Other"] = no_major_city.astype(int)
city_encodings

In [None]:
test_data = pd.merge(test_data, city_encodings, left_index = True, right_index = True)

In [None]:
# one hot encoding City Groups
city_group_encodings = pd.get_dummies(test_data[["City Group"]], prefix = ['City Group'])
city_group_encodings

In [None]:
test_data = pd.merge(test_data, city_group_encodings, left_index = True, right_index = True)

In [None]:
test_data.drop(["City Group"], axis=1,inplace=True)

In [None]:
# One-hot encode the restaurant type. Type_DT and Type_FC stay as separate indicators;
# every row that is neither DT nor FC is flagged in Type_Other.
# dtype=int keeps the indicators numeric (recent pandas versions default to bool).
type_dummies = pd.get_dummies(test_data[["Type"]], prefix = ['Type'], dtype=int)

# reindex keeps just the two named columns and fills zeros if one is absent.
type_encodings = type_dummies.reindex(columns=["Type_DT", "Type_FC"], fill_value=0)

# Vectorised replacement for the former row loop, whose chained assignment
# (frame[col][index] = 1) is not guaranteed to write back into the frame.
neither_dt_nor_fc = type_encodings[["Type_DT", "Type_FC"]].sum(axis=1) == 0
type_encodings["Type_Other"] = neither_dt_nor_fc.astype(int)
type_encodings

In [None]:
test_data = pd.merge(test_data, type_encodings, left_index = True, right_index = True)

In [None]:
test_data.drop(["Type"], axis=1,inplace=True)

In [None]:
test_data.head()

In [None]:
test_data.drop(["City","Open Date"],axis=1,inplace=True)

In [None]:
test_data.drop(["P1","P3","P4","P5","P7",	"P8",	"P9",	"P10",	"P11",	"P12",	"P13",	"P14",	"P15",	"P16",	"P17",	"P18",	"P19",	"P20",	"P21",	"P22",	"P23",	"P24",	"P25","P26","P27","P29",	"P30",	"P31",	"P32",	"P33",	"P34",	"P35",	"P36",	"P37"],axis=1,inplace=True)

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
# Convert the preprocessed test frame into a tensor for the model
X_Test_Tensor = tf.constant(test_data)

In [None]:
# Shape of the test feature tensor
X_Test_Tensor.shape

In [None]:
# Run the trained model on the test features
Y_Predictions = model.predict(X_Test_Tensor)

In [None]:
Y_Predictions

In [None]:
# Store the model output as a new "Predictions" column on the test frame
test_data["Predictions"] = Y_Predictions

In [None]:
test_data.head()

In [None]:
# Keep only the prediction column; the frame's index is written as the first CSV column.
submit_dataFrame = test_data.loc[:, ["Predictions"]]
print(submit_dataFrame)
submit_dataFrame.to_csv("submission.csv")