In [None]:
import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the competition's train and test tables; pandas reads the zipped CSVs directly.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/restaurant-revenue-prediction/train.csv.zip',
        '/kaggle/input/restaurant-revenue-prediction/test.csv.zip',
    )
)
# Only a cell's last expression is rendered, so this preview is evaluated but not shown.
df_train.head()
# Print the test set's columns, dtypes and non-null counts.
df_test.info()

The training data has the following columns:
Open Date - the date when the restaurant was opened (Date - dd-mm-yyyy)
City - the city the restaurant is in (Categorical data)
City Group - the group the city belongs to (Categorical data)
Type - type of the restaurant. FC: Food Court, IL: Inline, DT: Drive Thru, MB: Mobile (Categorical data)
P1 - P37 - Obscure data
revenue - revenue of the restaurant in a given year (Float)

There are no null values in train data and test data.

In [None]:
# Data type conversion
# One shared mapping so that train and test are converted identically:
# the three label columns become pandas categoricals, the opening date a datetime.
column_dtypes = {
    "City": 'category',
    "City Group": 'category',
    "Type": 'category',
    "Open Date": 'datetime64[ns]',
}
df_train = df_train.astype(column_dtypes)
df_test = df_test.astype(column_dtypes)

Outline :
1. Check categories in train and test data
2. Choose 1 category and assign label which matches in both train and test set.
3. See how to handle categorical data in prediction
4. Correlate the obscure data with revenue
5. Create training and test set
6. Put it in different prediction models 
7. Get rmse value

In [None]:
# 1. Check categories in train and test data
# Training set: number of distinct cities, then the distinct values of
# City Group and Type.
cities = df_train['City'].unique().tolist()
print(len(cities))

city_group, type_ = (
    df_train[column].unique().tolist() for column in ('City Group', 'Type')
)
print(city_group)
print(type_)

In [None]:
# Same category check on the test set, so its output can be compared with the
# training-set output of the previous cell. Every column is read from df_test;
# reading City Group / Type from df_train here would make the comparison vacuous.
cities = df_test['City'].unique().tolist()
print(len(cities))
city_group = df_test['City Group'].unique().tolist()
print(city_group)
type_ = df_test['Type'].unique().tolist()
print(type_)

Observations :
The City column in the train data is insufficient, as the test data has more city categories than the train data, so we will drop it.
City Group and Type have the same categories in both train and test data.

In [None]:
# 2. Choose 1 category and assign label which matches in both train and test set.
# Encode the remaining categorical columns as integer codes, then rank every
# feature by the importance a decision tree assigns to it when predicting revenue.

from sklearn.tree import DecisionTreeRegressor

# Number of top-ranked features kept for the models trained later.
N_TOP_FEATURES = 20

# Replace each category with its integer code. The dtype guard keeps the cell
# re-runnable: after the first run these columns already hold integer codes and
# no longer expose the `.cat` accessor.
for col in ('City Group', 'Type'):
    if isinstance(df_train[col].dtype, pd.CategoricalDtype):
        df_train[col] = df_train[col].cat.codes

# Drop the columns that are not used as features; errors='ignore' lets a re-run
# pass once they are already gone.
df_train = df_train.drop(columns=['Id', 'Open Date', 'City'], errors='ignore')
X = df_train.drop(columns='revenue')
y = df_train[['revenue']]  # kept as a one-column DataFrame, as later cells receive it

# A fixed seed keeps the importance ranking, and therefore the selected
# features, identical from one run to the next.
tree = DecisionTreeRegressor(random_state=0).fit(X, y)

# Feature importance dataframe
feature_imp = pd.DataFrame({
    'Importance': np.round(tree.feature_importances_, 3),
    'Features': X.columns,
})

# Sort from most to least important and keep the top N_TOP_FEATURES rows.
feature_imp_order = (
    feature_imp
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
    .head(N_TOP_FEATURES)
)
feature_imp_order

According to the feature importance order, we will choose the 20 highest-ranked parameters (the rows of the table above) for predicting revenue.

In [None]:
# Training
# Fit three regressors on the standardized, importance-selected features and
# report their fit on the training data.
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Feature matrix restricted to the selected features; the target is a 1-D Series
# so that every estimator receives y with shape (n_samples,).
X_main = df_train[feature_imp_order.Features.values]
y_main = df_train['revenue']

# Standardize the features; scaler_train holds the training mean and variance.
scaler_train = StandardScaler()
X_main_scaled = scaler_train.fit_transform(X_main)

linear_regr = LinearRegression()
svm_regr = SVR(kernel='rbf')
dt_regr = DecisionTreeRegressor(random_state=0)

# No hold-out split is made: each model is fitted and scored on the full
# training set, so the numbers below measure training fit only.
for label, model in (
    ("Linear regression", linear_regr),
    ("SVM regressor", svm_regr),
    ("Decision tree regressor", dt_regr),
):
    model.fit(X_main_scaled, y_main)
    y_pred = model.predict(X_main_scaled)
    # For regressors `score` returns R^2 (it can be negative), not a classification accuracy.
    accuracy = model.score(X_main_scaled, y_main)
    print("Train Accuracy {}%".format(int(round(accuracy * 100))))
    # RMSE as the square root of MSE; this avoids the `squared=False` argument,
    # which newer scikit-learn releases no longer accept.
    print("Training RMSE {} ".format(label), np.sqrt(mean_squared_error(y_main, y_pred)))


In [None]:
# Prepare test data
# Encode the categorical columns as integer codes, as was done for the training set.
# NOTE(review): the codes come from the test set's own categories; they line up
# with the training codes only if both sets order their shared categories the
# same way — verify against the category listings printed earlier.
for col in ('City Group', 'Type'):
    df_test[col] = df_test[col].cat.codes

# Keep the ids for the submission tables before narrowing to the model features.
test_ids = df_test['Id'].values

df_test = df_test[feature_imp_order.Features.values]
X_test = df_test

# Scale with the scaler fitted on the training data: the models were trained on
# features standardized with the training mean/variance, so the test features
# must go through the same transform rather than a scaler fitted on the test set.
X_test_scaled = scaler_train.transform(X_test)


def _prediction_frame(ids, predictions):
    """Return an (Id, Prediction) table with predictions rounded to 2 decimals.

    `predictions` may be 1-D or a single-column 2-D array; it is flattened first.
    """
    return pd.DataFrame({
        'Id': ids,
        'Prediction': np.round(np.ravel(predictions), 2),
    })


y_pred_linear_regr = linear_regr.predict(X_test_scaled)
result_df_linear_regr = _prediction_frame(test_ids, y_pred_linear_regr)

y_pred_svm_regr = svm_regr.predict(X_test_scaled)
result_df_svm_regr = _prediction_frame(test_ids, y_pred_svm_regr)

y_pred_dt_regr = dt_regr.predict(X_test_scaled)
result_df_dt_regr = _prediction_frame(test_ids, y_pred_dt_regr)

# Only the decision-tree predictions are written out as a submission file.
result_df_dt_regr.to_csv('Prediction_DT.csv', index=False)
result_df_dt_regr