In [None]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
import os
from math import sqrt

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, fname)
    for root, _dirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

In [None]:
# Directory holding the zipped competition CSV files.
PATH='/kaggle/input/restaurant-revenue-prediction'

# Read the train and test archives, in that order.
train_df, test_df = (
    pd.read_csv(os.path.join(PATH, archive))
    for archive in ('train.csv.zip', 'test.csv.zip')
)

In [None]:
# Report the dimensions of both frames, then the training column names.
for label, frame in (('Train Data Shape:', train_df), ('Test Data Shape:', test_df)):
    print(label, frame.shape)
print('Features:', train_df.columns)

In [None]:
# Preview the first rows of the freshly loaded training data.
train_df.head()

In [None]:
# Density curve of the target variable.
# sns.distplot is deprecated in current seaborn releases; sns.kdeplot draws the
# same kernel density estimate that distplot(hist=False) used to produce.
sns.kdeplot(train_df['revenue'])
plt.title('Target Variable Distribution')
plt.show()

The target variable is left skewed

### EDA

In [None]:
# Count the missing values in each column of the training data.
train_df.isnull().sum()

No Null Values are present in the train dataframe 

#### Instead of storing the complete open date we can store only open month and year 

In [None]:
def get_month(date):
    """Return the first '/'-separated field of ``date`` as an int (the month)."""
    month_text, *_ = date.split('/')
    return int(month_text)

def get_year(date):
    """Return the last '/'-separated field of ``date`` as an int (the year)."""
    *_, year_text = date.split('/')
    return int(year_text)
    
# Derive the integer Month and Year columns from the training 'Open Date' strings.
for column, extractor in (('Month', get_month), ('Year', get_year)):
    train_df[column] = train_df['Open Date'].apply(extractor)

In [None]:
# Derive the same integer Month and Year columns for the test data.
for column, extractor in (('Month', get_month), ('Year', get_year)):
    test_df[column] = test_df['Open Date'].apply(extractor)

In [None]:
# Preview the training frame again, now with the Month and Year columns appended.
train_df.head()

In [None]:
# Id is only a row identifier, so it is removed from the features.
print(train_df['Id'].shape)

# Keep the test Ids: they are needed later to label the predictions.
test_indexes = test_df['Id']

for frame in (train_df, test_df):
    frame.drop(columns='Id', inplace=True)

### Analysing the month feature

In [None]:
# Bar chart of how many restaurants opened in each calendar month.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Month', data=train_df, ax=ax)
ax.set(
    xlabel='Opening Month',
    ylabel='Openings',
    title='No of openings per month',
)
plt.show()

Most restaurants opened in August and December.

In [None]:
# Mean revenue of the restaurants grouped by their opening month.
fig, ax = plt.subplots(figsize=(13, 5))
months_revenue_mean = train_df.groupby('Month')['revenue'].mean()
sns.pointplot(
    x=months_revenue_mean.index,
    y=months_revenue_mean.values,
    ax=ax,
)
ax.set_title('Revenue Vs Month')
plt.show()

Mean revenue is highest for restaurants opened in January and September, and lowest for those opened in May, June and July.

In [None]:
# Number of training rows opened in May, June and July.
# Month holds integers (get_month returns int), so the months are selected by
# their integer values; comparing against strings such as '05' never matches.
# A single Series is the cell's last expression, so all three counts are shown.
train_df['Month'].value_counts().reindex([5, 6, 7], fill_value=0)

### Analysing the year feature

In [None]:
# Bar chart of how many restaurants opened in each year.
fig, ax = plt.subplots(figsize=(13, 5))
sns.countplot(x='Year', data=train_df, ax=ax)
ax.set(ylabel='Number of Openings', title='Number Of Openings Per Year')
plt.show()

The largest number of restaurants opened in 2011.

In [None]:
# Mean revenue of the restaurants grouped by their opening year.
plt.figure(figsize=(14,5))
year_revenue_means=train_df.groupby('Year')['revenue'].mean()
# x/y are passed by keyword: recent seaborn releases reject positional data vectors.
sns.pointplot(x=year_revenue_means.index,y=year_revenue_means.values)
# Years are on the x axis and revenue on the y axis (the labels were swapped before).
plt.xlabel('Year')
plt.ylabel('Revenue')
plt.title('Revenue Per Year')
plt.show()

<pre>
1) The years 1999 and 2000 witnessed the highest revenues, while revenue decreases steadily from 2007 onwards.
2) The years 2013 and 2014 had the lowest revenues, which might be due to the smaller number of samples.
</pre>


In [None]:
# Year holds integers (get_year returns int), so compare against ints;
# comparing against the strings '2013'/'2014' always yields a count of 0.
print('Datapoints in Year 2013:',(train_df['Year']==2013).sum())
print('Datapoints in Year 2014:',(train_df['Year']==2014).sum())

So 2013 can be considered as the worst year for restaurants because the revenue is lowest even after having considerable data points.

### Analysing City Group Feature

In [None]:
# Show the distinct categories present in the 'City Group' column.
print("City Group Categoies:",train_df['City Group'].unique())

In [None]:
# Number of training rows per City Group category.
# The column is passed as x=...: recent seaborn releases take `data` as the first
# positional parameter, so a positional column name collides with data=train_df.
sns.countplot(x='City Group', data=train_df)
plt.title('City Group Counts')
plt.show()

In [None]:
# Number of training rows in each City Group category, as a table.
train_df['City Group'].value_counts()

In [None]:
# Mean revenue per City Group.
# The mean (not the sum) is used so the groups stay comparable when they contain
# different numbers of restaurants; it also matches the variable's name and the
# Month/Year aggregations used elsewhere in this notebook.
city_group_revenue_means=train_df.groupby('City Group')['revenue'].mean()
city_group_revenue_means

In [None]:
# Plot revenue against City Group directly from the training rows; seaborn's
# lineplot aggregates the repeated x values (mean with an error band by default).
sns.lineplot(x='City Group',y='revenue',data=train_df)

Restaurants in cities of type Big Cities have higher revenues; all remaining cities are grouped together as others.

In [None]:
# One-hot encode the training City Group categories and append them as new
# columns to the right of the existing ones.
city_group_dummies = pd.get_dummies(train_df['City Group'])
train_df = pd.concat(
    objs=[train_df, city_group_dummies],
    axis='columns',
)

In [None]:
# One-hot encode the test City Group categories and append them as new columns.
test_city_group_dummies = pd.get_dummies(test_df['City Group'])
test_df = pd.concat(
    objs=[test_df, test_city_group_dummies],
    axis='columns',
)

In [None]:
# Preview the training frame with the City Group dummy columns appended.
train_df.head()

### Analysing the type feature

In [None]:
# Compare the restaurant Type categories present in each dataset.
for label, frame in (('Tyes in train df:', train_df), ('Types in test df:', test_df)):
    print(label, frame['Type'].unique())

In [None]:
# Side-by-side counts of each restaurant Type in the train and test sets.
fig,ax = plt.subplots(1,2,figsize=(9,5))
# The column is passed via x=/data=: recent seaborn releases do not accept a
# data vector as the first positional argument of countplot.
sns.countplot(x='Type',data=train_df,ax=ax[0])
ax[0].set_title('Train set')
sns.countplot(x='Type',data=test_df,ax=ax[1])
ax[1].set_title('Test set')
plt.show()

In the train set we don't have a single observation with Type MB, which might be a problem.

In [None]:
# Integer code assigned to each restaurant Type label.
type_map={'IL':0,'FC':1,'DT':2,'MB':3}

# Replace the Type labels with their integer codes in both frames; a label
# missing from type_map raises KeyError.
for frame in (train_df, test_df):
    frame['Type'] = frame['Type'].apply(lambda label: type_map[label])

In [None]:
## converting the type into dummies
type_dummies=pd.get_dummies(train_df['Type'])
train_df=pd.concat([train_df,type_dummies],axis=1)
train_df['3']=[0]*train_df.shape[0]

In [None]:
# One dummy column per Type code in type_map, in sorted code order, even when a
# code does not occur in the test data (such a column is filled with zeros).
# This keeps the column set independent of which categories happen to be present.
test_type_dummies=pd.get_dummies(test_df['Type']).reindex(
    columns=sorted(type_map.values()),fill_value=0)
test_df=pd.concat([test_df,test_type_dummies],axis=1)

### Analysing the City Feature

In [None]:
# List the distinct city names in the training data.
train_df['City'].unique() 

Dropping the city name for now as it has too many unique values.

In [None]:
# Preview the training frame before the already-encoded source columns are dropped.
train_df.head()

In [None]:
# Remove the raw columns whose information is already encoded (or, for City,
# deliberately left out) from both frames.
used_columns = ['Open Date','City','City Group','Type']
for frame in (train_df, test_df):
    frame.drop(columns=used_columns, inplace=True)

In [None]:
# Preview the training frame after dropping the raw columns.
train_df.head()

In [None]:
# Preview the test frame after dropping the raw columns.
test_df.head()

## Machine Learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import KFold

In [None]:
# Report the dimensions of both frames once feature engineering is finished.
for label, frame in (
    ('Train Data Shape After EDA:', train_df),
    ('Test Data Shape After EDA:', test_df),
):
    print(label, frame.shape)

In [None]:
# Separate the target from the training features.
Y = train_df['revenue']
train_df.drop(columns='revenue', inplace=True)

# Plain arrays for scikit-learn; columns are matched by position, not by name.
X, X_Test = (frame.values for frame in (train_df, test_df))

In [None]:
# Shapes of the train and test feature matrices; the column counts must match
# for a model fitted on X to predict on X_Test.
X.shape,X_Test.shape

In [None]:
# Candidate regressors compared by cross-validation, keyed by display name.
# The tree-based models are randomised, so they get a fixed seed (the same value
# used for train_test_split in this notebook) to keep the scores reproducible.
regressor_models={
    'Linear Regression':LinearRegression(),
    'Decision Tree Regressor':DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor':RandomForestRegressor(random_state=42),
    'SVR':SVR(),
}

In [None]:
def get_rmse_score(model,x_train,x_test,y_train,y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Returns a ``(rmse, score)`` tuple: the root mean squared error of the
    predictions on ``x_test`` and the value of ``model.score(x_test, y_test)``.
    """
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    squared_error = mean_squared_error(y_test, model.predict(x_test))
    return sqrt(squared_error), score

In [None]:
# 10-fold cross-validation of every candidate model.
kf=KFold(n_splits=10)
rmse_results={} # model name -> list of per-fold RMSE values

for train_idx,test_idx in kf.split(X,Y):
    # The fold slices are the same for every model, so build them once per fold.
    x_train,x_test=X[train_idx],X[test_idx]
    # KFold yields positional indices, so select from the Series by position
    # (.iloc) instead of relying on its index labels matching the positions.
    y_train,y_test=Y.iloc[train_idx],Y.iloc[test_idx]

    for name,model in regressor_models.items():
        rmse,_=get_rmse_score(model,x_train,x_test,y_train,y_test)
        rmse_results.setdefault(name,[]).append(rmse)

In [None]:
# Average the per-fold RMSE values of each model.
for model_name, fold_scores in rmse_results.items():
    print(model_name,'RMSE: ',np.mean(fold_scores))

SVR seems to be performing the best of all so using it for making predictions

In [None]:
# Split the training data into train and hold-out parts (default split ratio)
# with a fixed seed so the split is reproducible.
x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=42)

In [None]:
# Fit the chosen model and predict revenue for the competition test set.
# NOTE(review): the model is fitted on the x_train subset only, and the
# x_test/y_test hold-out is not evaluated in the visible cells - confirm intent.
svr_model = SVR().fit(x_train, y_train)
Y_Test_predictions = svr_model.predict(X_Test)

In [None]:
# Pair every test Id with its predicted revenue, one [Id, prediction] row each.
predictions = [
    [test_indexes[row], Y_Test_predictions[row]]
    for row in range(len(Y_Test_predictions))
]

In [None]:
# Assemble the prediction rows into a two-column table and write it to CSV
# without the row index.
predictions_df = pd.DataFrame(
    data=predictions,
    columns=['Id','Prediction'],
)
predictions_df.to_csv('SVM_Predictions.csv', index=False)