In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Show every column and every row when a DataFrame is displayed.
# The fully qualified "display.*" keys are used on purpose: on pandas >= 1.4 the short
# names "max_columns"/"max_rows" also match the "styler.render.*" options, and
# set_option raises OptionError when a pattern matches more than one key.
# NOTE: with max_rows unlimited, displaying a whole large frame prints every row;
# prefer .head()/.sample() when inspecting the test set.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Extract the zipped competition CSVs into the current working directory.
# -o overwrites files that already exist, so re-running the cell never prompts.
!unzip -o /kaggle/input/restaurant-revenue-prediction/test.csv.zip
!unzip -o /kaggle/input/restaurant-revenue-prediction/train.csv.zip

In [None]:
# Load the extracted train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv('/kaggle/working/train.csv'),
    pd.read_csv('/kaggle/working/test.csv'),
)

In [None]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(df_train.shape, df_test.shape, sep='\n')

The training dataset is very small compared to the test dataset.

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Preview the first rows of the test data.
df_test.head()

## Exploratory Data Analysis

In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
train_summary = df_train.describe(include='all')
train_summary.transpose()

In [None]:
# Column names, non-null counts and dtypes of the training frame.
df_train.info()

According to the dataset description:
1. There are 42 independent variables and 1 dependent variable (revenue).
2. The 'Id' column is a serial number, so it is unlikely to have any significance for the target variable.
3. The 'Open Date', 'City', 'City Group' and 'Type' columns are of dtype object. All other columns are integer or float.
4. There are no null values in the train dataset, although we will explore this further.

In [None]:
# Report, for every column except the target, its dtype and how many distinct values it holds.
print('Total Number of Unique Values in Columns:')
non_target_columns = [column for column in df_train if column != 'revenue']
for column in non_target_columns:
    values = df_train[column]
    distinct_count = len(values.unique())
    print(column, ': (Datatype: ', values.dtype, ') : ', distinct_count, sep='')

In [None]:
# Split the columns by dtype: numeric columns (integer/float) are treated as "continuous",
# every remaining column (the object/string columns here) as "discrete".
# Both lists follow the DataFrame's own column order. The previous set difference gave
# an arbitrary order that could change between kernel sessions, which reshuffled every
# per-feature printout and plot built from continuous_features.
continuous_features = df_train.select_dtypes(include='number').columns.tolist()
discrete_features = [feature for feature in df_train.columns if feature not in continuous_features]

In [None]:
# Show the two feature-name lists, discrete first, one list per line.
print(discrete_features, continuous_features, sep='\n')

In [None]:
# Print the distinct values taken by each column listed in continuous_features.
for feature in continuous_features:
    distinct_values = df_train[feature].unique()
    print('{} has unique values {}'.format(feature, distinct_values), '\n')

Although the P1-P37 features are numeric, their values are discrete.

## Missing Values Exploration

In [None]:
# Per-column count of missing entries, first for the training frame, then for the test frame.
missing_in_train = df_train.isna().sum()
missing_in_test = df_test.isna().sum()

print("List of Missing Values in Train Dataset: ")
print(missing_in_train)
print("\n\nList of Missing Values in Test Dataset: ")
print(missing_in_test)

There are no missing values in the train or test dataset

## Numerical Feature Analysis

In [None]:
# Distribution of every column in continuous_features, one panel per feature,
# each titled with the feature's skewness.
# - The grid is sized from the number of features instead of a fixed 10x4 layout,
#   so the cell keeps working if there are more than 40 features.
# - sns.histplot(kde=True, stat='density') replaces the deprecated sns.distplot, and
#   `bw_method` replaces the deprecated `bw` KDE keyword (same 0.1 bandwidth factor).
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(continuous_features) / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(12, 18), squeeze=False)
panels = axes.ravel()

for ax, feature in zip(panels, continuous_features):
    sns.histplot(df_train[feature], kde=True, stat='density', kde_kws={'bw_method': 0.1}, ax=ax)
    ax.set_title('Skew : %.2f' % df_train[feature].skew())

# Hide the panels of the last row that have no feature to show.
for ax in panels[len(continuous_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

Distribution of the Target Variable (Revenue)

In [None]:
# Histogram plus KDE of the target on a density scale.
# sns.histplot replaces sns.distplot, which is deprecated and scheduled for removal.
fig, ax = plt.subplots()
sns.histplot(df_train['revenue'], kde=True, stat='density', ax=ax)
ax.set_title('Distribution of revenue (training set)');

Target Variable is positively skewed and there are some outliers present in the dataset. So we may have to transform the target variable when modelling.

## Discrete Feature Analysis

In [None]:
# Calculating days open for each restaurant

# Date against which each restaurant's age is measured (the date the analysis has always used).
REFERENCE_DATE = pd.Timestamp('2015-01-01')


def add_open_days(frame, reference_date=REFERENCE_DATE):
    """Replace the 'Open Date' column with an integer 'OpenDays' column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'Open Date' as month/day/year strings (or already-parsed datetimes).
    reference_date : pandas.Timestamp
        Date the opening date is subtracted from.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'OpenDays' (whole days between the opening date and
        ``reference_date``) and without 'Open Date'. A frame that was already
        converted is returned unchanged, so the cell can be re-run safely.

    Raises
    ------
    KeyError
        If the frame has neither 'Open Date' nor 'OpenDays'.
    """
    if 'Open Date' not in frame.columns and 'OpenDays' in frame.columns:
        return frame

    opened_on = pd.to_datetime(frame['Open Date'], format='%m/%d/%Y')
    # .dt.days yields whole days as integers; the former astype('timedelta64[D]')
    # conversion is rejected by pandas >= 2.0 (only s/ms/us/ns resolutions are supported).
    open_days = (reference_date - opened_on).dt.days
    return frame.assign(OpenDays=open_days).drop(columns='Open Date')


df_train = add_open_days(df_train)
df_test = add_open_days(df_test)

In [None]:
# Number of training restaurants per city group.
# The column is passed by keyword: a bare positional vector is deprecated in seaborn 0.11,
# and from 0.12 the first positional argument is interpreted as `data`, not `x`.
sns.countplot(x='City Group', data=df_train)

In [None]:
# Number of training restaurants per restaurant type.
# The column is passed by keyword: a bare positional vector is deprecated in seaborn 0.11,
# and from 0.12 the first positional argument is interpreted as `data`, not `x`.
sns.countplot(x='Type', data=df_train)

In [None]:
# Number of training restaurants per city.
# The column is passed by keyword: a bare positional vector is deprecated in seaborn 0.11,
# and from 0.12 the first positional argument is interpreted as `data`, not `x`.
ax = sns.countplot(x='City', data=df_train)
# There is one tick per distinct city; rotate the labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)

In [None]:
# How many training restaurants each city has, most frequent city first.
city_column = df_train['City']
city_column.value_counts()

The City column has too many distinct values to transform with One Hot Encoding, because the feature list would become very large. So this feature needs to be removed.

In [None]:
# Remove the 'City' column from both the training and the test frame.
df_train = df_train.drop(columns='City')
df_test = df_test.drop(columns='City')

In [None]:
# First ten rows of the training frame in its current state.
df_train.head(10)

In [None]:
# First ten rows of the test frame in its current state.
df_test.head(10)

## Feature Engineering

In [None]:
# Stack train and test rows so that both receive identical feature engineering.
# Test rows have no 'revenue' value, so that column is NaN for them after the concat.
# ignore_index=True gives the combined frame a fresh, unique index. Without it the
# train and test labels both start at 0 and every label up to len(df_train) - 1 appears
# twice, which breaks any later label-based alignment or reindexing.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# One-hot encode the categorical columns of the combined train+test frame.
# drop_first=True leaves out the first level of each column; the indicator columns are
# appended to the right of the existing columns, in the order given by ohe_cols.
ohe_cols = ['City Group', 'Type']
indicator_frames = [
    pd.get_dummies(df[name], prefix=name, drop_first=True)
    for name in ohe_cols
]
df = pd.concat([df] + indicator_frames, axis=1)

In [None]:
# Remove the two original categorical columns from the combined frame.
df = df.drop(columns=['City Group', 'Type'])

In [None]:
# Separate the combined frame back into train and test rows.
# A row belongs to the training set exactly when it has a 'revenue' value. The split is
# driven by that column alone, so a missing value in some other feature can no longer
# silently remove a training row (the former dropna(axis=0) dropped on any NaN).
has_revenue = df['revenue'].notna()
df_train_pp = df[has_revenue].copy()
df_test_pp = df[~has_revenue].drop(columns='revenue')

In [None]:
# (rows, columns) of the preprocessed training frame, then of the preprocessed test frame.
print(df_train_pp.shape, df_test_pp.shape, sep='\n')

In [None]:
# Preview the first rows of the preprocessed training frame.
df_train_pp.head()

In [None]:
# Preview the first rows of the preprocessed test frame.
df_test_pp.head()

## Correlation Analysis

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of the
# preprocessed training frame (corr_mat is reused by the cells below).
corr_mat = df_train_pp.corr()
axis_labels = corr_mat.columns
fig, ax = plt.subplots(figsize=(50, 50))
sns.heatmap(
    corr_mat,
    annot=True,
    cmap='RdYlGn',
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    ax=ax,
)

In [None]:
# Full flattened correlation matrix (every ordered pair, self-pairs included).
s = corr_mat.unstack()

# One entry per distinct feature pair, sorted ascending by correlation.
# Only the strict upper triangle of the matrix is used, so each pair appears once and
# the self-correlations (always 1.0) are left out. The former value-based
# drop_duplicates() also merged different pairs that happened to share the same
# correlation value and kept one self-correlation in the "highly correlated" list.
row_pos, col_pos = np.triu_indices(len(corr_mat), k=1)
pair_index = pd.MultiIndex.from_arrays([corr_mat.index[row_pos], corr_mat.columns[col_pos]])
so = (
    pd.Series(corr_mat.values[row_pos, col_pos], index=pair_index)
    .dropna()  # undefined correlations (e.g. a constant column) carry no information
    .sort_values(kind="quicksort")
)

# 0.1 to 0.3: slightly correlated (positive correlations only)
res1 = so[(so >= 0.1) & (so < 0.3)]
print(res1)

In [None]:
# 0.4 (inclusive) to 0.6 (exclusive): moderately correlated feature pairs.
# NOTE(review): the "slightly correlated" band stops at 0.3, so pairs with a correlation
# in [0.3, 0.4) are not listed by any band - confirm whether this lower bound should be 0.3.
in_moderate_band = (so >= 0.4) & (so < 0.6)
res2 = so[in_moderate_band]
print(res2)

In [None]:
# 0.6 and above: highly correlated feature pairs.
in_high_band = so.ge(0.6)
res3 = so[in_high_band]
print(res3)

In [None]:
# Correlation of every column with the target "revenue", strongest positive first.
revenue_corr = corr_mat['revenue']
revenue_corr.sort_values(ascending=False)

## Modelling

In [None]:
# Feature matrix X: the hand-picked columns the author selected from the correlation
# analysis against the target "revenue".
related_cols = [
    'OpenDays', 'P2', 'P28', 'P6', 'Type_FC',
    'P21', 'City Group_Other', 'P29', 'P13', 'Type_IL',
]
X = df_train_pp.loc[:, related_cols]

# Target Y: natural log of revenue, used because the raw target is skewed.
# Predictions made from a model fitted on Y must be mapped back with np.exp.
Y = np.log(df_train_pp['revenue'])

In [None]:
# Random Forest Regressor

# Fixed seed: without random_state every run grows a different forest and therefore
# writes a different submission file.
RF_SEED = 42

model = RandomForestRegressor(n_estimators=150, random_state=RF_SEED)
model.fit(X, Y)

# The model is fitted on log(revenue), so predictions are mapped back with exp().
test_predicted = pd.DataFrame()
test_predicted['Id'] = df_test_pp['Id']
test_predicted['Prediction'] = np.exp(model.predict(df_test_pp[related_cols]))
test_predicted.to_csv('submission-rf-regressor.csv', index=False)

# Summary of the predicted revenues as a quick sanity check.
test_predicted.describe()

In [None]:
# XGBoost regressor with library-default hyperparameters, fitted on the same X / Y.
model = xgb.XGBRegressor()
model.fit(X, Y)

# Y is log(revenue), so the model output is exponentiated to get revenue back.
predicted_revenue = np.exp(model.predict(df_test_pp[related_cols]))
test_predicted = pd.DataFrame({
    'Id': df_test_pp.Id,
    'Prediction': predicted_revenue,
})
test_predicted.to_csv('submission-xgb-regressor.csv', index=False)

# Summary of the predicted revenues as a quick sanity check.
test_predicted.describe()