In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# in the same order os.walk yields them.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Id : Restaurant id. 

Open Date : opening date for a restaurant

City : City that the restaurant is in. Note that the names contain Unicode characters.

City Group: Type of the city. Big cities, or Other. 

Type: Type of the restaurant. FC: Food Court, IL: Inline, DT: Drive Thru, MB: Mobile

P1, P2 - P37: There are three categories of these obfuscated data. Demographic data are gathered from third party providers with GIS systems. These include population in any given area, age and gender distribution, development scales. Real estate data mainly relate to the m2 of the location, front facade of the location, car park availability. Commercial data mainly include the existence of points of interest including schools, banks, other QSR operators.

Revenue: The revenue column indicates a (transformed) revenue of the restaurant in a given year and is the target of predictive analysis. Please note that the values are transformed so they don't mean real dollar values.

In [None]:
# Load the training data from the zipped CSV in the Kaggle input directory.
df = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip')
# Show (rows, columns) as the cell output.
df.shape

In [None]:
# Load the test data from the zipped CSV in the Kaggle input directory.
test_data = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip')
# Show (rows, columns) as the cell output.
test_data.shape

In [None]:
# Preview the first 10 rows of the training frame.
df.head(10)

In [None]:
# Summary statistics of the training frame (pandas describes numeric columns by default).
df.describe()

In [None]:
# Count missing values per column.
# The order of the calls matters: df.sum().isnull() only tests whether each
# column *total* is NaN, and sum() skips NaN cells, so it never reveals
# missing data. isnull() first, then sum(), gives the per-column count.
df.isnull().sum()

In [None]:
# Class balance of 'City Group' in the training data.
# plt.subplots returns a (figure, axes) pair, so unpack it instead of binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(19, 5))
# Pass the column as x=...: seaborn >= 0.12 interprets the first positional
# argument as `data`, so the positional form no longer plots category counts.
g1 = sns.countplot(x=df['City Group'], ax=ax)
g1.set_title('Train sample data');

In [None]:
# Class balance of 'City Group' in the test data.
# plt.subplots returns a (figure, axes) pair, so unpack it instead of binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(19, 5))
# Pass the column as x=...: seaborn >= 0.12 interprets the first positional
# argument as `data`, so the positional form no longer plots category counts.
g1 = sns.countplot(x=test_data['City Group'], ax=ax)
g1.set_title('Test sample data');

In [None]:
# Side-by-side counts of restaurant 'Type' in the training and test data.
fig, ax = plt.subplots(1, 2, figsize=(19, 5))
# Pass the columns as x=...: seaborn >= 0.12 interprets the first positional
# argument as `data`, so the positional form no longer plots category counts.
g1 = sns.countplot(x=df['Type'], ax=ax[0])
g1.set_title('Train data')
g2 = sns.countplot(x=test_data['Type'], ax=ax[1])
g2.set_title('Test data');

The training data contains no restaurants of type MB (the type appears only in the test data), so we replace MB with DT in test_data.

In [None]:
# Recode every 'MB' restaurant type in the test frame as 'DT'; all other
# values of the column are left unchanged.
test_data['Type'] = test_data['Type'].replace('MB', 'DT')

In [None]:
# Counts of restaurant 'Type' in the test data after the MB -> DT recoding.
# plt.subplots returns a (figure, axes) pair, so unpack it instead of binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(19, 5))
# Pass the column as x=...: seaborn >= 0.12 interprets the first positional
# argument as `data`, so the positional form no longer plots category counts.
g1 = sns.countplot(x=test_data['Type'], ax=ax)
g1.set_title('Test sample data');

Date and time processing example: https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96

In [None]:
import datetime
# Convert the 'Open Date' column to pandas datetimes in both frames so that
# date arithmetic can be applied to it.
for frame in (df, test_data):
    frame['Open Date'] = pd.to_datetime(frame['Open Date'])

In [None]:
# Preview the first 5 rows to check the converted 'Open Date' column.
df.head(5)

The City and Id columns do not carry useful information for the model, so we delete them (Id is kept in test_data because the submission file needs it).

In [None]:
# Remove the columns that are not used as model features.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError for columns that were already dropped.
# 'Id' is deliberately kept in test_data because the submission file needs it.
df = df.drop(columns=['Id', 'City'], errors='ignore')
test_data = test_data.drop(columns=['City'], errors='ignore')
df.head()

**How many days has each store been open?**

The competition started on 23.03.2015

Subtracting each store's opening date from the competition start date gives the number of days the store has been open (the code divides this by 1000 to scale the feature).

After that, the opening date is not needed, so we delete it.

In [None]:
# Reference date used to measure how long each restaurant has been open.
launch_date = datetime.datetime(2015, 3, 23)

# For both frames: derive 'Days Open' (elapsed days divided by 1000) and then
# discard the raw 'Open Date' column, which is no longer needed.
for frame in (df, test_data):
    elapsed = launch_date - frame['Open Date']
    frame['Days Open'] = elapsed.dt.days / 1000
    frame.drop('Open Date', axis=1, inplace=True)

In [None]:
# Preview the training frame with the new 'Days Open' column.
df.head()

In [None]:
# Correlation heatmap of the features, with the target and the two
# categorical columns left out.
heatmap_data = df.drop(columns=['revenue', 'City Group', 'Type'])
correlations = heatmap_data.corr()

# np.triu keeps the upper triangle (diagonal included); its non-zero entries
# act as the mask, so only the lower triangle is drawn.
matrix = np.triu(correlations)

plt.figure(figsize=(10, 10))
sns.heatmap(correlations, mask=matrix)
plt.suptitle('Pearson Correlation Heatmap')
plt.show();

In [None]:
# Check the distribution of the target column "revenue" with a 100-bin histogram.
df['revenue'].hist(bins = 100, figsize = [14,8])

**Processing categorical features**

In [None]:
# One-hot encode the object-dtype columns of the training frame and apply the
# same encoding to the test frame.
columnsToEncode = df.select_dtypes(include=[object]).columns
df = pd.get_dummies(df, columns=columnsToEncode, drop_first=False)
test_data = pd.get_dummies(test_data, columns=columnsToEncode, drop_first=False)

# get_dummies only creates columns for the categories present in each frame,
# so the two frames can end up with different dummy columns. Align the test
# frame to the training features (same columns, same order): dummies missing
# from test are filled with 0 and columns unseen in training are dropped.
# 'Id' is kept up front when present because the submission needs it.
feature_columns = [col for col in df.columns if col != 'revenue']
id_columns = [col for col in test_data.columns if col == 'Id']
test_data = test_data.reindex(columns=id_columns + feature_columns, fill_value=0)

In [None]:
# Preview the training frame after one-hot encoding.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the model is trained on
# log1p(revenue), and predictions are mapped back with expm1 when the
# submission is written.
# The transform is applied to a derived Series rather than written back into
# df: overwriting df['revenue'] made this cell non-idempotent, because every
# re-run applied another log1p to the already-transformed column.
X = df.drop(columns='revenue')
y = np.log1p(df['revenue'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares fitted on the training split.
clf = LinearRegression().fit(X_train, y_train)
# Report R^2 on both splits. The training score alone is optimistic and says
# nothing about generalization; the hold-out split exists for that purpose.
{'train_r2': clf.score(X_train, y_train), 'test_r2': clf.score(X_test, y_test)}

# RandomForestRegressor


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest:
# 7 * 4 * 3 * 3 * 4 = 1008 candidates, each evaluated with 10-fold CV.
params_grid = {
    'max_depth': [10, 30, 35, 50, 65, 75, 100],
    'max_features': [.3, .4, .5, .6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [30, 50, 100, 200]
}

# Seed the forest: without random_state every run builds different trees, so
# the best parameters and score are not reproducible across "Run All".
rf_regressor = RandomForestRegressor(random_state=42)
# Scoring is negative RMSE (higher is better); n_jobs=-1 uses all CPU cores.
rf_cv_regressor = GridSearchCV(rf_regressor, params_grid, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1)
rf_cv_regressor.fit(X_train, y_train)

print('Best params {}'.format(rf_cv_regressor.best_params_))
print('Best score {}'.format(rf_cv_regressor.best_score_))

In [None]:
# Final random forest with hard-coded hyper-parameters
# (presumably taken from an earlier grid-search run; verify against its output).
# random_state is fixed so that the fitted model, the reported RMSE and the
# submission are reproducible from run to run.
rf_regressor = RandomForestRegressor(
    max_depth=10,
    max_features=0.6,
    min_samples_leaf=3,
    min_samples_split=8,
    n_estimators=50,
    random_state=42,
)
rf_regressor.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the random forest on the training and hold-out splits, measured on
# the same scale as y_train / y_test.
y_train_pred = rf_regressor.predict(X_train)
y_pred = rf_regressor.predict(X_test)
# Use the documented (y_true, y_pred) argument order for both calls; the
# training call previously passed them reversed (harmless for MSE, but
# inconsistent and error-prone if the metric is ever swapped).
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Train RMSE: {train_rmse:.4f}')
print(f'Test RMSE: {test_rmse:.4f}')

In [None]:
# Predict on the test features (every column except 'Id'), undo the log1p
# transform with expm1, and write the Id/Prediction pairs to the submission file.
rf_pred = rf_regressor.predict(test_data.drop(columns='Id'))
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'Prediction': np.expm1(rf_pred),
})
submission.to_csv('submission_rf.csv', index=False)