In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import matplotlib.pyplot as pylab
import matplotlib.pyplot as plt

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV)
from sklearn.metrics import (mean_squared_error, mean_absolute_error)
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier # Used for imputing rare / missing values

# Regressors considered:
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge # only model used for final submission

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list the available input files, one full path per line.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [None]:


# Divisor applied to test-set value counts further down to approximate the
# counts in the "true" (non-synthetic) part of the test set.
FAKE_DATA_RATIO = 112141
# Random seed shared by the stochastic models below.
SEED = 777

# Read the Kaggle-provided data. The first CSV column becomes the index and
# the second column is parsed as a date.
csv_options = dict(index_col=0, parse_dates=[1])
train = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip', **csv_options)
test = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip', **csv_options)
print("Train Dimensions:", train.shape, sep="\n")
print("Test Dimensions:", test.shape, sep="\n")

# Stack test on top of train so both are pre-processed and featurized the same way.
# ignore_index=True renumbers the rows from 0, so the test rows come first in the new index.
df = pd.concat([test, train], ignore_index=True)
df.describe()



In [None]:
# Turn "Open Date" into the number of days each restaurant has been open,
# counted back from the most recent opening date in the combined data.
# pd.to_datetime is applied to the whole column at once rather than one value
# at a time through .apply, which is much faster on the large combined frame
# and leaves an already-parsed datetime column unchanged.
open_dates = pd.to_datetime(df["Open Date"])
last_date = open_dates.max()
# The subtraction yields timedeltas; .dt.days converts them to ints. The +1
# makes the newest restaurant 1 day old so that the log below is defined.
days_opened = (last_date - open_dates).dt.days + 1

# Scale "days since opened" so that the marginal impact decreases over time
# This and the similar log transform of City Count below are the modifications that
# were not in our official competition submission
df["Log Days Opened"] = np.log(days_opened)
df = df.drop(columns=["Open Date"])

pylab.rcParams['figure.figsize'] = (8, 6) # Resizes plots
df[["Log Days Opened", "revenue"]].plot(x="Log Days Opened", y="revenue", kind='scatter', title="Log (Days Opened) vs Revenue")

In [None]:
# Columns whose zero entries are counted per row to build the 'zeros' feature.
zero_cols = ['P14', 'P15', 'P16', 'P17', 'P18', 'P24', 'P25', 'P26', 'P27', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37']

# New feature: how many of the columns listed above are exactly zero in each row.
df['zeros'] = df[zero_cols].eq(0).sum(axis=1)

pylab.rcParams['figure.figsize'] = (20, 8)
fig, axs = plt.subplots(1, 2)

print("Distribution of new Zeros features:")
# Train rows are the ones that have a revenue value; test rows have a null revenue.
is_train_row = df['revenue'].notnull()

# Author's observation: only 1 train row has a zero count strictly between 0 and 17.
df.loc[is_train_row, 'zeros'].value_counts().plot(title="Train Set", kind='bar', ax=axs[0])

# Author's observation: the test set has many rows with an intermediate count of zeros,
# probably an artifact of how the fake test data was generated (conditional
# dependence between columns was not preserved).
df.loc[~is_train_row, 'zeros'].value_counts().plot(title="Test Set", kind='bar', ax=axs[1], color='red')
plt.show()

In [None]:


pylab.rcParams['figure.figsize'] = (6, 4) # Resizes plots

# Both City Group categories appear very frequently in the train set.
train["City Group"].value_counts().plot.bar(title="City Group Distribution in the Train Set")
plt.show()

# Two of the four Restaurant Types (DT and FC) are extremely rare in the train set.
train["Type"].value_counts().plot.bar(title="Restaurant Type Distribution in the Train Set")
plt.show()

# Dividing the test counts by FAKE_DATA_RATIO approximates the counts in the true test set.
approx_true_type_counts = test["Type"].value_counts() / FAKE_DATA_RATIO
approx_true_type_counts.plot.bar(title="Approximate Restaurant Type Distribution in True Test Set", color='Red')
plt.show()

# One-hot encode both categorical columns and append the indicator columns to df.
city_group_dummies = pd.get_dummies(df['City Group'], prefix="CG")
type_dummies = pd.get_dummies(df['Type'], prefix="T")
df = df.join(city_group_dummies).join(type_dummies)

# Only n-1 indicator columns are needed for n categories, so one per variable is dropped
# (CG_Other, T_MB), together with the original columns and the extremely rare
# T_DT type, which is handled specially by the imputation further down.
df = df.drop(columns=["City Group", "Type", "CG_Other", "T_MB", "T_DT"])
print(df.shape)



In [None]:
# Replace city names with
# count of their frequency in the train + estimated frequency in the test set.
city_counts = (test["City"].value_counts() / FAKE_DATA_RATIO).add(train["City"].value_counts(), fill_value=0)
# .map does one vectorised lookup per row and returns a float column.
# (Series.replace with a Series performs a slow multi-key replacement and may
# leave an object-dtype column, which np.log cannot handle.)
# Every city in df comes from train or test, so each one has an entry in city_counts.
df["City"] = df["City"].map(city_counts)
print("Some example estimated counts of restaurants per city:")
print(city_counts.head())

# Take log of city count so that the marginal effect decreases
df["Log City Count"] = np.log(df["City"])
df = df.drop(columns=["City"])

# That last vertical spread of points are restaurants from Istanbul.
pylab.rcParams['figure.figsize'] = (8, 6) # Resizes plots
df[["Log City Count", "revenue"]].plot(x="Log City Count", y="revenue", kind='scatter', title="Log City Count vs Revenue")

In [None]:
# Impute values for the very rare restaurant types.
# Instead of trying to predict with values that appear only 1 or 0 times in the train set,
# we will replace them with one of the other commonly appearing categories by fitting a
# model that predicts which common category they "should" be.

# tofit are the rows in the train set that belong to one of the common restaurant types
tofit = df.loc[((df.T_FC==1) | (df.T_IL==1)) & (pd.notnull(df.revenue))]
# tofill are rows in either train or test that belong to one of the rare types
tofill = df.loc[((df.T_FC==0) & (df.T_IL==0))]

# The shapes are passed as arguments to print(); the Python 2 style
# `print('...'), value` prints only the label under Python 3.
print('type training set shape:', tofit.shape)
print('data to impute:', tofill.shape)

# Restaurants with type FC are labeled 1, those with type IL are labeled 0.
y = tofit.T_FC
# Drop the label columns and revenue (which is not in the test set, so can't be used here)
X = tofit.drop(["T_FC", "T_IL", "revenue"], axis=1)

In [None]:
# Define and train a model to impute restaurant type
# The grid below just has a range of values that I've found commonly
# work well with random forest type models (of which ExtraTrees is one).
model_grid = {'max_depth': [None, 8], 'min_samples_split': [4,9,16], 'min_samples_leaf':[1,4], 'max_features':['sqrt', 0.5, None]}
type_model = ExtraTreesClassifier(n_estimators=25, random_state=SEED)

# random_state fixes which 10 parameter combinations are sampled; without it
# the chosen hyper-parameters (and therefore the imputations) can change
# between runs even though the classifier itself is seeded.
grid = RandomizedSearchCV(type_model, model_grid, n_iter=10, cv=5, scoring="roc_auc", random_state=SEED)
grid.fit(X, y)

print("Best parameters for Type Model:")
print(grid.best_params_)

# Re-fit the classifier on all of the labelled rows using the best parameters found.
type_model.set_params(**grid.best_params_)
type_model.fit(X, y)

# Predict a common type for every rare-type row and write it back into T_FC.
# The mask below selects the same rows, in the same order, as `tofill`.
imputations = type_model.predict(tofill.drop(["T_FC", "T_IL", "revenue"], axis=1))
df.loc[(df.T_FC==0) & (df.T_IL==0), "T_FC"] = imputations
# Every row is now either FC (T_FC == 1) or treated as IL (T_FC == 0), so T_IL is redundant.
df = df.drop(["T_IL"], axis=1)

# The values are passed as arguments to print(); the Python 2 style
# `print('...'), value` prints only the label under Python 3.
# NOTE(review): df.T_FC.mean() is taken over all rows of df (train and test),
# although the label says "training set" - confirm which was intended.
print("% labeled FC in the training set:", df.T_FC.mean())
print("% of imputed values labeled FC:", np.mean(imputations))

In [None]:


# Now binarize the "P" columns with dummy variables
# The counts are passed as arguments to print(); the Python 2 style
# `print('...'), value` prints only the label under Python 3.
print("Pre-binarizing columns:", len(df.columns))
# The loop walks the column index as it was before the loop started: df is
# rebound to a new frame on every pass, so the freshly added dummy columns
# are never visited themselves.
for col in df.columns:
    if col[0] == 'P':
        print(col, len(df[col].unique()), "unique values")
        df = df.join(pd.get_dummies(df[col], prefix=col))
        # Drop the original column plus the last dummy just appended
        # (only n-1 indicator columns are needed for n distinct values).
        df = df.drop([col, df.columns[-1]], axis=1)
print("Post-binarizing columns:", len(df.columns))



In [None]:
# Rescale every input feature to the [0, 1] range; this matters for KNN and SVR models.
min_max_scaler = MinMaxScaler()

# Set the target aside so that it is not scaled along with the inputs.
rev = df.revenue
features = df.drop(columns=['revenue'])

# Fit the scaler on all rows, rebuild a frame with the same labels, then re-attach revenue.
scaled_values = min_max_scaler.fit_transform(features)
df = pd.DataFrame(scaled_values, columns=features.columns, index=features.index).join(rev)

# Pre-processing is finished; take a last look at the data before modeling.
df.describe()

In [None]:
# Split the combined frame back into train and test rows:
# revenue is present for train rows and null for test rows.
has_revenue = df['revenue'].notnull()
train = df.loc[has_revenue]
test = df.loc[~has_revenue].drop(columns=['revenue'])

# Model sqrt(revenue) instead of revenue so that the few very large
# revenue values have less influence on the fit.
y = np.sqrt(train['revenue'])
X = train.drop(columns=['revenue'])

In [None]:
# Now define and train a Ridge Regression model. We tested others from the sklearn package:
# SVR, RandomForest, K-nearest Neighbors, but found Ridge consistently gave the strongest
# leaderboard results. When training data is small, simplest is often best.
model = Ridge()

# Always search over the regularization strength. The `normalize` option was
# removed from Ridge in newer scikit-learn releases, where passing it makes the
# grid search fail, so it is only searched when the installed Ridge still
# exposes it. The inputs were already min-max scaled earlier in the notebook.
ridge_params = {'alpha': np.logspace(0,10)}
if 'normalize' in model.get_params():
    ridge_params['normalize'] = [True, False]
model_grid = [ridge_params]

# Use a grid search and leave-one-out CV on the train set to find the best regularization parameter to use.
# (might take a minute or two)
grid = GridSearchCV(model, model_grid, cv=LeaveOneOut(), scoring='neg_mean_squared_error')
grid.fit(X, y)
print("Best parameters set found on development set:")
print(grid.best_params_)

# Re-train on full training set using the best parameters found in the last step.
model.set_params(**grid.best_params_)
model.fit(X, y)

In [None]:
# The model was trained on sqrt(revenue), so square its test-set predictions
# to get back to the revenue scale.
predicted_revenue = np.square(model.predict(test))
submission = pd.DataFrame({'Prediction': predicted_revenue}, index=test.index)
# Kaggle's submission parser requires the index column to be called 'Id'.
submission.index.name = 'Id'
# Write the submission file.
submission.to_csv("TFI_Ridge.csv")
# Quick sanity check on the submission.
submission.describe().astype(int)

In [None]:


# Summary statistics of the train-set revenue, for comparison with the predictions.
train_revenue_summary = train[['revenue']].describe()
train_revenue_summary.astype(int)



In [None]:


# One more comparison: density plots of the train revenue and of the predictions.
# Note the x-axis scale change: the predictions are more conservative and sit
# closer to the mean than the real revenues. This is pretty standard behavior
# when using RMSE - being very wrong is penalized heavily, so the model tends
# towards more moderate predictions.
train[['revenue']].plot.kde(title="Train Revenue Distribution")
# Rename the prediction column so the plot legend reads "predicted revenue".
submission.columns = ["predicted revenue"]
submission.plot.kde(title="Prediction Revenue Distribution", color='r')
plt.show()



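Note: the only change made here was the fake data ratio value, from 311.5 to 112141. The numbers in ⑬ (⑰) changed slightly; the result did not change. Try adjusting ⑧ next?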