In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import geohash as gs
import time
import pprint
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [2]:
# Read every CSV in the input directory into memory exactly once, printing a
# short preview (head + shape) of each table as it is loaded.
INPUT_DIR = "../input"

# Maps the file name without its extension (e.g. 'air_reserve') to its DataFrame.
all_df = {}
for file in os.listdir(INPUT_DIR):
    table_name, extension = os.path.splitext(file)
    if extension != '.csv':
        # Only CSV files are tables; skip anything else that may sit in the directory.
        continue
    print('Name:', file)
    frame = pd.read_csv(os.path.join(INPUT_DIR, file))
    all_df[table_name] = frame
    print(frame.head())
    print(frame.shape)
    print('#'*10)

# Bind the tables used by the rest of the notebook to explicit names.
# A missing input file surfaces here as a KeyError naming the table.
air_store_info = all_df['air_store_info']
air_reserve = all_df['air_reserve']
air_visit_data = all_df['air_visit_data']
store_id_relation = all_df['store_id_relation']
hpg_reserve = all_df['hpg_reserve']
hpg_store_info = all_df['hpg_store_info']
sample_submission = all_df['sample_submission']
date_info = all_df['date_info']


## Combining dataframes
### Not all hpg ids have air ids
Because of the inner join we are losing a lot of data. We have to check later whether it is worth removing these data points.

In [3]:
# hpg side first: attach the hpg -> air id mapping and the hpg store metadata
# to every hpg reservation. Both joins are inner joins on 'hpg_store_id'.
print('Shape before merge:', hpg_reserve.shape, store_id_relation.shape, hpg_store_info.shape)
hpg_with_air_id = hpg_reserve.merge(store_id_relation, how='inner', on='hpg_store_id')
hpg_com_reserve = hpg_with_air_id.merge(hpg_store_info, how='inner', on='hpg_store_id')
print('Shape after merge:', hpg_com_reserve.shape)
# Renumber the rows 0..n-1.
hpg_com_reserve = hpg_com_reserve.reset_index(drop=True)
hpg_com_reserve.head()


In [4]:
# air side second: attach the air store metadata to every air reservation
# (inner join on 'air_store_id').
print('Shape before merge:', air_reserve.shape, air_store_info.shape)
air_com_reserve = air_reserve.merge(air_store_info, how='inner', on='air_store_id')
print('Shape after merge:', air_com_reserve.shape)
# Renumber the rows 0..n-1.
air_com_reserve = air_com_reserve.reset_index(drop=True)
air_com_reserve.head()

### Dealing with geographical coordinates
I want to use the geographical coordinates as a feature, but how?
First, I visualized the coordinates using the scatter plot below.  
1. We can cluster the coordinates (e.g. with k-means) and use the cluster ids as a feature.
2. We can use a geohash, which encodes a latitude/longitude pair as a single value,
e.g. `gs.encode(35.658068, 139.751599)` returns `'xn76u5k5239h'`.


In [5]:
# Scatter the store coordinates of both reservation sources on a single set of axes.
fig, ax = plt.subplots()
g1 = ax.scatter(air_com_reserve.longitude, air_com_reserve.latitude, label='air_coor')
g2 = ax.scatter(hpg_com_reserve.longitude, hpg_com_reserve.latitude,
                c='r', marker='+', label='hpg_coor')
ax.legend()
ax.grid()
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('This looks like japan :D')
plt.show()

In [None]:
# converting latitude and longitude to geohash
# I am not using area name because not all of the entries have them
# NOTE: this cell is disabled -- everything below is wrapped in a single
# triple-quoted string literal, so neither loop runs and no geohash is stored.
# The geohash column is instead added to air_hpg_com_reduce_df in a later cell.
'''
geohash = []
for ind, row in air_com_reserve.iterrows():
    #air_com_reserve.loc[ind, 'geohash'] = gs.encode(row['latitude'], row['longitude'])
    geohash.append(gs.encode(row['latitude'], row['longitude']))
    #air_com_reserve = pd.concat([air_com_reserve, pd.Series({'geohash':geohash})], axis=1)
    
print(air_com_reserve.head())

geohash = []
for ind1, row1 in hpg_com_reserve.iterrows():
    #hpg_com_reserve.loc[ind1, 'geohash'] = gs.encode(row1['latitude'], row1['longitude'])
    geohash.append(gs.encode(row1['latitude'], row1['longitude']))
    #hpg_com_reserve = pd.concat([hpg_com_reserve, pd.Series({'geohash':geohash})], axis=1)
    
print(hpg_com_reserve.head())
'''

### Convert date time columns to python datetime

In [6]:
# Not all air ids in air_com_reserve exist in hpg_com_reserve (there are also some new restaurants),
# so give both frames the same column names before they are combined.
hpg_com_reserve = (
    hpg_com_reserve
    .drop(columns=['hpg_store_id'])
    .rename(columns={'hpg_genre_name': 'genre_name', 'hpg_area_name': 'area_name'})
)
air_com_reserve = air_com_reserve.rename(
    columns={'air_genre_name': 'genre_name', 'air_area_name': 'area_name'}
)

In [7]:
# Stack the air and hpg reservations into one frame with a fresh 0..n-1 index.
reserve_frames = [air_com_reserve, hpg_com_reserve]
air_hpg_com_reserve = pd.concat(reserve_frames, ignore_index=True)
print(air_hpg_com_reserve.shape)

In [8]:
# DF used: air_hpg_com_reserve, air_visit_data
# Derive calendar features from the visit timestamp and parse the reservation timestamp.
visit_ts = pd.to_datetime(air_hpg_com_reserve['visit_datetime'])
reserve_ts = pd.to_datetime(air_hpg_com_reserve['reserve_datetime'])

# Notes kept from earlier experiments:
#  - .dt.week is not used: what we need is the week of the month, not the week of the year.
#  - a "reserved N days in advance" feature (visit_date - reservation date) was tried
#    and dropped because it did not make sense to the author.
air_hpg_com_reserve = (
    air_hpg_com_reserve
    .assign(
        visit_dow=visit_ts.dt.dayofweek,
        visit_date=visit_ts.dt.date,
        visit_mon=visit_ts.dt.month,
        reserve_datetime=reserve_ts,
    )
    # The raw timestamp and the area name are not used downstream.
    .drop(columns=['area_name', 'visit_datetime'])
    .reset_index(drop=True)
)
print(air_hpg_com_reserve.head())
print(air_hpg_com_reserve.shape)

In [9]:
# Collapse the reservation log to one row per (store, visit day):
#   reserve_visitors -> total reserved head-count for that store and day
#   reserve_calls    -> number of reservation rows for that store and day
#   every column in n_cols -> value taken from the group's first reservation row
# Done with a vectorised groupby + merge instead of a Python loop over every group.
group_keys = ['air_store_id', 'visit_date']
n_cols = ['air_store_id', 'genre_name', 'latitude', 'longitude', 'reserve_datetime', 'visit_dow', 'visit_date', 'visit_mon']

# Per-group totals; groupby sorts by the keys, so rows come out ordered by (store, day).
reserve_totals = (
    air_hpg_com_reserve
    .groupby(group_keys)['reserve_visitors']
    .agg(['sum', 'size'])
    .rename(columns={'sum': 'reserve_visitors', 'size': 'reserve_calls'})
    .reset_index()
)

# The first occurrence of each key pair in the frame is also the first row of its group.
first_rows = air_hpg_com_reserve.drop_duplicates(subset=group_keys, keep='first')[n_cols]

# A left merge keeps the sorted key order of reserve_totals; first_rows has one row per key pair.
air_hpg_com_reduce_df = reserve_totals.merge(first_rows, on=group_keys, how='left')

# Restore the column order the loop-based version produced, so downstream feature order is unchanged.
air_hpg_com_reduce_df = air_hpg_com_reduce_df[
    ['air_store_id', 'reserve_visitors', 'reserve_calls'] + n_cols[1:]
]
print(air_hpg_com_reduce_df.head())
print(air_hpg_com_reduce_df.shape)

In [10]:
# Reduce air_visit_data['visit_date'] to plain date objects so that it is
# comparable with the 'visit_date' key of the reservation frame.
air_visit_data['visit_date'] = pd.to_datetime(air_visit_data['visit_date']).dt.date

# Inner join: keep only (store, day) pairs present in both the reduced
# reservation frame and air_visit_data.
visit_keys = ['air_store_id', 'visit_date']
air_hpg_com_reduce_df = air_hpg_com_reduce_df.merge(air_visit_data, how='inner', on=visit_keys)
print(air_hpg_com_reduce_df.head())
print(air_hpg_com_reduce_df.shape)

In [11]:
# Add the holiday information.
# date_info's 'calendar_date' is renamed to 'visit_date' and reduced to plain
# date objects so it can serve as the join key.
date_info = date_info.rename(columns={'calendar_date': 'visit_date'})
date_info['visit_date'] = pd.to_datetime(date_info['visit_date']).dt.date

with_holidays = air_hpg_com_reduce_df.merge(date_info, how='inner', on='visit_date')
# 'day_of_week' and 'reserve_datetime' are not kept as features.
air_hpg_com_reduce_df = with_holidays.drop(columns=['day_of_week', 'reserve_datetime'])
print(air_hpg_com_reduce_df.head())
print(air_hpg_com_reduce_df.shape)

In [12]:
# Some stores have more than one genre associated with them, so build one combined
# label per store: its distinct genre names (sorted by np.unique) joined with '_'.
hotels_genre = {
    store_id: '_'.join(list(np.unique(store_rows.genre_name.values)))
    for store_id, store_rows in air_hpg_com_reserve.groupby('air_store_id')
}
pprint.pprint(hotels_genre)

# Give every distinct combined label an integer id (its position in sorted order).
distinct_labels = np.unique(list(hotels_genre.values()))
hotels_genre_ids = {label: position for position, label in enumerate(distinct_labels)}
pprint.pprint(hotels_genre_ids)

In [29]:
def get_genre_id2name(ID, genre_ids=None):
    """Return the combined genre name that was assigned the integer id ``ID``.

    Parameters
    ----------
    ID : int
        Genre id to look up.
    genre_ids : dict, optional
        Mapping of genre name -> id to search. When omitted, the notebook-level
        ``hotels_genre_ids`` dictionary is used.

    Returns
    -------
    str
        The first name in ``genre_ids`` whose id equals ``ID``.

    Raises
    ------
    IndexError
        If no genre has the requested id. The exception type is the one the
        previous list-indexing implementation raised; it now carries a message
        naming the missing id.
    """
    if genre_ids is None:
        genre_ids = hotels_genre_ids
    # Stop at the first match instead of collecting every match and indexing [0].
    for name, genre_id in genre_ids.items():
        if genre_id == ID:
            return name
    raise IndexError('no genre with id {!r}'.format(ID))

#get_genre_id2name(32)

We are engineering two features here:
- Geohash: a hash of the geographical coordinates, so instead of separate latitude and longitude values we have a single geohash.
- genre-id: some restaurants have more than one genre, so we combine their genres into one label and give each distinct combination a new id.

In [15]:
# Add the two engineered columns in one pass each, instead of writing cell by cell
# with .loc inside an iterrows() loop (slow, and it stored strings in a column
# that had been initialised with the integer 0).
#   geohash  -> single string encoding of the (latitude, longitude) pair
#   genre_id -> integer id of the store's combined genre label
air_hpg_com_reduce_df['geohash'] = [
    gs.encode(lat, lon)
    for lat, lon in zip(air_hpg_com_reduce_df['latitude'], air_hpg_com_reduce_df['longitude'])
]
# Plain dict lookups, so a store missing from hotels_genre still raises KeyError as before.
air_hpg_com_reduce_df['genre_id'] = [
    hotels_genre_ids[hotels_genre[store_id]]
    for store_id in air_hpg_com_reduce_df['air_store_id']
]

# The raw columns are now represented by the engineered ones.
air_hpg_com_reduce_df = air_hpg_com_reduce_df.drop(['genre_name', 'latitude', 'longitude'], axis=1)
print(air_hpg_com_reduce_df.head())
print(air_hpg_com_reduce_df.shape)

**In the discussion forum I have seen people talking about a holiday hack (re-mapping the day of week depending on holidays): https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting/discussion/49100.  We will try it later to check its impact.**

#### Plot to visualize any pattern between features and visitor counts.

In [19]:
# Scatter each engineered feature against log2(visitors), one panel per feature.
fig, ax = plt.subplots(3, 2, figsize=(18, 12))

# The y-values are identical in every panel, so compute them once.
log_visitors = np.log2(air_hpg_com_reduce_df.visitors)

# (panel title, x-values); the two reservation counts are shown on a log2 scale.
panels = [
    ('geohash vs visitor', air_hpg_com_reduce_df.geohash),
    ('reserve_calls vs visitor', np.log2(air_hpg_com_reduce_df.reserve_calls)),
    ('reserve_visitors vs visitor', np.log2(air_hpg_com_reduce_df.reserve_visitors)),
    ('visit_month vs visitor', air_hpg_com_reduce_df.visit_mon),
    ('genre vs visitor', air_hpg_com_reduce_df.genre_id),
]

# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1], [2,0], [2,1].
for axis, (title, x_values) in zip(ax.flat, panels):
    axis.scatter(x_values, log_visitors)
    axis.set_title(title)
    axis.set_ylabel('log2(visitors)')

# Five features on a 3x2 grid: hide the unused last panel instead of leaving empty axes.
for axis in ax.flat[len(panels):]:
    axis.set_visible(False)

plt.show()

In [58]:
# For one genre, compare the visitor counts per day of week on holidays vs non-holidays.
genre_id = 37
is_selected_genre = air_hpg_com_reduce_df['genre_id'] == genre_id
test = air_hpg_com_reduce_df.loc[is_selected_genre]
print('Genre name:', get_genre_id2name(genre_id))
print(test.shape)
print(test.head(5))

sns.set(style="ticks")
# One box per day of week, split by the holiday flag.
sns.boxplot(data=test, x="visit_dow", y="visitors", hue="holiday_flg", palette="PRGn")

So, according to the tip from the Recruit Restaurant Visitor Forecasting holiday-hack discussion, we should treat a holiday as a Saturday; if the day before the holiday is a weekday, treat that day as a Friday; and if the day after the holiday is a weekday, treat it as a Monday. I will try this later, because from the plots it does not look correct.  

In [63]:
## Data preparation for modelling starts here
### Step one: turn the categorical columns into indicator (dummy) columns
# NOTE(review): no `columns=` list is passed, so pandas picks the columns to encode
# by dtype. 'visit_date' holds datetime.date objects and 'genre_id' holds integers,
# so presumably the former is expanded into dummies and the latter is not --
# confirm this is the intended encoding.
features = pd.get_dummies(data=air_hpg_com_reduce_df)

# Show the first 5 rows of the encoded frame
features.head(n=5)

In [64]:
## Split the encoded frame into a target vector and a feature matrix
# NOTE: `features` is rebound from a DataFrame to a numpy array here, so this
# cell cannot be re-run without first re-running the cell that builds `features`.
target_column = 'visitors'

# The value we want to predict
labels = np.array(features[target_column])

# Everything except the target is a model input
features = features.drop(columns=[target_column])

# Remember the column names; the numpy array below no longer carries them
feature_list = features.columns.tolist()

# Plain numpy matrix for scikit-learn
features = np.array(features)

In [71]:
# Creating the training and test sets
# scikit-learn's train_test_split shuffles and splits both arrays consistently
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
split_arrays = train_test_split(features, labels, test_size=0.25, random_state=42)
train_features, test_features, train_labels, test_labels = split_arrays

In [72]:
# Report the shape of every array produced by the split.
split_report = [
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
]
for caption, split_array in split_report:
    print(caption, split_array.shape)


In [73]:
# https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
# Model used for the baseline
from sklearn.ensemble import RandomForestRegressor

# 100 shallow trees (depth 3); the fixed seed makes the fit repeatable
forest_params = dict(n_estimators=100, max_depth=3, random_state=42)
rf = RandomForestRegressor(**forest_params)

# Fit on the training split (trailing ';' hides the estimator repr)
rf.fit(train_features, train_labels);

In [74]:
# Predict the visitor counts of the held-out test rows
predictions = rf.predict(test_features)

# Element-wise absolute error between prediction and truth
errors = np.abs(predictions - test_labels)

# Report the mean absolute error (MAE). The target is a visitor count, so the
# unit is visitors -- the previous 'degrees.' label was left over from a
# temperature-forecasting tutorial.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'visitors.')

In [84]:
# Put the test features back into a labelled frame ...
test_df = pd.DataFrame(data=test_features, columns=feature_list)
print(test_df.head())
# ... and pair every prediction with its true value.
test_values = pd.DataFrame(dict(Prediction=predictions, Truth=test_labels))
print(test_values.head())

In [86]:
# Place features, prediction and truth side by side (aligned on the row index).
result_parts = [test_df, test_values]
res_test = pd.concat(result_parts, axis='columns')
print(res_test.head(5))

In [93]:
from sklearn import *
def RMSLE(y, pred, log=False):
    """Root mean squared error between ``y`` and ``pred``.

    Despite its name, this function computes the plain RMSE by default, exactly
    as the earlier ``mean_squared_error(y, pred) ** 0.5`` implementation did.
    Pass ``log=True`` to get the root mean squared *logarithmic* error that the
    name refers to.

    It is implemented with numpy only, so it no longer relies on ``metrics``
    being injected by a wildcard ``from sklearn import *``.

    Parameters
    ----------
    y : array-like
        True values.
    pred : array-like
        Predicted values, same length/shape as ``y``.
    log : bool, optional
        When True, both inputs are transformed with ``log1p`` before the error
        is computed. Defaults to False, which keeps the previous behaviour.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference (averaged over all elements).

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes do not match.
    """
    y_arr = np.asarray(y, dtype=float)
    pred_arr = np.asarray(pred, dtype=float)
    # Treat 1-D input as a single-column matrix so (n,) and (n, 1) compare equal
    # instead of silently broadcasting to an (n, n) difference.
    if y_arr.ndim == 1:
        y_arr = y_arr.reshape(-1, 1)
    if pred_arr.ndim == 1:
        pred_arr = pred_arr.reshape(-1, 1)
    if y_arr.shape != pred_arr.shape:
        raise ValueError(
            'y and pred must have the same shape, got {} and {}'.format(y_arr.shape, pred_arr.shape)
        )
    if y_arr.size == 0:
        raise ValueError('y and pred must contain at least one value')
    if log:
        y_arr, pred_arr = np.log1p(y_arr), np.log1p(pred_arr)
    return np.sqrt(np.mean((y_arr - pred_arr) ** 2))

def plot_actual_predicted(actual, predicted):
    """Print the RMSE and plot predictions against the true values.

    The pairs are sorted by the actual value, then both series are scattered
    against their position in that ordering: actual values in blue, predictions
    in green. A legend and axis labels identify the two series.

    Parameters
    ----------
    actual : array-like
        True target values.
    predicted : array-like
        Model predictions, one per entry of ``actual``.
    """
    print('RMSE: ', RMSLE(actual, predicted))
    ordered = pd.DataFrame({'actual': actual, 'predicted': predicted}).sort_values(['actual'])
    positions = range(ordered.shape[0])

    fig, ax = plt.subplots()
    ax.scatter(positions, ordered['predicted'], color='green', label='predicted')
    ax.scatter(positions, ordered['actual'], color='blue', label='actual')
    ax.set_xlabel('samples (sorted by actual value)')
    ax.set_ylabel('value')
    ax.legend()
    plt.show()



In [94]:
# Compare the hold-out predictions with the true labels: prints the RMSE, then
# scatters both series sorted by the true value.
plot_actual_predicted(test_labels, predictions)