In [None]:
#Import the libraries
import pandas as pd
import numpy as np
import scipy.stats as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import math

In [None]:
#Pre-processing

###A function to split a string in two at the pos-th occurrence of a separator
def split(strng, sep, pos):
    """Cut ``strng`` in two around the ``pos``-th occurrence of ``sep``.

    The string is broken on every ``sep``; the first ``pos`` pieces are glued
    back together as the head and the remaining pieces as the tail.

    Returns:
        A ``(head, tail)`` tuple of strings. The tail is ``""`` when the string
        holds ``pos`` pieces or fewer.
    """
    pieces = strng.split(sep)
    head = sep.join(pieces[:pos])
    tail = sep.join(pieces[pos:])
    return head, tail

###Merge submission and visit data
df_sub = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/sample_submission.csv')
# The submission id is cut at its second underscore into air_store_id and
# visit_date. Splitting every id in one pass and assigning two plain lists avoids
# building a throw-away pd.Series per row, as DataFrame.apply would.
id_parts = [split(str(sub_id), "_", 2) for sub_id in df_sub['id']]
df_sub['air_store_id'] = [store_id for store_id, _ in id_parts]
df_sub['visit_date'] = [visit_date for _, visit_date in id_parts]
df_sub = df_sub.drop(columns=['id'])

df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/air_visit_data.csv')
# Computed before the submission rows are appended, so only the rows read from
# air_visit_data.csv contribute to the median.
medianValue = round(df['visitors'].median())
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# (same row order, fresh 0..n-1 index, column order of the first frame kept).
df = pd.concat([df, df_sub], ignore_index=True, sort=False)

###Resample dates and perform median imputation
# Index by date so that each store's rows can be resampled to one row per day.
df.index = pd.to_datetime(df['visit_date'])
# Selecting the column before resampling always yields one long Series indexed by
# (air_store_id, visit_date). groupby.apply with a lambda can instead return a
# wide frame (one column per date) when every store covers the same date range,
# which would break the reset_index / column access below.
df = df.groupby('air_store_id')['visitors'].resample('D').sum().reset_index()
df['visit_date'] = df['visit_date'].dt.strftime('%Y-%m-%d')
# A day with no rows sums to 0. Every 0 dated before 2017-04-23 is replaced by
# medianValue. ISO 'YYYY-MM-DD' strings order correctly under plain string
# comparison, so no date parsing is needed here.
mask = (df['visitors'] == 0) & (df['visit_date'] < '2017-04-23')
# One .loc assignment instead of the chained df['visitors'][mask] = ..., which
# raises SettingWithCopyWarning and does not write through under copy-on-write.
df.loc[mask, 'visitors'] = medianValue

###Create separate columns for date data
# Parse the date strings once and read each calendar component off the result.
visit_dates = pd.DatetimeIndex(df['visit_date'])
df['Year'] = visit_dates.year
df['Month'] = visit_dates.month
df['Day'] = visit_dates.day

###Remove unnecessary restaurant data
# Keep only stores that have at least one row dated 2017-04-23 or later.
lst = df[df['visit_date'] >= '2017-04-23'].air_store_id.unique()
# .copy() makes subDF an independent frame, so the column assignments below do
# not raise SettingWithCopyWarning for writing into a slice of df.
subDF = df[df['air_store_id'].isin(lst)].copy()
# Keep the raw counts next to the log1p-transformed column.
subDF['visitors_org'] = subDF['visitors']
subDF['visitors'] = np.log1p(subDF['visitors'])

###Add date info data
date_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/date_info.csv')
# Weekday name -> ordinal, Monday = 1 through Sunday = 7.
day = {
    name: number
    for number, name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}
# Plain dict lookup per row: a value missing from the mapping raises KeyError.
date_df['day_of_week'] = [day[weekday] for weekday in date_df['day_of_week']]
# Inner join (the merge default): rows whose visit_date has no calendar_date
# match are dropped.
subDF = subDF.merge(date_df, left_on='visit_date', right_on='calendar_date')

###Add reservation data
reserve_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/air_reserve.csv')
hpg_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/hpg_reserve.csv')
rel_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/store_id_relation.csv')
# Attach the relation table's columns to the hpg reservations. The inner join
# keeps only hpg stores that appear in store_id_relation.csv.
sub_hpg = pd.merge(hpg_df, rel_df, on='hpg_store_id')
# DataFrame.append was removed in pandas 2.0; pd.concat stacks both reservation
# sources with the same row and column order.
n = pd.concat([reserve_df, sub_hpg.drop(columns=['hpg_store_id'])])
n.index = pd.to_datetime(n['visit_datetime'])
# Reserved visitors summed per store and calendar day. Selecting the column
# before resampling always gives a long (air_store_id, visit_datetime) Series,
# whereas groupby.apply may return a wide frame when all groups share one index.
nx = n.groupby('air_store_id')['reserve_visitors'].resample('D').sum().reset_index()
# Both merge keys must be datetimes: the resampled visit_datetime is one.
subDF['visit_date'] = pd.to_datetime(subDF['visit_date'])
subDF = pd.merge(
    subDF,
    nx,
    left_on=['air_store_id', 'visit_date'],
    right_on=['air_store_id', 'visit_datetime'],
    how='left',
)
subDF = subDF.drop(columns=['visit_datetime'])
# The left join leaves NaN where a store/day has no reservation row.
# NOTE(review): fillna(0) is applied to every column of subDF, not only
# reserve_visitors -- confirm that is intended.
subDF = subDF.fillna(0)
# Keep the raw totals next to the log1p-transformed column.
subDF['reserve_visitors_org'] = subDF['reserve_visitors']
subDF['reserve_visitors'] = np.log1p(subDF['reserve_visitors'])

In [None]:
#Regression model
# One LinearRegression is fitted per store on the rows dated before the cutoff
# and used to predict that store's rows on or after it. The target column
# 'visitors' is on a log1p scale, so predictions are mapped back with expm1.

# Identifier, target and untransformed-copy columns. Every other column of subDF
# is a feature; defining the list once keeps fit and predict on identical
# columns in identical order.
non_features = ['air_store_id', 'visit_date', 'visitors',
                'visitors_org', 'calendar_date', 'reserve_visitors_org']
feature_cols = [col for col in subDF.columns if col not in non_features]
cutoff = '2017-04-23'

rmse = []  # never filled in this cell; kept in case another cell reads it
predictions = []
for i, item in enumerate(lst, start=1):
    print(i)
    print(item)
    dr = subDF[subDF['air_store_id'] == item]
    train_df = dr[dr['visit_date'] < cutoff]
    is_test = dr['visit_date'] >= cutoff
    # .copy() gives an independent frame, so adding columns below does not raise
    # SettingWithCopyWarning for writing into a slice of subDF.
    test_df = dr.loc[is_test, ['air_store_id', 'visit_date']].copy()

    regressor = LinearRegression()
    regressor.fit(train_df[feature_cols], train_df['visitors'])
    # A linear model can predict below 0 on the log1p scale, which expm1 would
    # turn into a negative visitor count; clip at 0 (i.e. zero visitors).
    test_df['visitors'] = np.clip(regressor.predict(dr.loc[is_test, feature_cols]), 0, None)
    # Vectorised "<air_store_id>_<YYYY-MM-DD>" id instead of a row-wise apply.
    test_df['id'] = (
        test_df['air_store_id'] + '_'
        + pd.to_datetime(test_df['visit_date']).dt.strftime('%Y-%m-%d')
    )
    predictions.append(test_df[['id', 'visitors']])

# Concatenate once: growing a frame with pd.concat inside the loop is quadratic
# and starts from an empty object-dtype frame that can distort the result dtype.
if predictions:
    dfObjSubm = pd.concat(predictions, ignore_index=True)
else:
    # No stores to predict: still write a well-formed, header-only file.
    dfObjSubm = pd.DataFrame(columns=['id', 'visitors'])
# Back from the log1p scale to visitor counts, rounded to whole numbers.
dfObjSubm['visitors'] = np.expm1(dfObjSubm['visitors'].astype(float)).round()
dfObjSubm.to_csv('regression.csv', index=False)