In [None]:
#Import the libraries
import pandas as pd
import numpy as np
import scipy.stats as sp
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import math
from datetime import datetime
import lightgbm as lgb

In [None]:
#Preprocessing

###Split a string in two at the pos-th occurrence of a separator
def split(strng, sep, pos):
    """Cut `strng` into two parts around its `pos`-th `sep`-delimited field.

    The string is tokenised on `sep`; the first `pos` tokens are re-joined
    with `sep` to form the head and the remaining tokens form the tail.

    Returns:
        tuple[str, str]: (head, tail). The tail is "" when there are no more
        than `pos` tokens; the head is "" when `pos` is 0.
    """
    pieces = strng.split(sep)
    head = sep.join(pieces[:pos])
    tail = sep.join(pieces[pos:])
    return head, tail

###Merge air_visit_data with sample_submission
# NOTE(review): the CSV locations are absolute paths on the author's machine;
# consider moving them behind a configurable data directory.
df_sub = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/sample_submission.csv')
# The id is cut at its second "_": the part before it becomes the store id,
# the remainder becomes the visit date.
id_parts = [split(str(x), "_", 2) for x in df_sub['id']]
df_sub['air_store_id'] = [part[0] for part in id_parts]
df_sub['visit_date'] = [part[1] for part in id_parts]
df_sub = df_sub.drop(columns=['id'])
df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/air_visit_data.csv')
# The median is taken from the visit data alone, before the submission rows
# are stacked underneath it.
medianValue = round(df['visitors'].median())
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
df = pd.concat([df, df_sub], ignore_index=True, sort=False)

###Resample date and fill missing values with median
df.index = pd.to_datetime(df['visit_date'])
# Per store, expand to one row per calendar day between its first and last
# date; days without any record end up with a visitor sum of 0.
df = (
    df.groupby('air_store_id')
    .apply(lambda g: g['visitors'].resample('D').sum())
    .reset_index()
)
df['visit_date'] = df['visit_date'].dt.strftime('%Y-%m-%d')
# Every zero-visitor day before 2017-04-23 is replaced by the overall median
# (this includes days that were genuinely recorded as 0); rows from that date
# onward are left untouched.
mask = (df['visitors'] == 0) & (df['visit_date'] < '2017-04-23')
# Single .loc assignment instead of chained indexing (df['visitors'][mask] = ...),
# which may write to a temporary copy and is a no-op under copy-on-write.
df.loc[mask, 'visitors'] = medianValue

###Break the visit date into separate year / month / day-of-month columns
visit_dates = pd.DatetimeIndex(df['visit_date'])  # parse the date strings once
df['Year'] = visit_dates.year
df['Month'] = visit_dates.month
df['Day'] = visit_dates.day

###Remove extra data
# Keep only stores that have at least one row dated 2017-04-23 or later.
lst = df[df['visit_date'] >= '2017-04-23'].air_store_id.unique()
# .copy() makes subDF an independent frame, so the column assignment below
# does not write into a slice of df (SettingWithCopyWarning).
subDF = df[df['air_store_id'].isin(lst)].copy()

###Take log1p for visitors
subDF['visitors'] = np.log1p(subDF['visitors'])

###Attach the date_info columns to every row, keyed on the calendar date
date_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/date_info.csv')
# Weekday name -> number, Monday = 1 ... Sunday = 7
day = {
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6,
    'Sunday': 7,
}
# Plain dict lookup: a weekday name missing from `day` raises KeyError.
date_df['day_of_week'] = [day[name] for name in date_df['day_of_week']]
# Default (inner) join: rows whose visit_date has no calendar_date match are dropped.
subDF = subDF.merge(date_df, left_on='visit_date', right_on='calendar_date')

###Add reserve data for air and hpg
reserve_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/air_reserve.csv')
hpg_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/hpg_reserve.csv')
rel_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/store_id_relation.csv')
# Inner join: only hpg reservations whose hpg_store_id appears in
# store_id_relation survive (presumably this is what supplies their
# air_store_id -- confirm against the CSV schema).
sub_hpg = pd.merge(hpg_df, rel_df, left_on='hpg_store_id', right_on='hpg_store_id')
# Stack air and hpg reservations. DataFrame.append was removed in pandas 2.0;
# pd.concat is the equivalent call.
n = pd.concat([reserve_df, sub_hpg.drop(columns=['hpg_store_id'])], sort=False)
n.index = pd.to_datetime(n['visit_datetime'])
# Total reserved visitors per store per calendar day of the visit.
nx = n.groupby('air_store_id').apply(lambda g: g['reserve_visitors'].resample('D').sum()).reset_index()
# visit_date becomes datetime64 here so it can be matched against the
# resampled visit_datetime values.
subDF['visit_date'] = pd.to_datetime(subDF['visit_date'])
# Left join: store/days without any reservation get NaN in reserve_visitors.
subDF = pd.merge(subDF, nx, left_on=['air_store_id','visit_date'], right_on=['air_store_id','visit_datetime'], how='left')

###Drop the helper join columns, zero-fill missing values, log-scale reservations
# fillna(0) is applied to the whole frame, so a NaN in any column becomes 0,
# not only the reservation counts.
subDF = subDF.drop(columns=['visit_datetime', 'calendar_date']).fillna(0)
subDF['reserve_visitors'] = np.log1p(subDF['reserve_visitors'])

###Add store info data
air_df = pd.read_csv('E:/R workspace/recruit-restaurant-visitor-forecasting/air_store_info.csv')
# air_area is the part of air_area_name before its first "-".
# Assigned as an ordinary column: no per-row pd.Series objects and no
# multi-column assignment to a column that does not exist yet.
air_df['air_area'] = [split(str(name), "-", 1)[0] for name in air_df['air_area_name']]
air_df = air_df.drop(columns=['air_area_name'])
# Left join keeps every row of subDF even if its store is missing from air_store_info.
newDF = pd.merge(subDF, air_df, left_on='air_store_id', right_on='air_store_id', how='left')

###Add average visitor and reservation data by genre
# Rows dated 2017-04-23 or later are the rows the model predicts for (see the
# train/test split), so their `visitors` values are not observations.
# The group means are therefore computed from the earlier rows only.
train_rows = newDF[newDF['visit_date'] < '2017-04-23']
avg = train_rows[['air_genre_name','visitors','reserve_visitors']].groupby('air_genre_name').mean()
avg = avg.rename(columns={"visitors": "mean_visitors", "reserve_visitors": "mean_reserve_visitors"})
# Left join: a row whose genre has no mean keeps its place (with NaN means)
# instead of being silently dropped by an inner join.
avgData = pd.merge(newDF, avg, left_on='air_genre_name', right_on='air_genre_name', how='left')

###Add average visitor and reservation data by area
avgAr = train_rows[['air_area','visitors','reserve_visitors']].groupby('air_area').mean()
avgAr = avgAr.rename(columns={"visitors": "mean_visitors_area", "reserve_visitors": "mean_reserve_visitors_area"})
avgArData = pd.merge(avgData, avgAr, left_on='air_area', right_on='air_area', how='left')

###One-hot encode genre and area, then tidy the resulting column names
enc = pd.get_dummies(
    avgArData,
    columns=["air_genre_name", "air_area"],
    prefix=["genre", "area"],
)
# "/" and " " in column names become "_".
enc.columns = enc.columns.str.replace("/", "_").str.replace(" ", "_")
# Macron vowels in the area columns are spelled with plain letters
# (presumably so the feature names are plain ASCII for the model -- confirm).
enc = enc.rename(columns={
    "area_Hokkaidō_Abashiri": "area_Hokkaido_Abashiri",
    "area_Hokkaidō_Asahikawa": "area_Hokkaido_Asahikawa",
    "area_Hokkaidō_Katō": "area_Hokkaido_Kato",
    "area_Hokkaidō_Sapporo": "area_Hokkaido_Sapporo",
    "area_Hyōgo": "area_Hyogo",
    "area_Tōkyō": "area_Tokyo",
    "area_Ōsaka": "area_Osaka",
})

In [None]:
#LightGBM model
# Every column except the identifiers and the target is used as a feature.
col = [f for f in enc.columns if f not in ('air_store_id', 'visit_date', 'visitors')]
# Rows before 2017-04-23 are used for training; later rows are predicted.
train_df = enc[enc.visit_date < '2017-04-23']
test_df = enc[enc.visit_date >= '2017-04-23']
params = {
    'learning_rate': 0.03,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'sub_feature': 0.9,   # LightGBM alias of feature_fraction
    'num_leaves': 100,
    'min_data': 100,      # LightGBM alias of min_data_in_leaf
    'min_hessian': 1,     # LightGBM alias of min_sum_hessian_in_leaf
    'verbose': -1,
}
t0 = datetime.now()
lgb_train = lgb.Dataset(train_df[col], train_df['visitors'])
gbm = lgb.train(params, lgb_train, num_boost_round=2300)
pred = gbm.predict(test_df[col])
t1 = datetime.now()
diff = t1 - t0
print(diff)  # wall-clock time for training + prediction

# Submission id is "<air_store_id>_<YYYY-MM-DD>", built column-wise instead of
# with a row-by-row apply.
res = pd.DataFrame({
    'id': test_df['air_store_id'] + '_' + pd.to_datetime(test_df['visit_date']).dt.strftime('%Y-%m-%d'),
})
# The target was trained as log1p(visitors), so expm1 maps predictions back.
# expm1 of a negative prediction is negative; clip at 0 so the file never
# contains a negative visitor count.
res['visitors'] = np.round(np.clip(np.expm1(pred), 0, None))
res.to_csv('lightgbm.csv', index=False)