## 附件

交叉验证策略

[Scikit-learn validator](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html)

[Cross-validation for time series](https://robjhyndman.com/hyndsight/tscv/)

[Ordered cross-validation](https://github.com/MaxHalford/xam/blob/master/docs/model-selection.md#ordered-cross-validation)

[Cross-Validation Methodology Using '16 Golden Week](https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting/discussion/45266)

## import part

In [100]:
import glob, re
import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics


## Init

In [140]:

data = {
    'tra': pd.read_csv('./data/air_visit_data.csv'),
    'as': pd.read_csv('./data/air_store_info.csv'),
    'hs': pd.read_csv('./data/hpg_store_info.csv'),
    'ar': pd.read_csv('./data/air_reserve.csv'),
    'hr': pd.read_csv('./data/hpg_reserve.csv'),
    'id': pd.read_csv('./data/store_id_relation.csv'),
    'tes': pd.read_csv('./data/sample_submission.csv'),
    'hol': pd.read_csv('./data/date_info.csv').rename(columns={'calendar_date':'visit_date'})
    }

# 初始处理:
#       1. 从tes数据id中提取air_store_id和visit_datetime
#       2. 在HPG预订信息中匹配air_store_id
#       2. 转换visit_datetime和reserve_datetime为时间格式
print("Init.")
# 1.
data['tes']['visit_date'] = data['tes']['id'].map(lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = data['tes']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
# 2.
data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])
# 3.
for k in data:
    if 'visit_date' in data[k].columns:
        data[k]['visit_date'] = pd.to_datetime(data[k]['visit_date'])
    if 'visit_datetime' in data[k].columns:
        data[k]['visit_date'] = pd.to_datetime(data[k]['visit_datetime'].str.split().str[0])
        data[k].drop('visit_datetime', axis=1, inplace=True)
    if 'reserve_datetime' in data[k].columns:
        data[k]['reserve_date'] = pd.to_datetime(data[k]['reserve_datetime'].str.split().str[0])
        data[k].drop('reserve_datetime', axis=1, inplace=True)


Init.


## 定义特征抽取函数

In [153]:
# 为df加上时间标记: dow, month, year, date
def time_feats(df, dt_col='visit_date'):
    df[dt_col] = pd.to_datetime(df[dt_col])
    df.loc[:, 'dow'] = df[dt_col].dt.dayofweek
    df.loc[:, 'month'] = df[dt_col].dt.month
    df.loc[:, 'season'] = df[dt_col].dt.quarter
    df.loc[:, 'week'] = df[dt_col].dt.weekofyear
    df.loc[:, 'date_int'] = df[dt_col].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    return df

# 预订数据特征
# WARNING: 测试数据中没有reserve数据, 只有reserve_visit数据 
def order_feats(order_data):
    '''
    Return:
    -----
    `air_store_id`, `visit_date`, ...
    '''
    order = order_data.copy()
    # 创建新特征datetime_diff: 表示到店时间和预订时间的差值
    order['reserve_date_diff'] = order.apply(
        lambda r: (r['visit_date'] - r['reserve_date']).days,
        axis=1)

    # 以(air_store_id, visit_date)为分组计算预订时间差和预订人数的总和(sum: tmp1)与均值(mean: tmp2)
    tmp1 = order.groupby(['air_store_id','visit_date'],
        as_index=False)[['reserve_date_diff', 'reserve_visitors']].sum().rename(
        columns={'visit_date':'visit_date', 'reserve_date_diff': 'rs1', 'reserve_visitors':'rv1'})
    tmp2 = order.groupby(['air_store_id','visit_date'],
        as_index=False)[['reserve_date_diff', 'reserve_visitors']].mean().rename(
        columns={'visit_date':'visit_date', 'reserve_date_diff': 'rs2', 'reserve_visitors':'rv2'})
    order = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','visit_date'])
    return order


# 以(air_store_id, dow)为分组，计算visitors的最小值，均值，中位数，最大值，样本大小
# 主要计算了时间（dow）相关的信息
def store_x_dow(tra, tes):
    '''
    Return:
    -----
    `air_store_id`, `dow`, ...
    '''
    unique_stores = tes['air_store_id'].unique()
    stores = pd.concat([pd.DataFrame({'air_store_id': unique_stores,
                                      'dow': [i]*len(unique_stores)}) for i in range(7)],
                        axis=0, ignore_index=True).reset_index(drop=True)

    tmp = tra.groupby(['air_store_id','dow']).agg(
        {'visitors': [np.min,np.mean,np.median,np.max,np.size]}).reset_index()
    tmp.columns = ['air_store_id', 'dow', 'min_visitors', 'mean_visitors',
                   'median_visitors','max_visitors','count_observations']
    stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
    return stores


def genre_feats(store_data):
    '''
    Return:
    -----
    `air_store_id`, ...
    '''
    store_info = store_data[['air_store_id', 'air_genre_name']].copy()
    store_info['air_genre_name'] = store_info['air_genre_name'].map(
        lambda x: str(str(x).replace('/',' ')))

    lbl = LabelEncoder()
    max_genre = np.max((store_info['air_genre_name'].str.split().apply(lambda x: len(x))))
    for i in range(max_genre):
        store_info['air_genre_name'+str(i)] = lbl.fit_transform(store_info['air_genre_name'].map(
            lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
    store_info['air_genre_name'] = lbl.fit_transform(store_info['air_genre_name'])
    return store_info


def area_feats(store_data):
    '''
    Return:
    -----
    `air_store_id`, ...
    '''
    store_info = store_data[['air_store_id', 'air_area_name', 'latitude', 'longitude']].copy()
    # 区域名称特征
    store_info['air_area_name'] = store_info['air_area_name'].map(lambda x: str(str(x).replace('-',' ')))

    lbl = LabelEncoder()
    max_area = np.max((store_info['air_area_name'].str.split().apply(lambda x: len(x))))
    for i in range(max_area):
        store_info['air_area_name'+str(i)] = lbl.fit_transform(store_info['air_area_name'].map(
            lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
    store_info['air_area_name'] = lbl.fit_transform(store_info['air_area_name'])

    # 区域坐标特征
    # 经纬度特征
    store_info['var_max_lat'] = store_info['latitude'].max() - store_info['latitude']
    store_info['var_max_long'] = store_info['longitude'].max() - store_info['longitude']
    # NEW FEATURES FROM Georgii Vyshnia
    # 经度 + 纬度?
    store_info['lon_plus_lat'] = store_info['longitude'] + store_info['latitude']

    return store_info


def holiday_feats(holiday):
    '''
    Return:
    -----
    `visit_date`, ...
    '''
    tmp = holiday.drop('day_of_week', axis=1)
    # 周末的holiday_flg置为0
    tmp.loc[(tmp['visit_date'].dt.dayofweek>4) & tmp['holiday_flg']==1, :] = 0
    return tmp


def mean_avg(tra, hol):
    '''
    Return:
    -----
    `air_store_id`, `dow`, `holiday_flg`, `visitors_mv`
    '''
    air_visit_data = tra.copy()
    date_info = hol.copy()
    # 把周末的holiday_flag置为0
    date_info.loc[(date_info['visit_date'].dt.dayofweek>4) & date_info['holiday_flg']==1, :] = 0

    # 根据hol.index计算日期对应的权重weight
    date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5

    # 在visit_data中匹配日期weight
    visit_data = air_visit_data.merge(date_info, on='visit_date', how='left')

    # 将访问量转化为对数访问量log1p
    visit_data['visitors'] = visit_data.visitors.map(pd.np.log1p)

    # 按照 air_store_id, dow, holiday_flg 的分组计算加权平均
    visitors = visit_data.groupby(['air_store_id', 'dow', 'holiday_flg']).apply(
        lambda x:( (x['weight'] * x['visitors']).sum() / x['weight'].sum() )).reset_index()
    visitors.rename(columns={0:'visitors_mv'}, inplace=True) 
    return visitors


## 特征工程

In [142]:
train = time_feats(data['tra'])
test = time_feats(data['tes'])

'''
for df in [train, test]:
    print("Processing Order data...")
    df = pd.merge(df, order_feats(data['ar']), how='left', on=['air_store_id', 'visit_date'])
    df = pd.merge(df, order_feats(data['hr']), how='left', on=['air_store_id', 'visit_date'])
    print("Processing Store X dow...")
    df = pd.merge(df, store_x_dow(data['tra'], data['tes']), how='left', on=['air_store_id', 'dow'])
    print("Processing Genre_feats...")
    df = pd.merge(df, genre_feats(data['as']), how='left', on=['air_store_id'])
    print("Processing Area_feats...")
    df = pd.merge(df, area_feats(data['as']), how='left', on=['air_store_id'])
    print("Processing Holiday_feats...")
    df = pd.merge(df, holiday_feats(data['hol']), how='left', on=['visit_date'])
    df.fillna(-1)
'''

print("Train set features.")
print("Processing Order data...")
train = pd.merge(train, order_feats(data['ar']), how='left', on=['air_store_id', 'visit_date'])
train = pd.merge(train, order_feats(data['hr']), how='left', on=['air_store_id', 'visit_date'])
print("Processing Store X dow...")
train = pd.merge(train, store_x_dow(data['tra'], data['tes']), how='left', on=['air_store_id', 'dow'])
print("Processing Genre_feats...")
train = pd.merge(train, genre_feats(data['as']), how='left', on=['air_store_id'])
print("Processing Area_feats...")
train = pd.merge(train, area_feats(data['as']), how='left', on=['air_store_id'])
print("Processing Holiday_feats...")
train = pd.merge(train, holiday_feats(data['hol']), how='left', on=['visit_date'])
train.fillna(0, inplace=True)


print("Test set features.")
print("Processing Order data...")
test = pd.merge(test, order_feats(data['ar']), how='left', on=['air_store_id', 'visit_date'])
test = pd.merge(test, order_feats(data['hr']), how='left', on=['air_store_id', 'visit_date'])
print("Processing Store X dow...")
test = pd.merge(test, store_x_dow(data['tra'], data['tes']), how='left', on=['air_store_id', 'dow'])
print("Processing Genre_feats...")
test = pd.merge(test, genre_feats(data['as']), how='left', on=['air_store_id'])
print("Processing Area_feats...")
test = pd.merge(test, area_feats(data['as']), how='left', on=['air_store_id'])
print("Processing Holiday_feats...")
test = pd.merge(test, holiday_feats(data['hol']), how='left', on=['visit_date'])
test.fillna(0, inplace=True)

# 增加预订总和以及预订平均值特征
print("Processing Total_order_feats...")
train['total_reserv_sum'] = train['rv1_x'] + train['rv1_y']
train['total_reserv_mean'] = (train['rv2_x'] + train['rv2_y']) / 2
train['total_reserv_dt_diff_mean'] = (train['rs2_x'] + train['rs2_y']) / 2

test['total_reserv_sum'] = test['rv1_x'] + test['rv1_y']
test['total_reserv_mean'] = (test['rv2_x'] + test['rv2_y']) / 2
test['total_reserv_dt_diff_mean'] = (test['rs2_x'] + test['rs2_y']) / 2

# 经纬度特征
print("Processing lat&lon_feats...")
train['var_max_lat'] = train['latitude'].max() - train['latitude']
train['var_max_long'] = train['longitude'].max() - train['longitude']
test['var_max_lat'] = test['latitude'].max() - test['latitude']
test['var_max_long'] = test['longitude'].max() - test['longitude']

# NEW FEATURES FROM Georgii Vyshnia
# 经度 + 纬度?
train['lon_plus_lat'] = train['longitude'] + train['latitude']
test['lon_plus_lat'] = test['longitude'] + test['latitude']

Train set features.
Processing Order data...
Processing Store X dow...
Processing Genre_feats...
Processing Area_feats...
Processing Holiday_feats...
Test set features.
Processing Order data...
Processing Store X dow...
Processing Genre_feats...
Processing Area_feats...
Processing Holiday_feats...
Processing Total_order_feats...
Processing lat&lon_feats...


### 移动平均预测特征

In [143]:
# 添加移动平均预测特征
print("Processing Mean Average feats...")
visitors_mv = mean_avg(data['tra'], data['hol'])

print("\tProcessing Train set")
train = pd.merge(train, visitors_mv, how='left', on=['air_store_id', 'dow', 'holiday_flg'])
miss_idx = train['visitors_mv'].isnull()
train.loc[miss_idx, 'visitors_mv'] = train[miss_idx].merge(visitors_mv.loc[visitors_mv.holiday_flg==0, ['air_store_id', 'dow', 'visitors_mv']],
                                                           on=('air_store_id', 'dow'), how='left')['visitors_mv_y'].values
miss_idx = train['visitors_mv'].isnull()
train.loc[miss_idx, 'visitors_mv'] = train[miss_idx].merge(visitors_mv[['air_store_id', 'visitors_mv']].groupby('air_store_id').mean().reset_index(),
                                                           on='air_store_id', how='left')['visitors_mv_y'].values

print("\tProcessing Test set")
test = pd.merge(test, visitors_mv, how='left', on=['air_store_id', 'dow', 'holiday_flg'])
miss_idx = test['visitors_mv'].isnull()
test.loc[miss_idx, 'visitors_mv'] = test[miss_idx].merge(visitors_mv.loc[visitors_mv.holiday_flg==0, ['air_store_id', 'dow', 'visitors_mv']],
                                                           on=('air_store_id', 'dow'), how='left')['visitors_mv_y'].values
miss_idx = test['visitors_mv'].isnull()
test.loc[miss_idx, 'visitors_mv'] = test[miss_idx].merge(visitors_mv[['air_store_id', 'visitors_mv']].groupby('air_store_id').mean().reset_index(),
                                                           on='air_store_id', how='left')['visitors_mv_y'].values


Processing Mean Average feats...
	Processing Train set
	Processing Test set


### 标准化 & 变量转换

In [None]:
###
pass


In [155]:
train.dtypes

air_store_id                         object
visit_date                   datetime64[ns]
visitors                              int64
dow                                   int64
month                                 int64
season                                int64
week                                  int64
rs1_x                               float64
rv1_x                               float64
rs2_x                               float64
rv2_x                               float64
rs1_y                               float64
rv1_y                               float64
rs2_y                               float64
rv2_y                               float64
min_visitors                        float64
mean_visitors                       float64
median_visitors                     float64
max_visitors                        float64
count_observations                  float64
air_genre_name                        int64
air_genre_name0                       int64
air_genre_name1                 

In [209]:
train.columns

Index(['air_store_id', 'visit_date', 'visitors', 'dow', 'month', 'season',
       'week', 'rs1_x', 'rv1_x', 'rs2_x', 'rv2_x', 'rs1_y', 'rv1_y', 'rs2_y',
       'rv2_y', 'min_visitors', 'mean_visitors', 'median_visitors',
       'max_visitors', 'count_observations', 'air_genre_name',
       'air_genre_name0', 'air_genre_name1', 'air_genre_name2',
       'air_area_name', 'latitude', 'longitude', 'air_area_name0',
       'air_area_name1', 'air_area_name2', 'air_area_name3', 'air_area_name4',
       'air_area_name5', 'air_area_name6', 'var_max_lat', 'var_max_long',
       'lon_plus_lat', 'holiday_flg', 'total_reserv_sum', 'total_reserv_mean',
       'total_reserv_dt_diff_mean', 'visitors_mv'],
      dtype='object')

## Model and Predict #1

In [175]:
# 辅助计时函数
def time_cnt(delta):
    total_secs = int(delta.total_seconds())
    format = '{h}H: {m}M: {s}S'
    h, m = total_secs // 3600, total_secs % 3600
    m, s = m // 60, m % 60
    return format.format(h=h, m=m, s=s)

In [179]:
col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date','visitors']]
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred)**0.5
time0 = datetime.now()
print("Modeling...")
model1 = GradientBoostingRegressor(learning_rate=0.2, random_state=3)
model2 = KNeighborsRegressor(n_jobs=-1, n_neighbors=4)

print("Training...")
model1.fit(train[col], np.log1p(train['visitors'].values))
model2.fit(train[col], np.log1p(train['visitors'].values))

visitors_pred1 = model1.predict(train[col])
visitors_pred2 = model2.predict(train[col])

print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values), visitors_pred1))
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), visitors_pred2))
print(time_cnt(datetime.now()-time0))

Modeling...
Training...
RMSE GradientBoostingRegressor:  0.497911892637
RMSE KNeighborsRegressor:  0.422111222092
0H: 6M: 16S


## Model and Predict #2 Moving Average

In [152]:
visitors_mv.head()

Unnamed: 0,air_store_id,dow,holiday_flg,visitors_mv
0,air_00a91d42b08b08d9,0,0.0,3.203625
1,air_00a91d42b08b08d9,0,1.0,3.091042
2,air_00a91d42b08b08d9,1,0.0,3.325868
3,air_00a91d42b08b08d9,2,0.0,3.353439
4,air_00a91d42b08b08d9,3,0.0,3.475056


In [93]:
def mean_avg(tra, hol):
    air_visit_data = tra.copy()
    date_info = hol.copy()
    # 把周末的holiday_flag置为0
    date_info.loc[(date_info['visit_date'].dt.dayofweek>4) & date_info['holiday_flg']==1, :] = 0

    # 根据hol.index计算日期对应的权重weight
    date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5

    # 在visit_data中匹配日期weight
    visit_data = air_visit_data.merge(date_info, on='visit_date', how='left')

    # 将访问量转化为对数访问量log1p
    visit_data['visitors'] = visit_data.visitors.map(pd.np.log1p)

    # 按照 air_store_id, dow, holiday_flg 的分组计算加权平均
    visitors = visit_data.groupby(['air_store_id', 'dow', 'holiday_flg']).apply(
        lambda x:( (x['weight'] * x['visitors']).sum() / x['weight'].sum() )).reset_index()
    visitors.rename(columns={0:'visitors'}, inplace=True) 
    return visitors

In [176]:
visitors_mv.head(2)

Unnamed: 0,air_store_id,dow,holiday_flg,visitors_mv
0,air_00a91d42b08b08d9,0,0.0,3.203625
1,air_00a91d42b08b08d9,0,1.0,3.091042


In [178]:
test.head()

Unnamed: 0,id,visitors,visit_date,air_store_id,dow,month,season,week,rs1_x,rv1_x,...,air_area_name5,air_area_name6,var_max_lat,var_max_long,lon_plus_lat,holiday_flg,total_reserv_sum,total_reserv_mean,total_reserv_dt_diff_mean,visitors_mv
0,air_00a91d42b08b08d9_2017-04-23,0,2017-04-23,air_00a91d42b08b08d9,6,4,2,16,0.0,0.0,...,0,0,8.326629,4.519803,175.447598,0.0,0.0,0.0,0.0,1.098612
1,air_00a91d42b08b08d9_2017-04-24,0,2017-04-24,air_00a91d42b08b08d9,0,4,2,17,0.0,0.0,...,0,0,8.326629,4.519803,175.447598,0.0,0.0,0.0,0.0,3.203625
2,air_00a91d42b08b08d9_2017-04-25,0,2017-04-25,air_00a91d42b08b08d9,1,4,2,17,0.0,0.0,...,0,0,8.326629,4.519803,175.447598,0.0,0.0,0.0,0.0,3.325868
3,air_00a91d42b08b08d9_2017-04-26,0,2017-04-26,air_00a91d42b08b08d9,2,4,2,17,0.0,0.0,...,0,0,8.326629,4.519803,175.447598,0.0,0.0,0.0,0.0,3.353439
4,air_00a91d42b08b08d9_2017-04-27,0,2017-04-27,air_00a91d42b08b08d9,3,4,2,17,0.0,0.0,...,0,0,8.326629,4.519803,175.447598,0.0,0.0,0.0,0.0,3.475056


## 生成提交文件

In [202]:
sub1 = test[['id','visitors']].copy()
sub1['visitors'] = (model1.predict(test[col]) + model2.predict(test[col])) / 2
sub1['visitors'] = np.expm1(sub1['visitors']).clip(lower=0.)

In [196]:
sub2 = test[['id', 'visitors_mv']].copy()
sub2['visitors'] = sub2['visitors_mv'].map(pd.np.expm1)

In [197]:
sub2.head()

Unnamed: 0,id,visitors_mv,visitors
0,air_00a91d42b08b08d9_2017-04-23,1.098612,2.0
1,air_00a91d42b08b08d9_2017-04-24,3.203625,23.621632
2,air_00a91d42b08b08d9_2017-04-25,3.325868,26.82313
3,air_00a91d42b08b08d9_2017-04-26,3.353439,27.60092
4,air_00a91d42b08b08d9_2017-04-27,3.475056,31.299646


In [208]:
sub_merge = pd.merge(sub1, sub2, on='id', how='inner')
sub_merge['visitors'] = (sub_merge['visitors_x'] + sub_merge['visitors_y']* 1.1)/2
sub_merge[['id', 'visitors']].to_csv('submission%s.csv' % datetime.now().strftime('%M%S'), index=False)