In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import scipy.stats as st
import pickle

import gc
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

In [2]:
def RMSLE(y, pred):
    """Return the square root of the mean squared error between ``y`` and ``pred``.

    No logarithm is applied here: the result is an RMSLE only when both
    arguments are already on the log1p scale.
    """
    mse = mean_squared_error(y, pred)
    return mse ** 0.5

def hmean(x):
    """Harmonic mean of the entries of ``x`` that are neither NaN nor zero.

    Returns 0 when no usable entry remains.
    """
    usable = x[~(np.isnan(x) | np.equal(x, 0))]
    if len(usable) == 0:
        return 0
    return st.hmean(usable)

# def mean(x):
#     x = x[~np.isnan(x)]
#     return np.nanmean(x) if len(x)>0 else 0

def decay(x):
    """Exponentially weighted sum of the non-NaN entries of ``x``.

    The last entry has weight 1 and each step back multiplies the weight by 0.9.
    Returns 0 when every entry is NaN.
    """
    kept = x[~np.isnan(x)]
    n_kept = len(kept)
    if n_kept == 0:
        return 0
    weights = np.power(0.9, np.arange(n_kept - 1, -1, -1))
    return np.sum(kept * weights)

def diff(x):
    """Mean first difference of the non-NaN entries of ``x``.

    Returns 0 when fewer than two usable values remain. A single value has no
    difference to average, and ``np.mean`` of the empty ``np.diff`` result
    would otherwise yield NaN together with a RuntimeWarning.
    """
    x = x[~np.isnan(x)]
    return np.mean(np.diff(x)) if len(x) > 1 else 0

def ewm(x):
    """Exponentially weighted mean of ``x`` with alpha=0.5 (adjusted weights, NaNs ignored)."""
    smoother = x.ewm(alpha=0.5, adjust=True, ignore_na=True)
    return smoother.mean()

In [3]:
def _log1p_visitors(raw):
    """Parse one raw ``visitors`` cell and return log1p(count); non-positive counts map to 0."""
    count = float(raw)
    return np.log1p(count) if count > 0 else 0


# Store metadata.
ainfo = pd.read_csv('data/air_store_info.csv')

# Calendar: one row per date with a holiday flag; the textual weekday column is dropped.
idate = pd.read_csv('data/date_info.csv', parse_dates=['calendar_date'])
idate = idate.rename(columns={'calendar_date': 'visit_date', 'holiday_flg': 'holiday'})
idate = idate.drop('day_of_week', axis=1)

# Visit history; the target is moved to the log1p scale while the file is parsed.
train = pd.read_csv('data/air_visit_data.csv', parse_dates=['visit_date'],
                    converters={'visitors': _log1p_visitors})
test = pd.read_csv('data/sample_submission.csv')

# NOTE(review): `weather` is not referenced by any later cell shown here, and
# unpickling runs arbitrary code -- only load this file from a trusted source.
weather = pd.read_pickle('data/weather.pkl')

В трейне - 252108 записей
В тесте 32019 записей (821 ресторан). Каждому ресторану соответствует 39 дней, что говорит о том, что есть рестораны с нулевой посещаемостью (когда были закрыты)

In [4]:
# Build the test frame: split each submission id into store id and visit date
id_parts = test['id'].str.split('_')
test['air_store_id'] = id_parts.str[:-1].str.join('_')
test['visit_date'] = pd.to_datetime(id_parts.str[2])
test.drop(['id', 'visitors'], axis=1, inplace=True) 

In [5]:
train_stores = set(train['air_store_id'].unique())
test_stores = set(test['air_store_id'].unique())
recent_stores = set(train.loc[train.visit_date >= '2017-04-01', 'air_store_id'].unique())

# Drop train stores that do not appear in test
store_to_del = list(train_stores - test_stores)
# Drop stores without any visit record since 2017-04-01 (1 store)
store_to_del.extend(list(train_stores - recent_stores))

train = train[~train['air_store_id'].isin(store_to_del)]
test = test[~test['air_store_id'].isin(store_to_del)]

# Snapshot of the filtered visits, used below to detect regular closing days.
closed = train.copy()

train.shape, test.shape

((250084, 3), (31980, 2))

# Даты

25 праздничных дней в 2016 и 10 - в 2017
4 из них приходятся на тестовый период — между 29 апреля и 5 мая.

In [6]:
def dbh(row):
    """Return the row's ``day_before_holiday`` value, or 0 when the row itself is a holiday."""
    if row['holiday'] == 0:
        return row['day_before_holiday']
    return 0
def dah(row):
    """Return the row's ``day_after_holiday`` value, or 0 when the row itself is a holiday."""
    if row['holiday'] == 0:
        return row['day_after_holiday']
    return 0
def dih(row):
    """Return the row's ``day_in_holiday`` value, or 0 when the row is not a holiday."""
    if row['holiday'] == 1:
        return row['day_in_holiday']
    return 0
    
# Plain calendar components of the visit date.
visit_dt = idate['visit_date'].dt
idate['year'] = visit_dt.year
idate['quarter'] = visit_dt.quarter
idate['month'] = visit_dt.month
idate['day'] = visit_dt.day
idate['dayofweek'] = visit_dt.dayofweek
idate['dayofyear'] = visit_dt.dayofyear
idate['weekofyear'] = visit_dt.weekofyear
# idate['days_in_month'] = idate['visit_date'].dt.daysinmonth
# Week of the month: days 1-7 -> 1, 8-14 -> 2, ...
idate['weekofmonth'] = ((visit_dt.day - 1) // 7 + 1).astype('int64')
# Running day number: day of year for 2016 dates, day of year + 366 for every other year.
idate['dayincomp'] = np.where(visit_dt.year == 2016,
                              visit_dt.dayofyear,
                              visit_dt.dayofyear + 366).astype('int64')

is_holiday = idate['holiday'] == 1
not_holiday = idate['holiday'] == 0

# 1 on a non-holiday whose following row is a holiday.
idate['day_before_holiday'] = idate['holiday'].shift(-1).fillna(0).where(not_holiday, 0).astype(int)

# 1 on a non-holiday whose preceding row is a holiday.
idate['day_after_holiday'] = idate['holiday'].shift(1).fillna(0).where(not_holiday, 0).astype(int)

# On holidays: number of holidays among this row and the three rows before it.
holiday_run = idate['holiday'].rolling(window=4, min_periods=1).sum()
idate['day_in_holiday'] = holiday_run.where(is_holiday, 0).astype(int)

# Manual overrides for two dates the 4-row window does not number as wanted.
for stamp, ordinal in (('2017-01-02', 5), ('2017-01-03', 6)):
    idate.loc[idate['visit_date'] == stamp, 'day_in_holiday'] = int(ordinal)

# Number of holidays in each (year, week-of-year) bucket, attached to every day of that bucket.
tmp = idate.groupby(['year', 'weekofyear'], as_index=False)['holiday'].sum()
tmp = tmp.rename(columns={'holiday': 'holidays_thisweek'})
idate = idate.merge(tmp, on=['year', 'weekofyear'], how='left')
# Manual override: every row with week number 52 is set to 4 holidays.
idate.loc[idate['weekofyear'] == 52, 'holidays_thisweek'] = int(4)

# Same count taken 7 rows earlier / later; rows without a neighbour get 0.
this_week = idate['holidays_thisweek']
idate['holidays_lastweek'] = this_week.shift(7).fillna(0).astype(int)
idate['holidays_nextweek'] = this_week.shift(-7).fillna(0).astype(int)

GOLDEN_WEEK_YEARS = (2016, 2017)

# Flag: the date lies in Apr 29 - May 5 of either year
idate['golden_week'] = int(0)
for gw_year in GOLDEN_WEEK_YEARS:
    in_golden_week = ((idate['visit_date'] >= '%d-04-29' % gw_year) &
                      (idate['visit_date'] <= '%d-05-05' % gw_year))
    idate.loc[in_golden_week, 'golden_week'] = int(1)

# Day within golden week: Apr 29 -> 1 ... May 5 -> 7
idate['day_in_golden_week'] = int(0)
for gw_year in GOLDEN_WEEK_YEARS:
    gw_days = pd.date_range(datetime(gw_year, 4, 29).date(), periods=7)
    for ordinal, stamp in enumerate(gw_days, start=1):
        idate.loc[idate['visit_date'] == stamp, 'day_in_golden_week'] = int(ordinal)

# Days before golden week: Apr 22 -> 7 ... Apr 28 -> 1
idate['day_before_golden_week'] = int(0)
for gw_year in GOLDEN_WEEK_YEARS:
    lead_days = pd.date_range(datetime(gw_year, 4, 22).date(), periods=7)
    for offset, stamp in enumerate(lead_days):
        idate.loc[idate['visit_date'] == stamp, 'day_before_golden_week'] = int(7 - offset)

# Days after golden week: May 6 -> 1 ... May 12 -> 7
idate['day_after_golden_week'] = int(0)
for gw_year in GOLDEN_WEEK_YEARS:
    trail_days = pd.date_range(datetime(gw_year, 5, 6).date(), periods=7)
    for ordinal, stamp in enumerate(trail_days, start=1):
        idate.loc[idate['visit_date'] == stamp, 'day_after_golden_week'] = int(ordinal)

In [7]:
# Attach the calendar features to every visit row.
train = train.merge(idate, on=['visit_date'], how='left')
test = test.merge(idate, on=['visit_date'], how='left')
train.shape, test.shape

((250084, 23), (31980, 22))

# Рестораны с выходными

1. Можно удалить по holidays
2. Можно менять коэффициент 0.1

In [8]:
def _daily_calendar(store_visits):
    """Re-index one store's visits on a gap-free daily calendar (missing days become NaN)."""
    return store_visits.set_index('visit_date').resample('1D').first()


# Fill in zero attendance for the days a store has no record
closed = closed.groupby('air_store_id').apply(_daily_calendar)
closed = closed.drop(['air_store_id'], axis=1).reset_index().fillna(0)

# Group by day of week and derive an "open" indicator for each store
closed = closed.merge(idate, on=['visit_date'], how='left')
closed = closed.filter(items=['air_store_id', 'dayofweek', 'visitors', 'holiday'])
closed['visitors'] = (closed['visitors'] != 0).astype(int)
closed = closed.groupby(['air_store_id', 'dayofweek'], as_index=False)['visitors'].mean()

# Keep only the (store, weekday) pairs that were open on at least 10% of the days;
# test rows for every other pair get no match, turn NaN and are removed.
closed = closed[closed['visitors'] >= 0.1]
test = test.merge(closed, on=['air_store_id', 'dayofweek'], how='left')
test.dropna(inplace=True)
test.drop('visitors', axis=1, inplace=True)
print(test.shape)

del closed

(30739, 22)


In [9]:
# (store, weekday) pairs that survived in test.
closed = test[['air_store_id', 'dayofweek']].drop_duplicates()

# Keep only the train rows whose (store, weekday) pair also occurs in test.
train = train.merge(closed, indicator=True, on=['air_store_id', 'dayofweek'], how='left')
train = train.loc[train['_merge'] == 'both']
train.drop('_merge', axis=1, inplace=True)
print(train.shape)

del closed

(249691, 23)


# Данные о ресторанах

1. Координат больше, чем зон
2. Можно выделить города

In [10]:
# One label per distinct (latitude, longitude) pair.
ainfo['coordinate_area'] = ainfo['latitude'].astype(str) + '_' + ainfo['longitude'].astype(str)
# Keep only the text before the first '-' and, within that, before the first space.
ainfo['air_area_name'] = ainfo['air_area_name'].str.split('-').str[0].str.split(' ').str[0]

le = LabelEncoder()
enc_list = ['air_store_id', 'air_genre_name', 'air_area_name', 'coordinate_area']
for col in enc_list:
    ainfo[col + '_le'] = le.fit_transform(ainfo[col])

# Keep the raw store id (merge key) plus every label-encoded column.
cat_feat = ['air_store_id'] + [name for name in ainfo.columns if name.endswith('_le')]
ainfo = ainfo.filter(items=cat_feat)

In [11]:
# Attach the encoded store attributes; the merge also gives both frames a fresh 0..n-1 index.
train = train.merge(ainfo, on=['air_store_id'], how='left')
test = test.merge(ainfo, on=['air_store_id'], how='left')

In [12]:
# Remember the column set of `train` before any statistical features are merged in,
# and display it.
clear_feat = train.columns
clear_feat

Index(['air_store_id', 'visit_date', 'visitors', 'holiday', 'year', 'quarter',
       'month', 'day', 'dayofweek', 'dayofyear', 'weekofyear', 'weekofmonth',
       'dayincomp', 'day_before_holiday', 'day_after_holiday',
       'day_in_holiday', 'holidays_thisweek', 'holidays_lastweek',
       'holidays_nextweek', 'golden_week', 'day_in_golden_week',
       'day_before_golden_week', 'day_after_golden_week', 'air_store_id_le',
       'air_genre_name_le', 'air_area_name_le', 'coordinate_area_le'],
      dtype='object')

In [13]:
# Give the test rows a placeholder target so both frames share one schema, stack
# them into `df` (the frame the feature builders below read), then remove the
# placeholder from `test` again.
# NOTE(review): the placeholder 0 stays in `df` like an observed value, so any
# statistic whose window reaches into the test period includes these zeros
# (e.g. trend() aggregates every month up to month 16) -- confirm this is intended.
test['visitors'] = 0 
train.sort_index(axis=1, inplace=True)
test.sort_index(axis=1, inplace=True)
df = pd.concat([train, test]).reset_index(drop=True)
test.drop('visitors', axis=1, inplace=True)

# СТАТИСТИКИ

## ТРЕНДЫ 

TODO: усовершенствовать расчёт трендов.

In [14]:
def trend(SHIFT=1):
    """Build expanding "trend" statistics per store from the module-level ``df``.

    Every row gets a month index (``month_in_comp``): the calendar month for
    2016 dates and month + 12 for every other year. For each cut-off month
    ``m`` in 1..16 the statistics are computed over all rows with index <= m
    and attached to the rows whose index is ``m + SHIFT``.

    Parameters
    ----------
    SHIFT : int, default 1
        Distance in months between the last month used for a statistic and
        the month it is attached to.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with the columns ``air_store_id_le``,
        ``visit_date``, ``T_std_SH`` (std of visitors per store and holiday
        flag) and ``T_mean_SDH`` (mean of visitors per store, weekday and
        holiday flag). Rows without a matching statistic hold 0.
    """
    # Cut-off months iterated below.
    # NOTE(review): 16 presumably spans Jan 2016 - Apr 2017 -- confirm against the data.
    N_MONTHS = 16

    # Only the columns the statistics need. `.copy()` makes the frame
    # independent of `df` before a column is added to it.
    statistics = df.filter(items=['air_store_id_le', 'dayofweek', 'holiday',
                                  'visitors', 'visit_date']).copy()

    visit_dt = statistics['visit_date'].dt
    statistics['month_in_comp'] = (visit_dt.month + 12 * (visit_dt.year != 2016)).astype('int64')

    store_holiday_keys = ['air_store_id_le', 'holiday']
    store_dow_holiday_keys = ['air_store_id_le', 'dayofweek', 'holiday']

    SH = []
    SDH = []

    for last_month in range(1, N_MONTHS + 1):

        history = statistics.loc[statistics['month_in_comp'] <= last_month, :]
        target_month = last_month + SHIFT

        # `.std()` / `.mean()` + rename instead of a renaming dict passed to
        # `.agg`, which pandas removed in 1.0.
        tmp = history.groupby(store_holiday_keys, as_index=False)['visitors'].std() \
                     .rename(columns={'visitors': 'T_std_SH'})
        tmp['month_in_comp'] = target_month
        SH.append(tmp)

        tmp = history.groupby(store_dow_holiday_keys, as_index=False)['visitors'].mean() \
                     .rename(columns={'visitors': 'T_mean_SDH'})
        tmp['month_in_comp'] = target_month
        SDH.append(tmp)

    tmp = pd.concat(SH, axis=0).reset_index(drop=True)
    statistics = pd.merge(statistics, tmp, on=['air_store_id_le', 'month_in_comp', 'holiday'], how='left')

    tmp = pd.concat(SDH, axis=0).reset_index(drop=True)
    statistics = pd.merge(statistics, tmp, on=['air_store_id_le', 'month_in_comp', 'dayofweek', 'holiday'], how='left')

    del SH, SDH

    # Months that have no earlier history (and single-row groups for std) are NaN here.
    statistics.fillna(0, inplace=True)
    statistics.drop(['dayofweek', 'holiday', 'visitors', 'month_in_comp'], axis=1, inplace=True)

    gc.collect()

    return statistics

## НЕДАВНИЕ

### Предгруппировка

In [15]:
%%time

# Pre-compute the grouped visitor series that previous_w() / previous_d() shift and roll over.
# Each grouped_* object is the 'visitors' column grouped by the index levels named in its suffix.
# NOTE(review): this relies on how groupby.apply / asfreq build the result index;
# the behaviour is pandas-version dependent -- re-verify after any upgrade.
statistics = df.filter(items=['air_area_name_le', 'air_genre_name_le', 'coordinate_area_le','air_store_id_le', 'year',
                                  'month', 'day', 'dayofweek', 'holiday', 'visitors', 'visit_date'])

####################################################### BY GENRE AND DAY OF WEEK
# Visitors are first summed per (genre, weekday, date); each (genre, weekday) group is then
# re-indexed by date at a fixed 7-day frequency.
grouped = statistics.groupby(['air_genre_name_le', 'dayofweek', 'visit_date'], as_index=False)['visitors'].sum()
grouped_GD = grouped.groupby(['air_genre_name_le', 'dayofweek']).apply(lambda x: x.set_index('visit_date')\
                        .asfreq(freq='7D', how='start')).groupby(level=['air_genre_name_le', 'dayofweek'])['visitors']

# ####################################################### BY AREA AND DAY OF WEEK
# grouped = statistics.groupby(['air_area_name_le', 'dayofweek', 'visit_date'], as_index=False)['visitors'].sum()
# grouped_AD = grouped.groupby(['air_area_name_le', 'dayofweek']).apply(lambda x: x.set_index('visit_date')\
#                         .asfreq(freq='7D', how='start')).groupby(level=['air_area_name_le', 'dayofweek'])['visitors'] 

# ####################################################### BY GENRE AND HOLIDAY
# grouped = statistics.groupby(['air_genre_name_le', 'holiday', 'visit_date'], as_index=False)['visitors'].sum()
# grouped_GH = grouped.groupby(['air_genre_name_le', 'holiday']).apply(lambda x: x.set_index(['visit_date'])\
#                                 .asfreq(freq='1D', how='start')).groupby(level=['air_genre_name_le', 'holiday'])['visitors']

####################################################### BY STORE AND DAY OF WEEK
# One row every 7 days per (store, weekday).
grouped_SD = statistics.groupby(['air_store_id_le', 'dayofweek']).apply(lambda x: x.set_index('visit_date')\
                        .asfreq(freq='7D', how='start')).groupby(level=['air_store_id_le', 'dayofweek'])['visitors']

####################################################### BY STORE AND HOLIDAY
# One row per day per (store, holiday flag).
grouped_SH = statistics.groupby(['air_store_id_le', 'holiday']).apply(lambda x: x.set_index(['visit_date'])\
                                .asfreq(freq='1D', how='start')).groupby(level=['air_store_id_le', 'holiday'])['visitors']

####################################################### BY STORE, HOLIDAY AND DAY OF WEEK
# One row every 7 days per (store, holiday flag, weekday).
grouped_SDH = statistics.groupby(['air_store_id_le', 'holiday', 'dayofweek']).apply(lambda x: x.set_index(['visit_date'])\
                        .asfreq(freq='7D', how='start')).groupby(level=['air_store_id_le', 'holiday', 'dayofweek'])['visitors']

####################################################### BY STORE
# No asfreq here: each store keeps exactly the dates present in `df`, so a later
# shift()/rolling() on this object counts rows, not calendar days.
grouped_S = statistics.groupby('air_store_id_le').apply(lambda x: x.set_index('visit_date')) \
                                                .groupby(level=['air_store_id_le'])['visitors']

gc.collect()

CPU times: user 35.3 s, sys: 848 ms, total: 36.2 s
Wall time: 36.1 s


### Недельные

In [18]:
def previous_w(SHIFT_w=1):
    """Build lagged weekly rolling statistics for every row of the module-level ``df``.

    Reads the module-level groupings ``grouped_GD`` and ``grouped_SD`` (series
    spaced 7 days apart). Each series is shifted by ``SHIFT_w`` rows before the
    rolling window is applied, so a statistic only uses values at least
    ``SHIFT_w`` steps older than the row it is attached to.

    Parameters
    ----------
    SHIFT_w : int, default 1
        Number of rows (7-day steps) to shift each series before rolling.

    Returns
    -------
    pandas.DataFrame
        ``air_store_id_le``, ``visit_date`` and the ``P_*_weeks`` feature
        columns, with missing values replaced by 0.

    NOTE(review): the renaming-dict form of ``.agg`` used below is only accepted
    by older pandas releases -- confirm the pinned version.
    """
    
    statistics_w = df.filter(items=['air_area_name_le', 'air_genre_name_le', 'coordinate_area_le','air_store_id_le', 'year',
                                  'month', 'day', 'dayofweek', 'holiday', 'visitors', 'visit_date'])
    
    ####################################################### BY GENRE AND DAY OF WEEK
 
    ### 12 weeks
    tmp = grouped_GD.apply(lambda x: x.shift(SHIFT_w).rolling(window=12, min_periods=1)\
                        .agg({'P_mean_GD_12_weeks': 'mean'})).reset_index().drop(['dayofweek'], axis=1)

    statistics_w = pd.merge(statistics_w, tmp, on=['air_genre_name_le', 'visit_date'], how='left')

    ####################################################### BY AREA AND DAY OF WEEK
 
    ### 12 weeks
#     tmp = grouped_AD.apply(lambda x: x.SHIFT_w(SHIFT_w).rolling(window=12, min_periods=1)\
#                         .agg({'P_mean_AD_12_weeks': 'mean'})).reset_index().drop(['dayofweek'], axis=1)

#     statistics_w = pd.merge(statistics_w, tmp, on=['air_area_name_le', 'visit_date'], how='left')
    
     
    
    ####################################################### BY STORE AND DAY OF WEEK  ########################################
    
    ### 4 weeks
    tmp = grouped_SD.apply(lambda x: x.shift(SHIFT_w).rolling(window=4, min_periods=1)\
                              .agg({'P_mean_SD_4_weeks': 'mean'})).reset_index()\
                              .drop(['dayofweek'], axis=1)

    statistics_w = pd.merge(statistics_w, tmp, on=['air_store_id_le', 'visit_date'], how='left')

    ### 12 weeks
    tmp = grouped_SD.apply(lambda x: x.shift(SHIFT_w).rolling(window=12, min_periods=1)\
                        .agg({'P_mean_SD_12_weeks': 'mean', 'P_std_SD_12_weeks': 'std'}))\
                        .reset_index().drop(['dayofweek'], axis=1)

    statistics_w = pd.merge(statistics_w, tmp, on=['air_store_id_le', 'visit_date'], how='left')

    ### 24 weeks
    tmp = grouped_SD.apply(lambda x: x.shift(SHIFT_w).rolling(window=24, min_periods=1)\
            .agg({'P_mean_SD_24_weeks': 'mean', 'P_hmean_SD_24_weeks': hmean, 'P_std_SD_24_weeks': 'std'}))\
                        .reset_index().drop(['dayofweek'], axis=1) 
#                  'P_hmean_20_weeks_SD': hmean, 'P_median_20_weeks_SD': 'median', 'P_min_20_weeks_SD': 'min',
#                  'P_max_20_weeks_SD': 'max'}))\

    statistics_w = pd.merge(statistics_w, tmp, on=['air_store_id_le', 'visit_date'], how='left')
    
    # EWM    
#     tmp = grouped.apply(lambda x: ewm(pd.Series(x))).SHIFT_w(SHIFT_w*7).reset_index().drop(['dayofweek'], axis=1)\
#                                     .rename(columns={'visitors': 'EWM_SD'})

#     statistics_w = pd.merge(statistics_w, tmp, on=['air_store_id_le', 'visit_date'], how='left')

      
        

    # Rows without enough history have no statistic; they become 0.
    statistics_w.fillna(0, inplace=True)
    # Keep only the merge keys (air_store_id_le, visit_date) and the new P_ columns.
    statistics_w.drop(['air_area_name_le', 'air_genre_name_le', 'year', 'month', 'day', 'dayofweek', 'holiday', 'visitors', 'coordinate_area_le'], 
                    axis=1, inplace=True)
    gc.collect()
    return statistics_w

### Дневные

In [19]:
def previous_d(SHIFT_d=1):
    """Build lagged rolling statistics keyed by a day offset for every row of ``df``.

    Reads the module-level groupings ``grouped_SH``, ``grouped_SDH`` and
    ``grouped_S``. Each series is shifted by ``SHIFT_d`` rows before the
    rolling mean is taken.

    Parameters
    ----------
    SHIFT_d : int, default 1
        Number of rows to shift each series before rolling. Values 1..14
        produce per-store windows of 7, 14 and 28 rows; any other value
        produces windows of 28 and 56 rows.

    Returns
    -------
    pandas.DataFrame
        ``air_store_id_le``, ``visit_date`` and the ``P_*_days`` feature
        columns, with missing values replaced by 0. The set of ``P_mean_S_*``
        columns therefore depends on ``SHIFT_d``.

    NOTE(review): ``grouped_SDH`` is built at a 7-day frequency, so for it
    ``shift(SHIFT_d)`` and ``window=180`` count 7-day rows rather than days even
    though the column is named ``..._180_days``; ``grouped_S`` has no fixed
    frequency, so its windows count available rows -- confirm both are intended.
    """
    
    statistics_d = df.filter(items=['air_area_name_le', 'air_genre_name_le', 'coordinate_area_le','air_store_id_le', 'year',
                                  'month', 'day', 'dayofweek', 'holiday', 'visitors', 'visit_date'])
    
   
    
    ####################################################### BY GENRE AND HOLIDAY

#     tmp = grouped_GH.apply(lambda x: x.SHIFT_d(SHIFT_d*7).rolling(window=180, min_periods=1)\
#                               .agg({'P_mean_GH_180_days': 'mean'})).reset_index()

#     tmp = tmp.merge(idate[['visit_date', 'holiday']], indicator=True, how='left')
#     tmp = tmp[tmp._merge=='both']
#     tmp.drop(['_merge', 'holiday'], axis=1, inplace=True)

#     statistics_d = pd.merge(statistics_d, tmp, on=['air_genre_name_le', 'visit_date'], how='left')    
    


    ####################################################### BY STORE AND HOLIDAY

    ### 180 days 
    tmp = grouped_SH.apply(lambda x: x.shift(SHIFT_d).rolling(window=180, min_periods=1)\
                              .agg({'P_mean_SH_180_days': 'mean'})).reset_index()

    # Keep only the rows whose group's holiday flag equals the calendar's flag for
    # that date (the merge joins on the shared columns), then drop the helper columns.
    tmp = tmp.merge(idate[['visit_date', 'holiday']], indicator=True, how='left')
    tmp = tmp[tmp._merge=='both']
    tmp.drop(['_merge', 'holiday'], axis=1, inplace=True)

    statistics_d = pd.merge(statistics_d, tmp, on=['air_store_id_le', 'visit_date'], how='left')
    
    ####################################################### BY STORE, HOLIDAY AND DAY OF WEEK

    ### 180 days
    tmp = grouped_SDH.apply(lambda x: x.shift(SHIFT_d).rolling(window=180, min_periods=1)\
                                  .agg({'P_mean_SDH_180_days': 'mean'})).reset_index()

    # Same filter as above, additionally matching the calendar's weekday.
    tmp = tmp.merge(idate[['visit_date', 'holiday', 'dayofweek']], indicator=True, how='left')
    tmp = tmp[tmp._merge=='both']
    tmp.drop(['_merge', 'holiday', 'dayofweek'], axis=1, inplace=True)
    statistics_d = pd.merge(statistics_d, tmp, on=['air_store_id_le', 'visit_date'], how='left')
   

    ####################################################### PREVIOUS DAYS BY STORE

    # Short windows only for offsets 1..14; larger offsets use longer windows.
    if SHIFT_d in range(1,15):
        for i in [7, 14, 28]:
            statistics_d = pd.merge(statistics_d, grouped_S.apply(lambda x: x.shift(SHIFT_d).rolling(window=i, min_periods=1) \
                                            .agg({'P_mean_S_'+str(i)+'_days': 'mean'})) \
                                            .reset_index(), on=['air_store_id_le', 'visit_date'], how='left') 
    else:
        for i in [28, 56]:
            statistics_d = pd.merge(statistics_d, grouped_S.apply(lambda x: x.shift(SHIFT_d).rolling(window=i, min_periods=1) \
                                            .agg({'P_mean_S_'+str(i)+'_days': 'mean'})) \
                                            .reset_index(), on=['air_store_id_le', 'visit_date'], how='left')         
        

    # Rows without enough history have no statistic; they become 0.
    statistics_d.fillna(0, inplace=True)
    # Keep only the merge keys (air_store_id_le, visit_date) and the new P_ columns.
    statistics_d.drop(['air_area_name_le', 'air_genre_name_le', 'year', 'month', 'day', 'dayofweek', 'holiday', 'visitors', 'coordinate_area_le'], 
                    axis=1, inplace=True)

    gc.collect()
    
    return statistics_d

# Валидация

In [21]:
# train = train[train.visit_date>='2016-02-01'].reset_index(drop=True)

In [22]:
# test_idx = []
# for i in range(23,31):
#     test_idx.append(train[(train['visit_date']==str(datetime(2017, 4, i).date()))].index.values)
# for i in range(1,32):
#     test_idx.append(train[(train['visit_date']==str(datetime(2017, 5, i).date()))].index.values)

In [23]:
# Fit on everything up to 2017-03-11 ...
train_idx = train.index[train['visit_date'] <= '2017-03-11'].values

# ... and validate on the following 39 days (2017-03-12 .. 2017-04-19), one index array per day.
test_idx = [train.index[train['visit_date'] == str(day.date())].values
            for day in pd.date_range('2017-03-12', '2017-04-19')]

feat_to_drop = ['air_store_id', 'visit_date']

unwanted = []
# unwanted.extend(train.columns[train.columns.str.startswith('P_std')])

# train['visitors'] = np.log1p(train['visitors'])

In [24]:
# Remove trend columns left over from an earlier run so the cell can be re-executed.
train.drop([name for name in train.columns if name.startswith('T_')], axis=1, inplace=True)
test.drop([name for name in test.columns if name.startswith('T_')], axis=1, inplace=True)
print(train.shape, test.shape)

# Attach the trend statistics to both frames.
X_buff_T = trend(SHIFT=1)
train = train.merge(X_buff_T, on=['air_store_id_le', 'visit_date'], how='left')
test = test.merge(X_buff_T, on=['air_store_id_le', 'visit_date'], how='left')
del X_buff_T

(249691, 27) (30739, 26)


In [25]:
%%time
np.random.seed(42)

params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression',
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'metric': 'rmse',
    'num_threads': 4
}

MAX_ROUNDS = 2500

print('Start CV...\n')

# One entry per validation day, in day order.
y_pred = []
y_val = []

for week in range(1, 7):

    # Position in y_val / y_pred of the first day that belongs to this week.
    week_start = (week - 1) * 7

    # Weekly lag features depend only on the week offset: build them once per week.
    X_buff_w = previous_w(SHIFT_w=week)

    for day in range(1, 8):

        # Forecast horizon in days (1-based); one model is fitted per horizon.
        delta = week_start + day

        X_train = train.iloc[train_idx, :].drop('visitors', axis=1)
        X_test = train.iloc[test_idx[delta-1], :].drop('visitors', axis=1)
        y_train, y_test = train['visitors'].iloc[train_idx].values, train['visitors'].iloc[test_idx[delta-1]].values

        X_train = pd.merge(X_train, X_buff_w, on=['air_store_id_le', 'visit_date'], how = 'left')
        X_test = pd.merge(X_test, X_buff_w, on=['air_store_id_le', 'visit_date'], how = 'left')

        # Daily lag features depend on the exact horizon.
        X_buff_d = previous_d(SHIFT_d=delta)
        X_train = pd.merge(X_train, X_buff_d, on=['air_store_id_le', 'visit_date'], how = 'left')
        X_test = pd.merge(X_test, X_buff_d, on=['air_store_id_le', 'visit_date'], how = 'left')
        del X_buff_d

        # Identical, alphabetically ordered feature columns for both frames.
        X_train = X_train.drop(feat_to_drop, axis=1).sort_index(axis=1)
        X_test = X_test.drop(feat_to_drop, axis=1).sort_index(axis=1)

        lgb_train = lgb.Dataset(X_train, label=y_train)
        lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

        # NOTE(review): the fold used for early stopping is also the fold that is
        # scored below, so the reported error is optimistic.
        lgbm = lgb.train(params, lgb_train, num_boost_round=MAX_ROUNDS,
                             valid_sets=lgb_eval, early_stopping_rounds=50, verbose_eval=1000)

        y_predict = lgbm.predict(X_test, num_iteration=lgbm.best_iteration or MAX_ROUNDS)
        print('RMSLE is:', round(RMSLE(y_test, y_predict), 6))

        y_val.append(y_test)
        y_pred.append(y_predict)

        importance_col = X_train.columns.values.tolist()

        # The last week is shorter than 7 days: stop once every validation day is scored.
        if delta == len(test_idx):
            break

    # Gain importances of the last model fitted in this week.
    print("\n".join(("%s: %d" % x) for x in sorted(zip(importance_col, lgbm.feature_importance("gain")),
                key=lambda x: x[1], reverse=True)))

    # Score exactly the days of this week. Slicing from delta-7 would pull days of
    # the previous week into the short final week.
    print('RMSLE WEEK:', round(RMSLE(np.concatenate(y_val[week_start:delta]), np.concatenate(y_pred[week_start:delta])), 6))

print('RMSLE VALIDATE:', round(RMSLE(np.concatenate(y_val), np.concatenate(y_pred)), 6))

Start CV...

Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[446]	valid_0's rmse: 0.476666
RMSLE is: 0.476666
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[492]	valid_0's rmse: 0.506889
RMSLE is: 0.506889
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[456]	valid_0's rmse: 0.521562
RMSLE is: 0.521562
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[204]	valid_0's rmse: 0.490928
RMSLE is: 0.490928
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[633]	valid_0's rmse: 0.491644
RMSLE is: 0.491644
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[262]	valid_0's rmse: 0.428341
RMSLE is: 0.428341
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[193]	va

KeyboardInterrupt: 

In [None]:
# Per-day RMSLE for every validation day scored so far. Iterating over the lists
# themselves (not a fixed 39) keeps the cell usable after an interrupted CV run.
for day_truth, day_pred in zip(y_val, y_pred):
    print(round(RMSLE(day_truth, day_pred), 6))

In [54]:
# RMSLE over every validation day from position 35 onward (the short final week).
round(RMSLE(np.concatenate(y_val[35:]), np.concatenate(y_pred[35:])), 6)

0.52296299999999996

In [49]:
# Debug view: raw validation targets (log1p scale) from position 35 onward.
# NOTE(review): this dumps whole arrays into the notebook output; consider removing the cell.
y_val[35:44]

[array([ 0.69314718,  0.69314718,  3.52636052,  1.94591015,  3.4657359 ,
         1.09861229,  2.48490665,  3.68887945,  3.93182563,  4.15888308,
         2.19722458,  1.09861229,  2.07944154,  3.87120101,  4.04305127,
         1.94591015,  2.30258509,  2.99573227,  3.97029191,  2.56494936,
         3.93182563,  3.21887582,  4.75359019,  3.04452244,  3.40119738,
         2.19722458,  2.63905733,  3.66356165,  2.30258509,  2.30258509,
         3.17805383,  3.13549422,  4.89034913,  1.09861229,  2.83321334,
         3.29583687,  2.83321334,  1.94591015,  2.7080502 ,  3.8286414 ,
         4.00733319,  2.89037176,  1.94591015,  2.56494936,  3.29583687,
         2.56494936,  1.60943791,  3.49650756,  3.33220451,  2.83321334,
         3.04452244,  3.73766962,  3.97029191,  3.13549422,  3.63758616,
         2.94443898,  3.13549422,  3.17805383,  3.36729583,  3.95124372,
         3.91202301,  3.4339872 ,  3.66356165,  1.09861229,  3.52636052,
         3.76120012,  2.89037176,  3.33220451,  3.4

In [23]:
# Smallest validation prediction, converted back from the log1p scale to visitor counts.
np.expm1(np.concatenate(y_pred)).min()

1.7942475437038712

In [None]:
valid = pd.DataFrame()
valid['lgb'] = np.concatenate(y_pred)
# The predictions are on the log1p scale, so "at least 1 visitor" (the floor the
# submission cell applies to the counts) is log1p(1) here, not 1.
valid['lgb'] = valid['lgb'].clip(lower=np.log1p(1))

In [28]:
# Overall validation RMSLE after flooring the predictions stored in `valid['lgb']`.
round(RMSLE(np.concatenate(y_val), valid['lgb'].values),6)

0.51554100000000003

# Обучение 

In [21]:
# One array of row labels per forecast day, 2017-04-23 .. 2017-05-31 (8 + 31 = 39 days).
# The labels must come from `test`: the cells below use them to index `test`,
# and `train` holds no rows in this period, so it would yield only empty arrays.
test_idx = [test.index[test['visit_date'] == day].values
            for day in pd.date_range('2017-04-23', '2017-05-31')]

feat_to_drop = ['air_store_id','visit_date']

unwanted=[]
# unwanted.extend(train.columns[train.columns.str.startswith('P_std')])

# train['visitors'] = np.log1p(train['visitors'])

In [20]:
# Remove trend columns left over from an earlier run so the cell can be re-executed.
train.drop(train.columns[train.columns.str.startswith('T_')], axis=1, inplace=True)
test.drop(test.columns[test.columns.str.startswith('T_')], axis=1, inplace=True)
print(train.shape, test.shape)

# trend() takes only SHIFT and returns one frame keyed by (air_store_id_le, visit_date);
# merge it into both frames, exactly as the validation section does.
X_buff_T = trend(SHIFT=1)
train = pd.merge(train, X_buff_T, on=['air_store_id_le', 'visit_date'], how = 'left')
test = pd.merge(test, X_buff_T, on=['air_store_id_le', 'visit_date'], how = 'left')
del X_buff_T

(249691, 27) (30739, 26)


In [25]:
%%time
np.random.seed(42)

params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression',
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'metric': 'rmse',
    'num_threads': 4
}

# The forecast period has no labels, so there is nothing to early-stop on:
# every model is trained for exactly MAX_ROUNDS rounds.
# NOTE(review): choose this value from the best iterations seen in the CV run.
MAX_ROUNDS = 1500

print('Start CV...\n')

# One array of predictions (log1p scale) per forecast day, in day order.
y_pred=[]

merge_keys = ['air_store_id_le', 'visit_date']

# Forecast days 2017-04-23 .. 2017-05-31, the same calendar `test_idx` is built from.
forecast_days = pd.date_range('2017-04-23', '2017-05-31')

# Training features and labels are the same for every horizon.
X_fit_base = train.drop('visitors', axis=1)
y_train = train['visitors'].values

for week in range(1,7):

    # Weekly lag features depend only on the week offset: build them once per week.
    X_buff_w = previous_w(SHIFT_w=week)

    for day in range(1,8):

        # Forecast horizon in days (1-based); one model is fitted per horizon.
        delta = (week-1)*7 + day

        # Daily lag features depend on the exact horizon.
        X_buff_d = previous_d(SHIFT_d=delta)

        X_train = X_fit_base.merge(X_buff_w, on=merge_keys, how='left') \
                            .merge(X_buff_d, on=merge_keys, how='left')
        # Rows of the forecast day, in the order they appear in `test`
        # (a left merge keeps that order, so predictions line up with these rows).
        X_test = test[test['visit_date'] == forecast_days[delta-1]] \
                            .drop('visitors', axis=1, errors='ignore') \
                            .merge(X_buff_w, on=merge_keys, how='left') \
                            .merge(X_buff_d, on=merge_keys, how='left')
        del X_buff_d

        # Identical, alphabetically ordered feature columns for both frames.
        X_train = X_train.drop(feat_to_drop, axis=1).sort_index(axis=1)
        X_test = X_test.drop(feat_to_drop, axis=1).sort_index(axis=1)

        lgb_train = lgb.Dataset(X_train, label=y_train)
        lgbm = lgb.train(params, lgb_train, num_boost_round=MAX_ROUNDS)

        y_pred.append(lgbm.predict(X_test))

        importance_col = X_train.columns.values.tolist()

        # The last week is shorter than 7 days: stop after the final forecast day.
        if delta == len(forecast_days):
            break

    # Gain importances of the last model fitted in this week.
    print("\n".join(("%s: %d" % x) for x in sorted(zip(importance_col, lgbm.feature_importance("gain")),
                key=lambda x: x[1], reverse=True)))

Patagonia...

P_mean_SD_12_weeks: 488724
P_mean_SD_24_weeks: 192007
P_mean_SD_4_weeks: 61359
P_mean_S_14_days: 19588
P_mean_S_7_days: 16303
T_mean_SDH: 15181
P_hmean_SD_24_weeks: 12075
dayofyear: 9315
P_mean_SH_180_days: 8605
P_mean_S_28_days: 7907
dayofweek: 6428
P_mean_SDH_180_days: 4979
T_std_SH: 4456
dayincomp: 4227
day_before_holiday: 4027
coordinate_area_le: 3822
P_std_SD_24_weeks: 3673
day_in_holiday: 3666
day: 3552
air_store_id_le: 3520
air_genre_name_le: 3451
P_std_SD_12_weeks: 3154
P_mean_GD_12_weeks: 2710
holidays_thisweek: 1938
air_area_name_le: 1441
day_after_holiday: 1384
weekofyear: 982
holidays_lastweek: 825
holiday: 548
month: 473
holidays_nextweek: 387
weekofmonth: 344
day_in_golden_week: 145
quarter: 78
day_after_golden_week: 75
year: 19
day_before_golden_week: 10
golden_week: 0
P_mean_SD_12_weeks: 476324
P_mean_SD_24_weeks: 191799
P_mean_SD_4_weeks: 38129
P_hmean_SD_24_weeks: 29378
P_mean_S_14_days: 17345
dayofyear: 11561
T_mean_SDH: 11429
P_mean_SH_180_days: 10169


In [26]:
# Smallest test prediction, converted back from the log1p scale to visitor counts.
np.expm1(np.concatenate(y_pred)).min()

1.2150060692343396

In [27]:
# Write every available block of predictions back to its rows, converted from the
# log1p scale to visitor counts. Pairing the two lists (not a fixed count of 6)
# stops later blocks from being dropped and then filled with the floor value.
for block_rows, block_pred in zip(test_idx, y_pred):
    test.loc[block_rows, 'visitors'] = np.expm1(block_pred)

submission = pd.read_csv('data/sample_submission.csv')
submission['air_store_id'] = submission['id'].map(lambda x: '_'.join(x.split('_')[:-1]))
submission['visit_date'] = submission['id'].map(lambda x: x.split('_')[-1])
submission['visit_date'] = pd.to_datetime(submission['visit_date'])
submission.drop(['visitors'], axis=1, inplace=True)

submission = pd.merge(submission, test[['air_store_id', 'visit_date', 'visitors']], on=['air_store_id', 'visit_date'], how='left')
# Ids without a prediction (rows removed from `test` earlier) get 0, and every
# value is then floored at 1 visitor. Plain assignment is used because an
# in-place fillna on a selected column is not guaranteed to update the frame.
submission['visitors'] = submission['visitors'].fillna(0).clip(lower=1)
submission[['id', 'visitors']].to_csv('patagonia.csv', index=False, float_format='%.6f')
test.drop(['visitors'], axis=1, inplace=True)

# XGBOOST CV

In [49]:
feats = ['air_store_id_le', 'coordinate_area_le']

def cum_code(X_trn, X_tst, feat_lst, target):
    """Replace categorical columns by a cumulative target encoding.

    For each column in ``feat_lst`` a train row receives the sum of the targets
    of the earlier rows with the same category, divided by the number of rows
    of that category seen so far (the current row included in the count). A
    test row receives the mean of the train encodings of its category, or NaN
    for a category absent from train.

    Both frames are modified in place (the encoded columns are added and the
    original columns removed) and returned as ``(X_trn, X_tst)``.
    """
    X_trn['target'] = target

    for col in feat_lst:
        enc_name = col + '_enc'
        target_by_level = X_trn.groupby(col)['target']

        prior_sum = target_by_level.cumsum() - X_trn['target']
        rows_seen = target_by_level.cumcount() + 1
        X_trn[enc_name] = prior_sum / rows_seen

        level_means = X_trn.groupby(col)[enc_name].mean().to_dict()
        X_tst[enc_name] = X_tst[col].map(level_means)

    X_trn.drop('target', axis=1, inplace=True)
    for frame in (X_trn, X_tst):
        frame.drop(feat_lst, axis=1, inplace=True)

    return X_trn, X_tst

In [None]:
# Alternative XGBoost configuration, written as a single literal.
xgb_2_params = {
    'max_depth': 4,
    'learning_rate': 0.1,
    'n_estimators': 400,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 6,
    'gamma': 10,
    'reg_alpha': 8,
    'reg_lambda': 1.3,
    'seed': 1,
    'n_jobs': -1,
}

In [29]:
%%time
np.random.seed(42)

params = {
    'max_depth': 6,
    'learning_rate': 0.03,
    'objective': 'reg:linear',
    'gamma': 0,
    'min_child_weight': 20,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 1,
    'seed': 42,
    'eval_metric': 'rmse',
    'n_jobs': -1
}

# Upper bound only; early stopping decides the actual number of rounds.
MAX_ROUNDS = 10000

print('Start CV...\n')

# One entry per validation week: predictions and the matching true targets.
y_pred = []
y_val = []

for week in range(1, 7):

    print('==================================== WEEK {} ===================================='.format(week))

    # `shift` handed to trend(): 1 for the first two weeks, 2 afterwards.
    shift_TREND = 1 if week in [1, 2] else 2

    # drop() returns new frames, so nothing below writes through to `train`.
    X_train = train.iloc[train_idx, :].drop('visitors', axis=1)
    X_test = train.iloc[test_idx[week-1], :].drop('visitors', axis=1)
    y_train = train['visitors'].iloc[train_idx].values
    y_test = train['visitors'].iloc[test_idx[week-1]].values

    X_train, X_test = trend(df_train=X_train, df_test=X_test, shift=shift_TREND)
    X_train, X_test = previous(df_train=X_train, df_test=X_test, shift=week)

    # Disabled experiments, kept for reference:
    #     X_train.drop(unwanted, axis=1, inplace=True)
    #     X_test.drop(unwanted, axis=1, inplace=True)
    #     X_train, X_test = cum_code(X_train, X_test, feats, y_train)

    # Remove the key columns and put both frames into the same column order.
    X_train = X_train.drop(feat_to_drop, axis=1).sort_index(axis=1)
    X_test = X_test.drop(feat_to_drop, axis=1).sort_index(axis=1)

    xgb_train = xgb.DMatrix(X_train, y_train)
    xgb_eval = xgb.DMatrix(X_test, y_test)
    watchlist = [(xgb_train, 'train'), (xgb_eval, 'eval')]

    gbm = xgb.train(params, xgb_train, num_boost_round=MAX_ROUNDS, evals=watchlist,
                    early_stopping_rounds=50, verbose_eval=100)

    # xgb.train hands back the booster from the last round, 50 rounds past the best one.
    # Predict with the best iteration so the reported score matches the early-stopping log.
    if hasattr(gbm, 'best_ntree_limit'):
        # Older xgboost releases expose the tree limit directly.
        y_predict = gbm.predict(xgb_eval, ntree_limit=gbm.best_ntree_limit)
    else:
        # Newer releases dropped `ntree_limit` in favour of `iteration_range`.
        y_predict = gbm.predict(xgb_eval, iteration_range=(0, gbm.best_iteration + 1))
    print('RMSLE is:', round(RMSLE(y_test, y_predict), 6))

    y_val.append(y_test)
    y_pred.append(y_predict)

    # Split-count feature importance, most used feature first.
    print("\n".join(("%s: %d" % x) for x in sorted(gbm.get_fscore().items(), key=lambda x:x[1], reverse=True)))

print('RMSLE VALIDATE:', round(RMSLE(np.concatenate(y_val), np.concatenate(y_pred)), 6))

Start CV...

[0]	train-rmse:2.36643	eval-rmse:2.40561
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.533589	eval-rmse:0.518282
[200]	train-rmse:0.514336	eval-rmse:0.490197
[300]	train-rmse:0.510115	eval-rmse:0.487766
[400]	train-rmse:0.506769	eval-rmse:0.486348
[500]	train-rmse:0.503568	eval-rmse:0.48574
[600]	train-rmse:0.500599	eval-rmse:0.485251
[700]	train-rmse:0.498085	eval-rmse:0.484882
[800]	train-rmse:0.495579	eval-rmse:0.484588
[900]	train-rmse:0.49341	eval-rmse:0.48446
[1000]	train-rmse:0.491249	eval-rmse:0.484306
Stopping. Best iteration:
[969]	train-rmse:0.491892	eval-rmse:0.484267

RMSLE is: 0.484312
T_std_SH: 2762
P_mean_SH_180_days: 2720
P_std_SD_12_weeks: 2557
air_store_id_le: 2515
P_std_SD_24_weeks: 2287
P_mean_GD_12_weeks: 2248
coordinate_area_le: 2048
P_mean_SD_4_weeks: 1994
P_mean_S_7_days: 1829
P_mean_S_14_days: 1814
dayofyear: 1796
P_mean_SD_12_weeks:

In [32]:
# Store the stacked XGBoost validation predictions and floor them at 1.
# NOTE(review): y_pred is still in log1p space here, while the submission cell
# applies the same floor of 1 after expm1 - confirm the log-space floor is intended.
valid['xgb'] = np.concatenate(y_pred)
valid['xgb'] = valid['xgb'].clip(lower=1)

In [36]:
# Score of the plain row-wise average of every column currently stored in `valid`.
round(RMSLE(np.concatenate(y_val), valid.mean(axis=1).values), 6) 

0.51571500000000003

In [None]:
# Row-wise mean of `df`, stored under 'mean' in both `valid` and `df`.
# NOTE(review): `df` is not defined in the visible part of the notebook - confirm which frame is meant.
valid['mean'] = df['mean'] = df.mean(axis=1)

# Обучение XGB

In [40]:
# Inclusive date bounds of the six test blocks: five full weeks plus a final four-day block.
test_week_bounds = [
    ('2017-04-23', '2017-04-29'),
    ('2017-04-30', '2017-05-06'),
    ('2017-05-07', '2017-05-13'),
    ('2017-05-14', '2017-05-20'),
    ('2017-05-21', '2017-05-27'),
    ('2017-05-28', '2017-05-31'),
]

# Index labels of the `test` rows that fall into each block.
test_idx = [
    test[(test['visit_date']>=first_day) & (test['visit_date']<=last_day)].index.values
    for first_day, last_day in test_week_bounds
]

# Key columns that are removed before the matrices go to the model.
feat_to_drop = ['air_store_id','visit_date']

unwanted=[]
# unwanted.extend(test.columns[test.columns.str.startswith('P_std')])
# unwanted.extend(test.columns[test.columns.str.startswith('T_std')])

# train['visitors'] = np.log1p(train['visitors'])

In [41]:
%%time
# Final XGBoost fit: one model per forecast week, trained on all of `train`
# and used to predict the matching block of `test` rows.
np.random.seed(42)

params = {
    'max_depth': 6,
    'learning_rate': 0.03,
    'objective': 'reg:linear',
    'gamma': 0,
    'min_child_weight': 20,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 1,
    'seed': 42,
    'eval_metric': 'rmse',
    'n_jobs': -1
}

# Fixed number of boosting rounds: no evaluation set is passed below, so there is no early stopping.
MAX_ROUNDS = 1100

print('Patagonia...\n')

# One array of predictions per week, in the same order as test_idx.
y_pred=[]

for week in range(1,7):
    
    print('==================================== WEEK {} ===================================='.format(week))
    
    # NOTE(review): the CV cell uses shift 1 for weeks 1 AND 2, here only week 1 does - confirm which rule is intended.
    if week == 1:
        shift_TREND=1
    else:
        shift_TREND=2
        
    # train.copy() keeps the in-place drop below away from the global `train`.
    X_train, X_test = train.copy(), test.iloc[test_idx[week-1], :]

    X_train.drop('visitors', axis=1, inplace=True)
    y_train = train['visitors'].values

    # trend() and previous() are defined elsewhere in the notebook; both return the two frames.
    X_train, X_test = trend(df_train = X_train, df_test = X_test, shift=shift_TREND)  
    X_train, X_test = previous(df_train = X_train, df_test = X_test, shift=week)
    
    X_train.drop(feat_to_drop, axis=1, inplace=True)
    X_test.drop(feat_to_drop, axis=1, inplace=True)
    
#     X_train.drop(unwanted, axis=1, inplace=True)
#     X_test.drop(unwanted, axis=1, inplace=True)
   
    # Sort columns by name so both frames share one column order.
    X_train.sort_index(axis=1, inplace=True)
    X_test.sort_index(axis=1, inplace=True)
    
    xgb_train = xgb.DMatrix(X_train, y_train)
    
    gbm = xgb.train(params, xgb_train, num_boost_round=MAX_ROUNDS)    
    
    y_pred.append(gbm.predict(xgb.DMatrix(X_test)))

    # Split-count feature importance, most used feature first.
    print("\n".join(("%s: %d" % x) for x in sorted(gbm.get_fscore().items(), key=lambda x:x[1], reverse=True)))

Patagonia...

T_std_SH: 3153
P_mean_SH_180_days: 2908
P_std_SD_12_weeks: 2801
air_store_id_le: 2758
P_std_SD_24_weeks: 2514
P_mean_GD_12_weeks: 2423
coordinate_area_le: 2289
P_mean_SD_4_weeks: 2243
P_mean_S_7_days: 2071
P_mean_S_14_days: 2055
dayofyear: 2042
P_mean_SD_12_weeks: 2016
dayincomp: 1998
P_mean_S_28_days: 1957
P_hmean_SD_24_weeks: 1874
P_mean_SD_24_weeks: 1627
T_mean_SDH: 1620
P_mean_SDH_180_days: 1526
day: 1512
air_genre_name_le: 1305
dayofweek: 1273
air_area_name_le: 736
day_in_holiday: 512
holidays_thisweek: 378
weekofyear: 330
day_before_holiday: 326
day_after_holiday: 265
holidays_lastweek: 235
holidays_nextweek: 235
month: 104
day_in_golden_week: 99
weekofmonth: 85
day_after_golden_week: 78
holiday: 70
year: 14
quarter: 12
day_before_golden_week: 10
golden_week: 2
air_store_id_le: 3316
T_std_SH: 3191
P_mean_SH_180_days: 2999
P_std_SD_12_weeks: 2774
coordinate_area_le: 2612
P_mean_GD_12_weeks: 2610
P_std_SD_24_weeks: 2585
P_mean_SD_4_weeks: 2226
dayincomp: 2151
dayofyea

# НЕ ИСПОЛЬЗУЕТСЯ

# Бронирования

1. Можно посчитать разницу в бронировании в днях и другие статистики
2. Бронирования день в день в трейне и экстраполировать на тест

In [8]:
# Translate hpg store ids in `hr` into air store ids through `relation`.
# Stores without a match map to NaN and are removed by the dropna below.
hpg_to_air = relation.set_index('hpg_store_id').to_dict()['air_store_id']
hr['hpg_store_id'] = hr['hpg_store_id'].map(hpg_to_air)
hr.rename(columns={'hpg_store_id': 'air_store_id'}, inplace=True)
hr.dropna(inplace=True)

# Append the translated hpg reservations to the air reservations and move the
# reserved head count to log1p space.
ar = pd.concat([ar, hr])
ar['reserve_visitors'] = np.log1p(ar['reserve_visitors'].values)
ar.shape

(120561, 4)

In [13]:
# Add reservation data: total reserve_visitors per store and visit timestamp.
# NOTE(review): the join key is `visit_datetime` renamed to `visit_date`; it only
# matches rows if the timestamps were truncated to dates beforehand - confirm.
reserved_per_visit = ar.groupby(['air_store_id', 'visit_datetime'])['reserve_visitors'].sum()
tmp = reserved_per_visit.reset_index().rename(columns={'visit_datetime':'visit_date'})

merge_keys = ['air_store_id', 'visit_date']
train = pd.merge(train, tmp, on=merge_keys, how='left').fillna(0)
test = pd.merge(test, tmp, on=merge_keys, how='left').fillna(0)

In [15]:
# Number of reservations placed per store and reservation timestamp.
# Author's note: every value is zero on the test set, so the feature is pointless.
placed_per_stamp = ar.groupby(['air_store_id', 'reserve_datetime'])['reserve_visitors'].count()
tmp = placed_per_stamp.reset_index().rename(
    columns={'reserve_datetime':'visit_date', 'reserve_visitors':'reserve_count'})

merge_keys = ['air_store_id', 'visit_date']
train = pd.merge(train, tmp, on=merge_keys, how='left').fillna(0)
test = pd.merge(test, tmp, on=merge_keys, how='left').fillna(0)

In [6]:
# idx_split = len(train)
# full = pd.concat([train.drop(['visitors', 'date'], axis=1), test.drop('date', axis=1)], axis=0)
# full = pd.merge(full, ainfo.drop(['latitude', 'longitude'], axis=1), on=['air_store_id'], how='left')
# le = LabelEncoder()
# for col in full.columns:
#     full[col] = le.fit_transform(full[col])
#     train[col] = full[col].iloc[:idx_split].values.astype(np.uint16)
#     test[col] = full[col].iloc[idx_split:].values.astype(np.uint16)
# del full

# Логистическая регрессия

In [64]:
# train_log = pd.get_dummies(train_log, columns=['dayofweek', 'air_store_id_le'], prefix='dum', sparse=False)

# Time-based hold-out: rows up to 2017-03-14 train the models, rows from 2017-03-15 score them.
fit_rows = train_log['visit_date']<='2017-03-14'
holdout_rows = train_log['visit_date']>='2017-03-15'

log_features = train_log.drop(['visitors', 'visit_date'], axis=1)
log_target = train_log['visitors']

train_x = log_features[fit_rows].values
train_y = log_target[fit_rows].values
test_x = log_features[holdout_rows].values
test_y = log_target[holdout_rows].values

In [65]:
# Logistic regression baseline: fit on the training split, predict the hold-out split.
log_regressor = LogisticRegression(random_state = 1).fit(train_x, train_y)
lr_predictions = log_regressor.predict(test_x)

In [66]:
# Ridge classifier baseline on the same split.
ridge_classifier = RidgeClassifier(random_state = 1).fit(train_x, train_y)
ridge_predictions = ridge_classifier.predict(test_x)

In [67]:
# SGD classifier baseline on the same split.
SGD_classifier = SGDClassifier(random_state = 1).fit(train_x, train_y)
SGD_predictions = SGD_classifier.predict(test_x)

In [68]:
# Hold-out accuracy of the three baselines, in the order they were fitted.
for holdout_pred in (lr_predictions, ridge_predictions, SGD_predictions):
    print (accuracy_score(test_y, holdout_pred))

0.873574025323
0.874294847687
0.874608248715


In [None]:
# Mean visitors per store, year, month and day of week; the column is renamed to the feature name.
tmp = statistics.groupby(['air_store_id_le', 'year', 'month', 'dayofweek'], as_index=False)['visitors'].mean()\
                                                                    .rename(columns={'visitors': 'prev_1_month_dow_visitors'})

# Re-index each store's rows by (year, month, dayofweek) inside a per-store group.
grouped = tmp.groupby('air_store_id_le').apply(lambda x: x.set_index(['year', 'month', 'dayofweek'])) 

grouped_con=[]
for i in grouped['air_store_id_le'].unique():
    grouped_tmp = grouped[grouped['air_store_id_le']==i]
    # Number of distinct weekdays present for this store; used as the shift distance below.
    # NOTE(review): presumably this moves each value forward by one month of rows - confirm for stores with missing weekdays in some months.
    bias=grouped_tmp.index.get_level_values('dayofweek').nunique()
    grouped_con.append(grouped_tmp.groupby(level=['air_store_id_le'])['prev_1_month_dow_visitors']\
                       .apply(lambda x: x.shift(bias)).reset_index())
tmp = pd.concat(grouped_con, axis=0)
tmp.reset_index(drop=True, inplace=True)
statistics = pd.merge(statistics, tmp, on=['air_store_id_le', 'year', 'month', 'dayofweek'], how='left')

# Per-store mean over the previous three months, by day of week
tmp = statistics.groupby(['air_store_id_le', 'year', 'month', 'dayofweek'], as_index=False)['visitors'].mean()\
                                                                    .rename(columns={'visitors': 'prev_3_month_dow_visitors'})

grouped = tmp.groupby('air_store_id_le').apply(lambda x: x.set_index(['year', 'month', 'dayofweek'])) 

grouped_con=[]
for i in grouped['air_store_id_le'].unique():
    grouped_tmp = grouped[grouped['air_store_id_le']==i]
    bias=grouped_tmp.index.get_level_values('dayofweek').nunique()
    # Same shift as above, followed by a rolling mean over bias*3 rows.
    # NOTE(review): the dict passed to agg() renames the output column; recent pandas versions reject this form - confirm the pandas version in use.
    grouped_con.append(grouped_tmp.groupby(level=['air_store_id_le'])['prev_3_month_dow_visitors']\
                       .apply(lambda x: x.shift(bias).rolling(window=(bias*3), min_periods=1) \
                                                    .agg({'mean_previous_3_month': 'mean'})).reset_index())
tmp = pd.concat(grouped_con, axis=0)
tmp.reset_index(drop=True, inplace=True)
statistics = pd.merge(statistics, tmp, on=['air_store_id_le', 'year', 'month', 'dayofweek'], how='left')

In [None]:
# tmp = train.groupby(['air_store_id_le'], as_index=False)['visitors'].mean().rename(columns={'visitors': 'mean_S_visitors'})
# train = pd.merge(train, tmp, on='air_store_id_le', how='left')
# test = pd.merge(test, tmp, on='air_store_id_le', how='left')

# for i in range(1,16):
    
#     for store in statistics['air_store_id_le'].unique():
        
#         mask = (statistics['month_in_comp'].isin(range(1,i+1))) & (statistics['air_store_id_le']==store)
#         mean = statistics.loc[mask, 'visitors'].mean()
        
#         mask = (statistics['month_in_comp'] == i+2) & (statistics['air_store_id_le']==store)
#         statistics.loc[mask, 'mean_S'] = mean

In [15]:
# Number of rows the monthly aggregates are shifted by in the next cell (passed to .shift()).
shift=2

In [16]:
# Lagged monthly visitor aggregates at four levels of granularity. Each section
# builds a monthly mean, shifts it by `shift` rows, and merges two features back
# onto `statistics`: the shifted value and a 3-row rolling mean of the shifted values.
# NOTE(review): the dict passed to agg() renames the output column; recent pandas versions reject this form - confirm the pandas version in use.

####################################################### Overall
tmp = statistics.groupby(['year', 'month'], as_index=False)['visitors'].mean()

### mean of the previous month
statistics = pd.merge(statistics, 
                      tmp.set_index(['year', 'month']).shift(shift).reset_index()\
                          .rename(columns={'visitors': 'previous_month_visitors'}),
                      on=['year', 'month'], 
                      how='left')


### mean of the previous three months
statistics = pd.merge(statistics, 
                      tmp.set_index(['year', 'month']).shift(shift).rolling(window=3, min_periods=1)\
                          .agg('mean').reset_index().rename(columns={'visitors': 'mean_previous_3_month_visitors'}),
                      on=['year', 'month'], 
                      how='left')

####################################################### Per store
tmp = statistics.groupby(['air_store_id_le', 'year', 'month'], as_index=False)['visitors'].mean()
grouped = tmp.groupby('air_store_id_le').apply(lambda x: x.set_index(['year', 'month'])) \
                                            .groupby(level=['air_store_id_le'])['visitors']
### mean of the previous month
statistics = pd.merge(statistics, 
                      grouped.apply(lambda x: x.shift(shift)).reset_index()\
                          .rename(columns={'visitors': 'previous_month_visitors_S'}),
                      on=['air_store_id_le', 'year', 'month'], 
                      how='left')


### mean of the previous three months
statistics = pd.merge(statistics, 
                      grouped.apply(lambda x: x.shift(shift).rolling(window=3, min_periods=1) \
                                                    .agg({'mean_previous_3_month_visitors_S': 'mean'})).reset_index(),
                      on=['air_store_id_le', 'year', 'month'], 
                      how='left')

####################################################### Per store and day of week
tmp = statistics.groupby(['air_store_id_le', 'dayofweek', 'year', 'month'], as_index=False)['visitors'].mean()
grouped = tmp.groupby('air_store_id_le').apply(lambda x: x.set_index(['dayofweek', 'year', 'month'])) \
                                            .groupby(level=['air_store_id_le', 'dayofweek'])['visitors']
### mean of the previous month
statistics = pd.merge(statistics, 
                      grouped.apply(lambda x: x.shift(shift)).reset_index()\
                          .rename(columns={'visitors': 'previous_month_visitors_SD'}),
                      on=['air_store_id_le', 'year', 'month', 'dayofweek'], 
                      how='left')


### mean of the previous three months
statistics = pd.merge(statistics, 
                      grouped.apply(lambda x: x.shift(shift).rolling(window=3, min_periods=1) \
                                                .agg({'mean_previous_3_month_visitors_SD': 'mean'})).reset_index(),
                      on=['air_store_id_le', 'year', 'month', 'dayofweek'], 
                      how='left')

####################################################### Per store, day of week and holiday flag
tmp = statistics.groupby(['air_store_id_le', 'holiday', 'dayofweek', 'year', 'month'], as_index=False)['visitors'].mean()
grouped = tmp.groupby('air_store_id_le').apply(lambda x: x.set_index(['holiday', 'dayofweek', 'year', 'month'])) \
                                            .groupby(level=['air_store_id_le', 'holiday', 'dayofweek'])['visitors']
### mean of the previous month
statistics = pd.merge(statistics, 
                      grouped.apply(lambda x: x.shift(shift)).reset_index()\
                          .rename(columns={'visitors': 'previous_month_visitors_SDH'}),
                      on=['air_store_id_le', 'year', 'month', 'dayofweek', 'holiday'], 
                      how='left')


### mean of the previous three months
statistics = pd.merge(statistics, 
                      grouped.apply(lambda x: x.shift(shift).rolling(window=3, min_periods=1) \
                                                .agg({'mean_previous_3_month_visitors_SDH': 'mean'})).reset_index(),
                      on=['air_store_id_le', 'year', 'month', 'dayofweek', 'holiday'], 
                      how='left')

In [46]:
# Scratch check: successive differences, the quantity averaged by diff() at the top of the notebook.
np.diff(np.array([2,3,4,5,6]))

array([1, 1, 1, 1])

In [48]:
# Scratch check: plain mean of the same sample.
np.mean(np.array([2,3,4,5,6]))

4.0

In [49]:
# Scratch check: decay weighting with base 0.95; the last element keeps weight 1, earlier ones are discounted.
np.array([2,3,4,5,6]) * np.power(0.95, np.arange(5)[::-1])

array([ 1.6290125,  2.572125 ,  3.61     ,  4.75     ,  6.       ])

In [47]:
# Scratch check: sum of the decay-weighted values from the previous cell.
np.sum(np.array([2,3,4,5,6]) * np.power(0.95, np.arange(5)[::-1]))

3.7122275

In [None]:
# Scratch: row-wise decay-weighted sum (base 0.9, last column gets weight 1).
# NOTE(review): relies on `tmp` and `i` left over from other cells; assumes tmp has i columns - confirm.
(tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values

In [None]:
    # Fragment of a CV loop body: left-joins average temperature and precipitation
    # from `weather` onto both splits by store and date, then drops the key columns.
    # NOTE(review): the cell is indented with no enclosing statement, so it cannot run as written.
    X_train, X_test = train.iloc[train_idx, :], train.iloc[test_idx, :]
    X_train = pd.merge(X_train, weather[['air_store_id', 'visit_date', 'avg_temperature', 'precipitation']], 
                       on=['air_store_id', 'visit_date'], how='left')
    X_test = pd.merge(X_test, weather[['air_store_id', 'visit_date', 'avg_temperature', 'precipitation']], 
                       on=['air_store_id', 'visit_date'], how='left')
    X_train.drop(feat_to_drop, axis=1, inplace=True)
    X_test.drop(feat_to_drop, axis=1, inplace=True)

In [None]:
# Sweep a multiplicative correction of the stacked predictions and print the score for each factor.
stacked_truth = np.concatenate(y_val)
stacked_pred = np.concatenate(y_pred)
for i in np.linspace(1.001, 1.002, num=10):
    rescaled_score = RMSLE(stacked_truth, stacked_pred*i)
    print(round(rescaled_score, 6), i)

In [None]:
    # Fragment of a CV loop body: left-joins the per-store Ridge prediction ('linear')
    # onto both splits; rows without a prediction get the sentinel -999.
    # NOTE(review): the cell is indented with no enclosing statement, so it cannot run as written.
    X_train = pd.merge(X_train, ridge[['air_store_id_le', 'visit_date', 'linear']], 
                       on=['air_store_id_le', 'visit_date'], how = 'left').fillna(-999)
    X_test = pd.merge(X_test, ridge[['air_store_id_le', 'visit_date', 'linear']], 
                       on=['air_store_id_le', 'visit_date'], how = 'left').fillna(-999)

## ОБЩИЕ

In [47]:
# Per-store mean of `visitors` for each grouping, merged onto train and test.
# NOTE(review): the means use every row of `train`; if part of `train` later
# serves as validation data these features leak the target - confirm.
mean_feature_specs = [
    (['air_store_id_le', 'dayofweek'], 'mean_SDOW_visitors'),
    (['air_store_id_le', 'holiday'], 'mean_SHOLIDAY_visitors'),
    # Tried and disabled:
    # (['air_store_id_le', 'weekofmonth'], 'mean_SWOM_visitors'),
    # (['air_store_id_le', 'day'], 'mean_SDAY_visitors'),
]

for group_keys, feature_name in mean_feature_specs:
    tmp = train.groupby(group_keys, as_index=False)['visitors'].mean()
    tmp = tmp.rename(columns={'visitors': feature_name})
    train = pd.merge(train, tmp, on=group_keys, how='left')
    test = pd.merge(test, tmp, on=group_keys, how='left')

1. Погода 
2. Фичи из SurpriseMe
3. Удалить выбросы
4. Подправить вручную голден вик
5. MAX rounds для каждого свой

In [63]:
# Preview the first rows of the Ridge feature frame.
ridge.head()

Unnamed: 0,air_store_id_le,dayincomp,dayofweek,holiday,visitors,visit_date,day,month,dayofyear,month_in_comp,linear,dow_sin,dow_cos,month_sin,month_cos,day_sin,day_cos,dayofyear_sin,dayofyear_cos
0,603,13,2,0,3.258097,2016-01-13,13,1,13,1,2.739187,0.974928,-0.222521,0.5,0.866025,0.4067366,-0.913545,0.221325,0.9752
1,603,14,3,0,3.496508,2016-01-14,14,1,14,1,2.918769,0.433884,-0.900969,0.5,0.866025,0.2079117,-0.978148,0.238033,0.971257
2,603,15,4,0,3.401197,2016-01-15,15,1,15,1,2.864751,-0.433884,-0.900969,0.5,0.866025,1.224647e-16,-1.0,0.254671,0.967028
3,603,16,5,0,3.135494,2016-01-16,16,1,16,1,2.99959,-0.974928,-0.222521,0.5,0.866025,-0.2079117,-0.978148,0.271234,0.962513
4,603,18,0,0,1.94591,2016-01-18,18,1,18,1,2.595558,0.0,1.0,0.5,0.866025,-0.5877853,-0.809017,0.304115,0.952635


In [70]:

from sklearn.linear_model import Ridge
# scaler = MinMaxScaler()
scaler = StandardScaler()

# Working frame for the per-store Ridge baseline: calendar columns plus the target.
ridge = df.filter(items=['air_store_id_le', 'dayincomp', 'dayofweek', 'holiday', 'visitors', 'visit_date',
                        'day', 'month', 'dayofyear', 'year'])
# Month counter: 2016 months keep their number, every other year gets month + 12.
ridge['month_in_comp'] = ridge['visit_date'].apply(lambda x: x.month if x.year==2016 else x.month+12)
# Placeholder for the Ridge predictions filled in by the next cell.
ridge['linear'] = 0

# Sine/cosine encodings as (output prefix, source column, period).
# NOTE(review): the 'day' pair is built from `dayofyear` with period 30, not from
# the `day` column - confirm this is intended.
cyclic_specs = [
    ('dow', 'dayofweek', 7),
    ('month', 'month', 12),
    ('day', 'dayofyear', 30),
    ('dayofyear', 'dayofyear', 366),
]
for prefix, source_col, period in cyclic_specs:
    angle = (ridge[source_col])*(2.*np.pi/period)
    ridge[prefix + '_sin'] = np.sin(angle)
    ridge[prefix + '_cos'] = np.cos(angle)

In [73]:
# Walk-forward Ridge baseline per store: for every window start i, fit on that
# store's rows from months i..i+2 (month_in_comp) and write the prediction for
# month i+3 into ridge['linear'].
#
# Fixes over the previous version of this cell:
#   * `mask & ridge['air_store_id_le']==store` parsed as `(mask & column) == store`,
#     because `&` binds tighter than `==`; the store filter is now a real comparison.
#   * index labels were fed to `.iloc` and results were written through chained
#     indexing (`ridge['linear'].iloc[...] = ...`); rows are now addressed by label via `.loc`.
#   * a window with training rows but no rows in the target month is skipped
#     instead of being passed to the scaler/model as an empty array.
#   * the global `train_idx` / `test_idx` used by the CV cells are no longer overwritten.
# Assumes `ridge` has a unique index, so `.loc` hits exactly the selected rows - TODO confirm.
ridge_candidate_features = ['year', 'dayincomp', 'dow_sin', 'dow_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos',
                            'dayofyear_sin', 'dayofyear_cos', 'holiday']
# Columns absent from `ridge` are ignored, as with DataFrame.filter(items=...).
ridge_features = [col for col in ridge_candidate_features if col in ridge.columns]
# Every feature except the holiday flag is standardised.
ridge_scaled = [col for col in ridge_features if col != 'holiday']

# Predictions are floats; make the placeholder column float before writing into it.
ridge['linear'] = ridge['linear'].astype(float)

# Group once per store instead of re-scanning the whole frame for every (month, store) pair.
for store, store_rows in ridge.groupby('air_store_id_le'):

    store_months = store_rows['month_in_comp']

    for i in range(1, 15):

        fit_rows = store_rows[store_months.isin([i, i+1, i+2])]
        target_rows = store_rows[store_months == i+3]

        if fit_rows.empty or target_rows.empty:
            continue

        # astype(float) also returns fresh copies, so scaling never touches `ridge`.
        X_tmp_train = fit_rows[ridge_features].astype(float)
        X_tmp_test = target_rows[ridge_features].astype(float)

        # StandardScaler works column by column, so one call equals the former per-column loop.
        X_tmp_train[ridge_scaled] = scaler.fit_transform(X_tmp_train[ridge_scaled])
        X_tmp_test[ridge_scaled] = scaler.transform(X_tmp_test[ridge_scaled])

        clf = Ridge()
#         clf = KNeighborsRegressor(n_jobs=-1, n_neighbors=14)
        clf.fit(X_tmp_train, fit_rows['visitors'].values)

        ridge.loc[target_rows.index, 'linear'] = clf.predict(X_tmp_test)

In [None]:
# List the feature columns currently in X_train.
X_train.columns

In [27]:
# Validation layout: everything up to 2017-03-11 is used for fitting, followed by
# six hold-out blocks (five full weeks plus a final four-day block).
train_idx = train[train['visit_date']<='2017-03-11'].index.values

cv_week_bounds = [
    ('2017-03-12', '2017-03-18'),
    ('2017-03-19', '2017-03-25'),
    ('2017-03-26', '2017-04-01'),
    ('2017-04-02', '2017-04-08'),
    ('2017-04-09', '2017-04-15'),
    ('2017-04-16', '2017-04-19'),
]

# Index labels of the `train` rows that fall into each hold-out block.
test_idx = [
    train[(train['visit_date']>=first_day) & (train['visit_date']<=last_day)].index.values
    for first_day, last_day in cv_week_bounds
]

# Key columns that are removed before the matrices go to the model.
feat_to_drop = ['air_store_id','visit_date']

unwanted=['year', 'quarter']
# unwanted.extend(train.columns[train.columns.str.startswith('P_std')])

# train['visitors'] = np.log1p(train['visitors'])

In [28]:
# Drop the existing 'T_' feature columns from both frames, then rebuild them with trend() at shift=1.
for frame in (train, test):
    stale_trend_cols = frame.columns[frame.columns.str.startswith('T_')]
    frame.drop(stale_trend_cols, axis=1, inplace=True)
print(train.shape, test.shape)

train, test = trend(df_train = train, df_test = test, shift=1)

(249691, 27) (30739, 26)


In [None]:
%%time
# Week-by-week cross-validation of a k-nearest-neighbours regressor on standardised features.
#
# Fixes over the previous version of this cell:
#   * the model was fitted with `clf.fit(X_train, X_test)`, i.e. the test features
#     were passed as the target; it is now fitted on `y_train`.
#   * the feature-importance print referenced `lgbm`, which this cell never creates
#     (and KNN has no feature importances), so it was removed.
np.random.seed(42)
scaler = StandardScaler()

# LightGBM settings left over from the cell this one was derived from. The KNN model
# below does not read them; they stay so the `params` / `MAX_ROUNDS` globals keep their values.
params = {
    'num_leaves': 2**5 - 1,
    'objective': 'regression',
    'min_data_in_leaf': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'metric': 'rmse',
    'num_threads': 4
}

MAX_ROUNDS = 2500

print('Start CV...\n')

# One entry per validation week: predictions and the matching true targets.
y_pred = []
y_val = []

for week in range(1, 7):

    print('==================================== WEEK {} ===================================='.format(week))

    # No trend() call here: the preceding cell rebuilt the 'T_' features once with shift=1.
#     if week in [1,2]:
#         shift_TREND=1
#     else:
#         shift_TREND=2

    # drop() returns new frames, so nothing below writes through to `train`.
    X_train = train.iloc[train_idx, :].drop('visitors', axis=1)
    X_test = train.iloc[test_idx[week-1], :].drop('visitors', axis=1)
    y_train = train['visitors'].iloc[train_idx].values
    y_test = train['visitors'].iloc[test_idx[week-1]].values

    X_train, X_test = previous(df_train=X_train, df_test=X_test, shift=week)

#     X_train.drop(unwanted, axis=1, inplace=True)
#     X_test.drop(unwanted, axis=1, inplace=True)

    # Remove the key columns and put both frames into the same column order.
    X_train = X_train.drop(feat_to_drop, axis=1).sort_index(axis=1)
    X_test = X_test.drop(feat_to_drop, axis=1).sort_index(axis=1)

    # Standardise every feature with statistics from the training fold only.
    # StandardScaler works per column, so one call equals the former per-column loop.
    # NOTE(review): KNN cannot handle NaN - confirm previous() leaves no missing values.
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

    clf = KNeighborsRegressor(n_jobs=-1, n_neighbors=5)
    clf.fit(X_train, y_train)

    y_predict = clf.predict(X_test)
    print('RMSLE is:', round(RMSLE(y_test, y_predict), 6))

    y_val.append(y_test)
    y_pred.append(y_predict)

print('RMSLE VALIDATE:', round(RMSLE(np.concatenate(y_val), np.concatenate(y_pred)), 6))

Start CV...

