In [None]:
import numpy as np
import pandas as pd
from sklearn import *
from datetime import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Show every column when a DataFrame is rendered (no truncation).
pd.options.display.max_columns = None

In [None]:
import tsfresh
# Load the raw visit log and let tsfresh compute time-series features for each
# store (rows grouped by 'air_store_id', ordered by 'visit_date'). No feature
# settings are passed, so tsfresh's defaults apply.
# NOTE(review): this path differs from the
# '../data/Recruit Restaurant Visitor Forecasting/' directory used by the
# preprocessing cell further down — confirm which location is correct.
# NOTE(review): features are computed over the full date range, including the
# rows later held out for validation (visit_date >= 2017-01-17), so the
# validation score is probably optimistic — consider restricting the input to
# the training window.
df = pd.read_csv('../data/air_visit_data.csv.zip',)
extracted_features = tsfresh.extract_features(df, 
                            column_id="air_store_id", column_sort="visit_date")

In [None]:
# Preprocessing adapted from https://www.kaggle.com/jmbull/no-xgb-starter-here-s-one-lb-507

# Raw competition tables, collected in one dict under short aliases.
data = {}
data['tra'] = pd.read_csv('../data/Recruit Restaurant Visitor Forecasting/air_visit_data.csv.zip')
data['as'] = pd.read_csv('../data/Recruit Restaurant Visitor Forecasting/air_store_info.csv.zip')
data['hs'] = pd.read_csv('../data/Recruit Restaurant Visitor Forecasting/hpg_store_info.csv.zip')
data['ar'] = pd.read_csv('../data/Recruit Restaurant Visitor Forecasting/air_reserve.csv.zip')
data['hr'] = pd.read_csv('../data/Recruit Restaurant Visitor Forecasting/hpg_reserve.csv.zip')
data['id'] = pd.read_csv('../data/Recruit Restaurant Visitor Forecasting/store_id_relation.csv.zip')
data['tes'] = pd.read_csv('../data/Recruit Restaurant Visitor Forecasting/sample_submission.csv.zip')
# The calendar table calls its date column 'calendar_date'; rename it so the
# table can be joined on 'visit_date'.
data['hol'] = (
    pd.read_csv('../data/Recruit Restaurant Visitor Forecasting/date_info.csv.zip')
    .rename(columns={'calendar_date':'visit_date'})
)

# Attach AIR store ids to the HPG reservations; the inner join keeps only HPG
# stores that appear in the id-relation table.
data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])

# Collapse each reservation table to one row per (store, visit date), summing
# the booking lead times (in days) and the reserved visitor counts.
for df in ['ar','hr']:
    visit_day = pd.to_datetime(data[df]['visit_datetime']).dt.normalize()
    reserve_day = pd.to_datetime(data[df]['reserve_datetime']).dt.normalize()
    # Whole-day gap between booking and visit. Subtracting the midnight-
    # normalized timestamps column-wise gives the same values as subtracting
    # the calendar dates row by row, without a Python-level loop over rows.
    data[df]['reserve_datetime_diff'] = (visit_day - reserve_day).dt.days
    # Keep plain date objects so the result joins with the other tables,
    # which also store 'visit_date' as dates.
    data[df]['visit_datetime'] = visit_day.dt.date
    data[df]['reserve_datetime'] = reserve_day.dt.date
    data[df] = (
        data[df]
        .groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']]
        .sum()
        .rename(columns={'visit_datetime':'visit_date'})
    )

def _add_calendar_columns(frame):
    """Add 'dow', 'year' and 'month' columns derived from frame['visit_date'].

    'visit_date' is parsed with ``pd.to_datetime`` and then stored back as
    plain date objects. ``frame`` is modified in place.
    """
    stamps = pd.to_datetime(frame['visit_date'])
    frame['dow'] = stamps.dt.dayofweek
    frame['year'] = stamps.dt.year
    frame['month'] = stamps.dt.month
    frame['visit_date'] = stamps.dt.date


_add_calendar_columns(data['tra'])

# Split the submission id on '_': the third piece is used as the visit date,
# the first two pieces joined back together as the store id.
data['tes']['visit_date'] = data['tes']['id'].map(lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = data['tes']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
_add_calendar_columns(data['tes'])

unique_stores = data['tes']['air_store_id'].unique()
# One row per (store, weekday) for every store present in the test set.
# ignore_index=True already yields a fresh 0..n-1 index, so no extra
# reset_index is needed.
stores = pd.concat(
    [pd.DataFrame({'air_store_id': unique_stores, 'dow': [i]*len(unique_stores)}) for i in range(7)],
    axis=0, ignore_index=True)

# Visitor statistics per (store, weekday) from the training visits, computed
# in a single groupby pass instead of one groupby + merge per statistic.
dow_stats = (
    data['tra']
    .groupby(['air_store_id','dow'])['visitors']
    .agg(['min', 'mean', 'median', 'max', 'count'])
    .rename(columns={
        'min': 'min_visitors',
        'mean': 'mean_visitors',
        'median': 'median_visitors',
        'max': 'max_visitors',
        'count': 'count_observations',
    })
    .reset_index()
)
# Left join: (store, weekday) pairs without any training visits get NaN stats.
stores = pd.merge(stores, dow_stats, how='left', on=['air_store_id','dow'])

# Attach the store attributes, then replace the two text columns (genre and
# area name) with integer codes.
stores = stores.merge(data['as'], how='left', on='air_store_id')
lbl = preprocessing.LabelEncoder()
for text_col in ('air_genre_name', 'air_area_name'):
    stores[text_col] = lbl.fit_transform(stores[text_col])

# Bring the calendar table into the same shape as the visit tables: plain
# date objects as the join key and an integer-coded weekday name.
data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date']).dt.date
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])

# Add the calendar columns to both the training visits and the submission rows.
data['tra'] = data['tra'].merge(data['hol'], how='left', on='visit_date')
data['tes'] = data['tes'].merge(data['hol'], how='left', on='visit_date')

# Join the per-(store, weekday) table onto the visit rows.
train = data['tra'].merge(stores, how='left', on=['air_store_id','dow'])
test = data['tes'].merge(stores, how='left', on=['air_store_id','dow'])

# Join the aggregated reservations from both reservation tables; each pass
# adds the same two column names, so pandas disambiguates them with suffixes.
for df in ['ar','hr']:
    train = train.merge(data[df], how='left', on=['air_store_id','visit_date'])
    test = test.merge(data[df], how='left', on=['air_store_id','visit_date'])

# Candidate feature columns: everything except identifiers, date and target.
col = [c for c in train.columns if c not in ('id', 'air_store_id', 'visit_date', 'visitors')]
# Missing values are replaced by the sentinel -1.
train, test = train.fillna(-1), test.fillna(-1)

In [None]:
# Move the index into a regular column so it can later be used as a merge key.
# NOTE(review): not idempotent — re-running this cell modifies the frame again.
extracted_features = extracted_features.reset_index()

In [None]:
# Name the key column like the store id used in the other tables.
# NOTE(review): assumes the column produced by reset_index() is called 'id' —
# confirm for the installed tsfresh version; otherwise this rename is a no-op.
extracted_features = extracted_features.rename(columns={'id':'air_store_id'})

In [None]:
# Replace missing feature values with the column mean. numeric_only=True
# skips the string 'air_store_id' column, which would otherwise make mean()
# raise a TypeError on pandas >= 2.0. Columns that are entirely NaN have a
# NaN mean and therefore stay NaN.
extracted_features = extracted_features.fillna(extracted_features.mean(numeric_only=True))

In [None]:
# Summary statistics of the feature columns; the 'std' row is used next to
# find constant features.
des = extracted_features.describe()

In [None]:
# Feature columns that carry no information: zero standard deviation, or a NaN
# standard deviation (e.g. a column that is entirely missing). A plain
# `std == 0` test misses the latter because `NaN == 0` is False.
dropcols = des.columns[(des.loc['std'].fillna(0) == 0).values].tolist()

In [None]:
# Display the summary table for inspection.
des

In [None]:
# Feature table without the columns listed in `dropcols`.
e_f = extracted_features.drop(columns=dropcols)

In [None]:
# (rows, columns) of the feature table after dropping the `dropcols` columns.
e_f.shape

In [None]:
# Attach the per-store features to every training row. The pruned table `e_f`
# is merged (not the unpruned `extracted_features`) so that the columns listed
# in `dropcols` are actually left out of the model input.
# NOTE(review): not idempotent — re-running this cell merges the features again.
train = pd.merge(train, e_f, on='air_store_id', how='left')

In [None]:
# Preview the first rows of the merged training table.
train.head(10)

In [None]:
# Total number of missing cells in the training table after the feature merge.
# NOTE(review): a non-zero value presumably means NaNs reach the model inputs —
# verify before fitting.
train.isnull().sum().sum()

In [None]:
# Convert 'visit_date' to datetime so rows can be compared against the
# datetime cutoff used for the train/hold-out split.
train['visit_date'] = pd.to_datetime(train['visit_date'])

In [None]:
# Temporal hold-out: rows before the split date are used for fitting, rows on
# or after it for evaluation.
split_date = datetime(2017, 1, 17)
X_train = train.loc[train['visit_date'] < split_date].reset_index(drop=True)
X_test = train.loc[train['visit_date'] >= split_date].reset_index(drop=True)
# Hold-out row positions of one store that is inspected in detail later.
ind = X_test.loc[X_test['air_store_id'] == 'air_cb7467aed805e7fe'].index

In [None]:
# Number of hold-out rows belonging to the inspected store.
len(ind)

In [None]:
# Separate the target from the inputs and drop the columns that are not used
# as features (store id, target, date).
non_features = ['air_store_id', 'visitors', 'visit_date']
y_train, y_test = X_train['visitors'].values, X_test['visitors'].values
X_train = X_train.drop(columns=non_features)
X_test = X_test.drop(columns=non_features)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scale every feature with the min/max learned from the training rows only,
# then apply the same scaling to the hold-out rows.
mms = MinMaxScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

In [None]:
# Ridge regression with regularization strength alpha = 10.
model = Ridge(alpha=10)

In [None]:
# Fit the ridge model on the scaled training features.
model.fit(X_train, y_train)

In [None]:
# Predict visitor counts for the hold-out rows.
preds = model.predict(X_test)

In [None]:
# Root-mean-squared error over the whole hold-out set, on raw visitor counts
# (`score` is sklearn's mean_squared_error; no log transform is applied).
score(y_test, preds)**(1/2)

In [None]:
# Hold-out row positions of the inspected store.
ind

In [None]:
# Root-mean-squared error restricted to the inspected store's hold-out rows.
score(y_test[ind], preds[ind])**(1/2)

In [None]:
sns.set(font_scale=1)

# Actual vs. predicted visitors for the inspected store, in hold-out row order.
fig, ax = plt.subplots(figsize=(25, 5))
ax.plot(y_test[ind], label='real')
ax.plot(preds[ind], color='red', label='pred')
ax.legend()
plt.show()