import tools & data

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from xgboost import plot_importance

def plot_features(booster, figsize):
    """Draw the feature-importance chart of ``booster`` on a fresh figure.

    Args:
        booster: fitted model passed straight through to ``plot_importance``.
        figsize: figure size forwarded to ``plt.subplots``.

    Returns:
        Whatever ``plot_importance`` returns after drawing onto the new axes.
    """
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes)
    
from datetime import datetime
import time
import sys
import gc
import pickle
sys.version_info

import joblib

In [None]:
# Load the three Rossmann CSV files (train, test and store) from the
# ../input directory into separate DataFrames.
df_train = pd.read_csv("../input/rossmann-store-sales/train.csv")
df_test = pd.read_csv("../input/rossmann-store-sales/test.csv")
store = pd.read_csv("../input/rossmann-store-sales/store.csv")

merge data

In [None]:
# Give the training rows a placeholder Id of 0 and mark them with
# data_type == 1.
df_train['Id'], df_train['data_type'] = 0, 1

# Display the frame with alphabetically sorted columns (the result is not
# stored, so df_train itself keeps its column order).
df_train.reindex(columns=sorted(df_train.columns))

In [None]:
# Add the target-side columns to the test rows (presumably absent from
# test.csv - confirm) and mark the rows with data_type == 2.
df_test['Customers'] = 9999  # placeholder value
# Unknown sales are stored as NaN instead of 0: a literal 0 would be treated
# as a genuine observation by any mean computed over the Sales column
# (for example lagged sales averages), dragging those features towards zero.
df_test['Sales'] = np.nan
df_test['data_type'] = 2

# Display the frame with alphabetically sorted columns (the result is not
# stored, so df_test itself keeps its column order).
df_test.reindex(sorted(df_test.columns), axis='columns')

In [None]:
# Stack the train and test rows, then attach the store columns to every row
# with a left join on Store so that no stacked row is dropped.
df = pd.concat([df_train, df_test]).merge(store, how='left', on='Store')

In [None]:
# Show the number of non-null values in each column of the merged frame.
df.count()

In [None]:
# Reduce memory usage: drop the names of the source frames now that their
# rows live in df.
del df_train, df_test, store

In [None]:
# Show the dtype of every column in the merged frame.
df.dtypes

data processing

In [None]:
# Date/time processing helpers

def is_weekend(dates):
    """Flag each date as weekend (1) or weekday (0).

    Args:
        dates: pandas Series whose values ``pd.DatetimeIndex`` can parse
            (date strings or datetimes).

    Returns:
        list[int]: one flag per input date, in input order; Saturday and
        Sunday give 1, every other day gives 0.
    """
    # Weekday numbers start at Monday == 0, so 5 and 6 are Saturday/Sunday.
    # Comparing the whole index at once avoids a Python-level loop per row.
    weekday_numbers = pd.DatetimeIndex(dates.values).weekday
    return (weekday_numbers >= 5).astype(int).tolist()


# Season label -> [first month, last month]; a first month larger than the
# last month means the range wraps around the end of the year.
date_to_season_mapping = {1: [12, 2], 2: [3, 5], 3: [6, 8], 4: [9, 11]}


def date_to_season(dates):
    """Map date strings to the season labels of ``date_to_season_mapping``.

    Args:
        dates: pandas Series of strings whose second ``-``-separated field is
            the month number (for example ``'2015-07-31'``).

    Returns:
        list: the season label (1-4) for each date, in input order, or the
        string ``'None'`` when no mapping entry covers the month.
    """

    def _season_for(month_no):
        for label, (first, last) in date_to_season_mapping.items():
            if first < last:
                covered = first <= month_no <= last
            elif first > last:
                # Wrapping range such as December..February.
                covered = month_no >= first or month_no <= last
            else:
                covered = False
            if covered:
                return label
        return 'None'

    return [_season_for(int(text.split('-')[1])) for text in dates.values]

# Lookup table indexed by (month number - 1). The entries are the month
# numbers themselves, so the "name" of a month is just its number.
month_no_to_name_mapping = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
]


def date_to_month_name(dates):
    """Translate each date to its entry in ``month_no_to_name_mapping``.

    Args:
        dates: date values accepted by ``pd.DatetimeIndex``.

    Returns:
        list: one mapping entry per date, in input order.
    """
    month_numbers = pd.DatetimeIndex(dates).month.values
    return [month_no_to_name_mapping[number - 1] for number in month_numbers]

# def weekday_or_weekend(dates):
#     results = []
#     for date_value in pd.DatetimeIndex(dates.values):
#         weekno = date_value.weekday()
#         result = "Weekday" if weekno < 5 else "Weekend"
#         results.append(result)
#     return results

# def weekday(dates):
#     results = []
#     for date_value in pd.DatetimeIndex(dates.values):
#         weekno = date_value.weekday()
#         result = weekno
#         results.append(result)
#     return results


import holidays

# Rossmann stores are located in Germany, so the German public-holiday
# calendar is the relevant one; the US calendar used previously flagged days
# that are ordinary trading days for these stores. Without a subdivision
# argument only nationwide holidays are included.
holidays_germany = holidays.Germany()


def is_holiday(dates):
    """Flag each date that is a German nationwide public holiday.

    Args:
        dates: pandas Series whose values ``pd.DatetimeIndex`` can parse.

    Returns:
        list[int]: 1 where the calendar date is in ``holidays_germany``,
        otherwise 0, in input order.
    """
    return [
        int(stamp.date() in holidays_germany)
        for stamp in pd.DatetimeIndex(dates.values)
    ]


# date_to_day_period_mapping = {'Morning': [4, 11], 'Afternoon': [12, 17], 
#                               'Evening': [18, 19], 'Night': [20, 4]}
# def date_to_day_period(datetimes):
#     results = []
#     datetime_values = datetimes.values
#     for datetime in datetime_values:
#         _, time_of_day = datetime.split(' ')
#         hour, _, _ = time_of_day.split(':')
#         hour = int(hour)
#         result = 'None'
#         for each_day_period in date_to_day_period_mapping:
#             start, end = date_to_day_period_mapping[each_day_period]
#             if ((start < end) and (start <= hour <= end)) or \
#                ((start > end) and ((hour >= start) or (hour <= end))):
#                 result = each_day_period
#                 break

#         results.append(result)
#     return results


In [None]:
# Derive the calendar features from the Date column, one new column each.
raw_dates = df['Date']
df['is_weekend'] = is_weekend(raw_dates)
df['Month'] = date_to_month_name(raw_dates)
df['Year'] = pd.DatetimeIndex(raw_dates).year
df['Season'] = date_to_season(raw_dates)
df['is_holiday'] = is_holiday(raw_dates)

# Preview the first rows with the new columns.
df.head()

In [None]:
def lag_feature(df, lags, col):
    """Append per-store weekly means of ``col`` from earlier weeks.

    For every lag ``n`` in ``lags`` a column ``<col>_lag_<n>`` is added that
    holds the mean of ``col`` for the same ``Store`` in the week whose
    ``num_of_week`` is ``n`` smaller. Rows without a matching earlier week
    get NaN (left join).

    Args:
        df: frame containing ``num_of_week``, ``Store`` and ``col``.
        lags: iterable of week offsets.
        col: name of the column to lag.

    Returns:
        A new DataFrame with one extra column per lag; ``df`` is not modified.
    """
    keys = ['num_of_week', 'Store']
    history = df[keys + [col]]
    for lag in lags:
        lag_col = col + '_lag_' + str(lag)
        # Move the history forward by `lag` weeks so it lines up with the
        # rows that should receive it.
        moved = history.rename(columns={col: lag_col})
        moved = moved.assign(num_of_week=moved['num_of_week'] + lag)
        weekly_mean = moved.groupby(keys, as_index=False)[lag_col].mean()
        df = df.merge(weekly_mean, on=keys, how='left')
    return df

# Parse the Date strings once into a datetime64 column.
df['date'] = pd.to_datetime(df['Date'])

# Running week counter: 1 for the week starting Monday 2012-12-31 (ISO week 1
# of 2013), increasing by one every seven days. Unlike
# (calendar year - 2013) * 52 + ISO week, it does not mis-number the last
# days of December that already belong to ISO week 1 of the next year, and it
# does not rely on Series.dt.week, which newer pandas versions no longer
# provide.
week_origin = pd.Timestamp('2012-12-31')
df['num_of_week'] = (df['date'] - week_origin).dt.days // 7 + 1

# Add mean Sales of the same store 4, 13, 26 and 52 weeks earlier; the last
# expression shows the elapsed seconds.
ts = time.time()
df = lag_feature(df, [4,13,26,52], 'Sales')
time.time() - ts

In [None]:
# Relies on the datetime column df['date'] created in an earlier cell.
df['yyyymm'] = df['date'].dt.strftime('%Y%m')
# Step back exactly one calendar year. np.timedelta64(1, 'Y') is an
# average-length year (365.2425 days), so it can land in the neighbouring
# month (e.g. 1 January -> 31 December two years back) and may be rejected by
# newer pandas versions.
df['last_year'] = df['date'] - pd.DateOffset(years=1)
df['last_yyyymm'] = df['last_year'].dt.strftime('%Y%m')

# Mean Customers per store and month.
tmp = df.groupby(['Store', 'yyyymm'])['Customers'].mean().reset_index()

# Join each row with the monthly mean of the same store one year earlier.
# Renaming the lookup columns up front avoids the _x/_y suffix clean-up.
df = df.merge(
    tmp.rename(columns={'yyyymm': 'last_yyyymm', 'Customers': 'last_Customers'}),
    on=['Store', 'last_yyyymm'],
    how='left',
)

one-hot encoding

In [None]:
# Show the column dtypes before encoding the categorical columns.
df.dtypes

In [None]:
# Duplicate representations of the same value must be unified first
# (presumably 0 stored both as a number and as text - confirm).
# Keep the codes 'a', 'b' and 'c' and map everything else to the string '0'.
# Writing '0' explicitly avoids relying on NumPy's implicit int -> str
# promotion inside nested np.where calls.
is_state_holiday = df['StateHoliday'].isin(['a', 'b', 'c'])
df['StateHoliday'] = df['StateHoliday'].where(is_state_holiday, '0')

In [None]:
# Categorical columns to one-hot encode.
# SchoolHoliday only holds 1/0, so it is not one-hot encoded.
cat_col = ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']

In [None]:
# One-hot encode the categorical columns and store the indicators as floats.
df_cat = pd.get_dummies(df[cat_col]).astype(float)

In [None]:
# Every column that is not one-hot encoded, kept in the frame's own column
# order. A set difference would return the names in an arbitrary order that
# can change from run to run.
num_col = [column for column in df.columns if column not in cat_col]
df_num = df[num_col]

In [None]:
# Print (rows, columns) of the encoded and the remaining part; the row counts
# have to match for the column-wise concatenation.
print(df_cat.shape)
print(df_num.shape)

In [None]:
# Put the untouched columns and the one-hot indicators side by side.
df_all = pd.concat((df_num, df_cat), axis='columns')

In [None]:
# Preview the first rows of the combined frame.
df_all.head()

In [None]:
#dummy

# data_temp = df.copy()
# data_temp.drop(columns = ['Date'], inplace=True)

# description = pd.DataFrame(index=['observations(rows)', 'percent missing', 'dtype', 'range'])
# numerical = []
# categorical = []
# for col in data_temp.columns:
#     obs = data_temp[col].size
#     p_nan = round(data_temp[col].isna().sum()/obs, 2)
#     num_nan = f'{p_nan}% ({data_temp[col].isna().sum()}/{obs})'
#     dtype = 'categorical' if data_temp[col].dtype == object else 'numerical'
#     numerical.append(col) if dtype == ['numerical','int64'] else categorical.append(col)
#     rng = f'{len(data_temp[col].unique())} labels' if dtype == 'categorical' else f'{data_temp[col].min()}-{data_temp[col].max()}'
#     description[col] = [obs, num_nan, dtype, rng]

# data_num = data_temp.copy()    
# data_num.drop(columns = categorical, inplace=True)

# data_dummy = pd.get_dummies(data_temp[categorical], drop_first=True)
# data_dummy.head()
# display(description)



# data_dummy.head()


modeling

In [None]:
# Columns that must not be used as model inputs.
exclude_col = [
    'Date', 'date',                         # raw and parsed date
    'yyyymm', 'last_year', 'last_yyyymm',   # helper keys
    'data_type', 'Id',                      # row bookkeeping
    'Customers',
]

# Target column, kept as a list.
y_col = ['Sales']

In [None]:
# Feature columns: everything in df_all except the excluded columns and the
# target, kept in df_all's own column order. A set difference would give an
# arbitrary order that can differ between runs, which makes training with
# column subsampling non-reproducible despite the fixed seed.
x_col = [
    column for column in df_all.columns
    if column not in exclude_col and column not in y_col
]

In [None]:
# Rows tagged data_type == 1 are the training rows; split them into feature
# and target frames.
train_rows = df_all[df_all['data_type'] == 1]
x_train = train_rows[x_col]
y_train = train_rows[y_col]

In [None]:
# Hold out 30% of the training rows for validation (fixed random_state).
# NOTE(review): the split is random over all dates while several features are
# time-based lags - confirm a chronological hold-out is not wanted instead.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    random_state=1,
)

In [None]:
ts = time.time()

# Hyper-parameters of the gradient-boosted regressor.
xgb_params = dict(
    max_depth=7,
    n_estimators=1000,
    min_child_weight=100,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.1,
    seed=42,
)
model = XGBRegressor(**xgb_params)

# RMSE is reported on both sets during training; training stops once the
# metric has not improved for 10 rounds.
# NOTE(review): newer xgboost releases expect eval_metric and
# early_stopping_rounds on the constructor rather than fit() - confirm
# against the installed version.
watchlist = [(x_train, y_train), (x_valid, y_valid)]
model.fit(
    x_train,
    y_train,
    eval_metric="rmse",
    eval_set=watchlist,
    verbose=True,
    early_stopping_rounds=10,
)

# Elapsed training time in seconds.
time.time() - ts

predict

In [None]:
# Rows tagged data_type == 2 are the rows to predict; keep the feature columns.
x_test = df_all.loc[df_all['data_type'] == 2, x_col]

In [None]:
# Predict Sales for the test rows with the fitted model.
y_pred = model.predict(x_test)

In [None]:
# Display the predictions.
y_pred

submission

In [None]:
# Pair the Id of every test row (data_type == 2) with its predicted Sales and
# write the two columns to submission.csv without the index.
test_ids = df_all.loc[df_all['data_type'] == 2, 'Id']
df_y = pd.DataFrame({'Id': test_ids, 'Sales': y_pred})
df_y.to_csv('submission.csv', index=False)

feature importance

In [None]:
# Plot the fitted model's feature importances on a 10 x 14 figure.
plot_features(model, (10,14))