In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install flaml
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flaml
  Downloading FLAML-1.0.9-py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 27.1 MB/s 
Collecting lightgbm>=2.3.1
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 51.8 MB/s 
Installing collected packages: lightgbm, flaml
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 2.2.3
    Uninstalling lightgbm-2.2.3:
      Successfully uninstalled lightgbm-2.2.3
Successfully installed flaml-1.0.9 lightgbm-3.3.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.5 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [3]:
import pandas as pd
import matplotlib.pyplot as plt

import xgboost
import warnings
from sklearn.metrics import mean_squared_error
import random as rn
import numpy as np
import os
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from mlxtend.regressor import StackingCVRegressor

from collections import Counter
from flaml import AutoML


%matplotlib inline
warnings.filterwarnings(action='ignore')
%cd '/content/drive/MyDrive/Shopping_sales_revenue/'

/content/drive/MyDrive/Shopping_sales_revenue


In [4]:
trainset = pd.read_csv("./dataset/train.csv")
testset = pd.read_csv("./dataset/test.csv")
sample_submission = pd.read_csv("./dataset/sample_submission.csv")

In [5]:
# reproducibility
seed_num = 42

def set_seed(seed_num): 
  # tf.random.set_seed(seed_num)
  np.random.seed(seed_num)
  rn.seed(seed_num)
  os.environ['PYTHONHASHSEED']=str(seed_num)

In [6]:
def holiday_to_number(isholiday):
    if isholiday == True:
        number = 1
    else:
        number = 0
    return number


def preprocessing(data):
    data = data.copy()
    data.Date = pd.to_datetime(data.Date, format="%d/%m/%Y")
    
    data['Week'] = data.Date.dt.isocalendar().week.apply(lambda x: int(x))
    data['Day'] = data.Date.dt.day.apply(lambda x: int(x))
    data['Year'] = data.Date.dt.year.apply(lambda x: int(x))
    data['Month'] = data.Date.dt.month.apply(lambda x: int(x))
    
    # True/False => 1/0
    data['NumberHoliday'] = data['IsHoliday'].apply(holiday_to_number)
    
    return data

In [7]:
# 데이터 전처리
trainset = preprocessing(trainset)
testset = preprocessing(testset)

In [8]:
trainset.head()

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales,Week,Day,Year,Month,NumberHoliday
0,1,1,2010-02-05,42.31,2.572,,,,,,8.106,False,1643690.9,5,5,2010,2,0
1,2,1,2010-02-12,38.51,2.548,,,,,,8.106,True,1641957.44,6,12,2010,2,1
2,3,1,2010-02-19,39.93,2.514,,,,,,8.106,False,1611968.17,7,19,2010,2,0
3,4,1,2010-02-26,46.63,2.561,,,,,,8.106,False,1409727.59,8,26,2010,2,0
4,5,1,2010-03-05,46.5,2.625,,,,,,8.106,False,1554806.68,9,5,2010,3,0


In [9]:
testset.head()

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Week,Day,Year,Month,NumberHoliday
0,1,1,2012-10-05,68.55,3.617,8077.89,,18.22,3617.43,3626.14,6.573,False,40,5,2012,10,0
1,2,1,2012-10-12,62.99,3.601,2086.18,,8.11,602.36,5926.45,6.573,False,41,12,2012,10,0
2,3,1,2012-10-19,67.97,3.594,950.33,,4.93,80.25,2312.85,6.573,False,42,19,2012,10,0
3,4,1,2012-10-26,69.16,3.506,2585.85,31.75,6.0,1057.16,1305.01,6.573,False,43,26,2012,10,0
4,5,2,2012-10-05,70.27,3.617,6037.76,,10.04,3027.37,3853.4,6.17,False,40,5,2012,10,0


### 기본 전략
- 지점별로 예측 모델을 생성하자.
- 기준 연도(2010 또는 2011)와 2012년도 데이터를 이용하자.
- 기준 연도가 아닌 연도의 데이터는 과감하게 제외
- 2010, 2011 각각의 연도는 ~9월까지의 데이터, 2012년도는 8월까지의 데이터를 이용하여 학습하고 2012년도의 9월 데이터를 더 잘 예측하는 연도를 선택하자.
- 선택한 기준연도에 대해 ~10월까지의 데이터, 2012년도는 9월까지의 데이터를 이용하여 학습하고 최종적으로 2012년도 10월의 데이터를 예측해보자.

In [10]:
def RMSE(y, y_pred):
    return mean_squared_error(y, y_pred)**0.5

In [39]:
features = ['Store', 'NumberHoliday', 'Week', 'Day', 'Month', 'Year']

MODEL_TIME_BUDGET = 60*5
MODEL_METRIC = 'rmse'
MODEL_TASK = "regression"
MODEL_LIST = ["xgboost"]


params = {
    "time_budget": MODEL_TIME_BUDGET,  
    "metric": MODEL_METRIC,
    "estimator_list": MODEL_LIST, 
    "task": MODEL_TASK,
    "seed":seed_num,
    "verbose":0
}

In [40]:
base_params = {
    "n_estimators": 60,
    "min_child_weight": 3,
    "max_depth": 6
}

In [None]:
similar_years = []

for store in range(1,max(trainset.Store)+1):  # max(trainset.Store)+1
    train_store = trainset[trainset.Store==store]
    
    # 2010, 2011, 2012 년도 별로 데이터 분리
    # 2012-09에 대해 예측하려고 하기 때문에 2012년도는 9월을 포함하지 않음
    train_store_2010 = train_store[(train_store.Year==2010) & (train_store.Month<=9)]
    train_store_2011 = train_store[(train_store.Year==2011) & (train_store.Month<=9)]
    train_store_2012 = train_store[(train_store.Year==2012) & (train_store.Month<9)]
    
    # 2011, 2010 년도를 제외한 데이터 생성
    train_store_2010_2012 = pd.concat([train_store_2010, train_store_2012])
    train_store_2011_2012 = pd.concat([train_store_2011, train_store_2012])
    
    # 각각의 모델 학습
    model_2010_2012 = xgboost.XGBRegressor(**base_params)
    model_2010_2012.fit(train_store_2010_2012[features],
                        train_store_2010_2012.Weekly_Sales)
    
    model_2011_2012 = xgboost.XGBRegressor(**base_params)
    model_2011_2012.fit(train_store_2011_2012[features],
                        train_store_2011_2012.Weekly_Sales)
    
    # 2012년도 9월에 대해서 예측
    x_test = train_store[(train_store.Year==2012) & (train_store.Month==9)]
    pred_2010_2012 = model_2010_2012.predict(x_test[features])
    pred_2011_2012 = model_2011_2012.predict(x_test[features])
    
    # 예측 결과 평가
    rmse_2010_2012 = RMSE(pred_2010_2012, x_test.Weekly_Sales)
    rmse_2011_2012 = RMSE(pred_2011_2012, x_test.Weekly_Sales)

    
    # 오차가 더 적은 연도를 선택
    similar_year = 2010
    if rmse_2010_2012 > rmse_2011_2012:
        similar_year = 2011

    print(f"{store:02}", similar_year, rmse_2010_2012, rmse_2011_2012)

    similar_years.append(similar_year)
    print(store,"-th done")

In [42]:
print(similar_years)

[2011, 2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2010, 2011, 2011, 2011, 2011, 2010, 2010, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2010, 2011, 2010, 2011, 2010, 2011, 2011, 2011, 2010, 2011, 2010, 2011, 2011, 2011, 2011, 2011, 2011, 2011]


In [43]:
models = []

for store in range(1,max(trainset.Store)+1):  # max(trainset.Store)+1
    
    # 전체 데이터로 학습
    train_store_target = pd.concat([
        train_store[(train_store.Year==similar_years[store-1]) & (train_store.Month<=10)],
        train_store[(train_store.Year==2012)]
    ])

    auto_xgb = AutoML()
    auto_xgb.fit(train_store_target[features], train_store_target.Weekly_Sales, **params)
    best_xgb = auto_xgb.model.estimator

    models.append(best_xgb)
    print(store,"-th done")

1 -th done
2 -th done
3 -th done
4 -th done
5 -th done
6 -th done
7 -th done
8 -th done
9 -th done
10 -th done
11 -th done
12 -th done
13 -th done
14 -th done
15 -th done
16 -th done
17 -th done
18 -th done
19 -th done
20 -th done
21 -th done
22 -th done
23 -th done
24 -th done
25 -th done
26 -th done
27 -th done
28 -th done
29 -th done
30 -th done
31 -th done
32 -th done
33 -th done
34 -th done
35 -th done
36 -th done
37 -th done
38 -th done
39 -th done
40 -th done
41 -th done
42 -th done
43 -th done
44 -th done
45 -th done


In [50]:
pred = []
for store in range(1, max(trainset.Store)+1):
    test_store = testset[testset.Store==store]
    
    y = models[store-1].predict(test_store[features2])
    pred += y.tolist()

In [51]:
test_pred = testset.copy()
test_pred["Weekly_Sales"] = pred

In [52]:
test_pred.Weekly_Sales

0      768332.8125
1      728432.3125
2      761286.3750
3      791107.1250
4      778400.0000
          ...     
175    791107.1250
176    768332.8125
177    728432.3125
178    761286.3750
179    791107.1250
Name: Weekly_Sales, Length: 180, dtype: float64

In [47]:
sample_submission["Weekly_Sales"] = test_pred.Weekly_Sales
sample_submission.to_csv('baseline_6.csv',index = False)
sample_submission

Unnamed: 0,id,Weekly_Sales
0,1,768332.8125
1,2,728432.3125
2,3,761286.3750
3,4,791107.1250
4,5,778400.0000
...,...,...
175,176,791107.1250
176,177,768332.8125
177,178,728432.3125
178,179,761286.3750
