# 사용 패키지

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
import os

In [9]:
# Directory holding the input CSV files, resolved from the current working directory.
base_dir = os.path.abspath('.')
data_path = base_dir + '/data/'

# 데이터 로드

In [10]:
# Read the hourly supply CSV; the file is decoded as CP949.
supply_csv = data_path + '한국가스공사_시간별 공급량_20181231.csv'
total = pd.read_csv(supply_csv, encoding='cp949')

In [19]:
# Preview the first rows of the loaded frame.
total.head()

Unnamed: 0,연월일,시간,구분,공급량,year,month,day,weekday
0,2013-01-01,1,0,2497.129,2013,1,1,1
1,2013-01-01,2,0,2363.265,2013,1,1,1
2,2013-01-01,3,0,2258.505,2013,1,1,1
3,2013-01-01,4,0,2243.969,2013,1,1,1
4,2013-01-01,5,0,2344.105,2013,1,1,1


In [20]:
# (row count, column count) of the full dataset.
total.shape

(368088, 8)

# 전처리

In [13]:
# Distinct raw labels in the '구분' column, in order of first appearance.
total['구분'].unique()

array(['A', 'B', 'C', 'D', 'E', 'G', 'H'], dtype=object)

In [14]:
# Encode the labels in '구분' as integer codes (in order of first appearance).
# The mapping is built only while the column still holds the raw labels: on a
# re-run the column is already numeric, and rebuilding d_map from it would give
# {0: 0, 1: 1, ...}, which later turns every test-set label into NaN when the
# same d_map is applied to the test frame.
if not pd.api.types.is_numeric_dtype(total['구분']):
    d_map = {label: code for code, label in enumerate(total['구분'].unique())}
    total['구분'] = total['구분'].map(d_map)

In [16]:
# Confirm that '구분' now holds the integer codes.
total['구분'].unique()

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

In [17]:
# Parse the date column into datetime values so the .dt accessor is available.
total['연월일'] = total['연월일'].pipe(pd.to_datetime)

In [18]:
# Derive calendar features (year, month, day, weekday) from the parsed date column.
total_dates = total['연월일'].dt
for part in ('year', 'month', 'day', 'weekday'):
    total[part] = getattr(total_dates, part)

In [22]:
# Train on 2013 through 2017 and hold out 2018 for validation.
train_years = list(range(2013, 2018))
val_years = [2018]

In [23]:
# Split rows by calendar year using boolean masks.
is_train_row = total['year'].isin(train_years)
is_val_row = total['year'].isin(val_years)
train = total[is_train_row]
val = total[is_val_row]

In [24]:
# Model inputs and the regression target column.
features = ['구분', 'month', 'day', 'weekday', '시간']
target = '공급량'

# Feature matrix / target vector for the training and validation splits.
train_x, train_y = train[features], train[target]
val_x, val_y = val[features], val[target]

**========================== 데이터 탐색 ==========================**

In [25]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,연월일,시간,구분,공급량,year,month,day,weekday
0,2013-01-01,1,0,2497.129,2013,1,1,1
1,2013-01-01,2,0,2363.265,2013,1,1,1
2,2013-01-01,3,0,2258.505,2013,1,1,1
3,2013-01-01,4,0,2243.969,2013,1,1,1
4,2013-01-01,5,0,2344.105,2013,1,1,1


In [26]:
# Mean of each numeric column. numeric_only=True is passed explicitly so the
# datetime column '연월일' is excluded on request rather than through the legacy
# silent-drop behaviour, which newer pandas versions no longer apply.
train.mean(numeric_only=True)

  """Entry point for launching an IPython kernel.


시간           12.500000
구분            3.000000
공급량         934.864036
year       2015.000548
month         6.523549
day          15.727820
weekday       3.001643
dtype: float64

In [38]:
# Row count per calendar date, in chronological order.
train['연월일'].value_counts().sort_index()

2013-01-01    168
2013-01-02    168
2013-01-03    168
2013-01-04    168
2013-01-05    168
             ... 
2017-12-27    168
2017-12-28    168
2017-12-29    168
2017-12-30    168
2017-12-31    168
Name: 연월일, Length: 1826, dtype: int64

In [37]:
# Row count per value of '시간', sorted by value.
train['시간'].value_counts().sort_index()

1     12782
2     12782
3     12782
4     12782
5     12782
6     12782
7     12782
8     12782
9     12782
10    12782
11    12782
12    12782
13    12782
14    12782
15    12782
16    12782
17    12782
18    12782
19    12782
20    12782
21    12782
22    12782
23    12782
24    12782
Name: 시간, dtype: int64

In [40]:
# Row count per month, sorted by month number.
train['month'].value_counts().sort_index()

1     26040
2     23688
3     26040
4     25200
5     26040
6     25200
7     26040
8     26040
9     25200
10    26040
11    25200
12    26040
Name: month, dtype: int64

In [41]:
# Row count per encoded '구분' value.
train['구분'].value_counts()

0    43824
1    43824
2    43824
3    43824
4    43824
5    43824
6    43824
Name: 구분, dtype: int64

In [44]:
# Row count for each (day, 구분) pair; only the first 20 groups are shown.
train.groupby(['day', '구분'])['구분'].count().head(20)

day  구분
1    0     1440
     1     1440
     2     1440
     3     1440
     4     1440
     5     1440
     6     1440
2    0     1440
     1     1440
     2     1440
     3     1440
     4     1440
     5     1440
     6     1440
3    0     1440
     1     1440
     2     1440
     3     1440
     4     1440
     5     1440
Name: 구분, dtype: int64

In [49]:
# Mean '공급량' for each (day, 구분) pair; only the first 21 groups are shown.
train.groupby(['day', '구분'])['공급량'].mean().head(21)

day  구분
1    0     1121.703235
     1      923.595769
     2      131.105686
     3      641.312431
     4     1340.766637
     5     1931.711823
     6      241.189189
2    0     1153.970351
     1      946.289130
     2      132.844281
     3      647.864620
     4     1357.675740
     5     1971.996178
     6      243.762145
3    0     1154.472406
     1      938.130430
     2      133.585787
     3      649.033764
     4     1355.624130
     5     1989.205719
     6      243.599644
Name: 공급량, dtype: float64

In [50]:
# Mean '공급량' for each (month, 구분) pair; only the first 21 groups are shown.
train.groupby(['month', '구분'])['공급량'].mean().head(21)

month  구분
1      0     2264.479202
       1     1949.568196
       2      211.046387
       3     1314.326480
       4     2810.877849
       5     3488.710961
       6      504.510344
2      0     2038.984674
       1     1750.956749
       2      199.824572
       3     1181.413052
       4     2542.326019
       5     3190.769040
       6      456.252704
3      0     1475.181535
       1     1252.847961
       2      159.513935
       3      875.788733
       4     1828.222452
       5     2548.262901
       6      334.820378
Name: 공급량, dtype: float64

**========================== 데이터 탐색 END ==========================**

# 학습

In [11]:
# Wrap the feature/target frames in LightGBM datasets.
d_train = lgb.Dataset(train_x, train_y)
d_val = lgb.Dataset(val_x, val_y)

# Regression objective, evaluated with mean absolute error; fixed seed.
params = {
    'objective': 'regression',
    'metric': 'mae',
    'seed': 42,
}

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgb.train() in LightGBM 4.0. Older releases expose the logging callback as
# `print_evaluation`, so fall back to that name when `log_evaluation` is absent.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
callbacks = [
    lgb.early_stopping(stopping_rounds=10),  # stop after 10 rounds without improvement
    log_evaluation(period=20),               # print the validation metric every 20 rounds
]

# Train for at most 500 boosting rounds, monitoring the validation set.
model = lgb.train(
    params,
    d_train,
    num_boost_round=500,
    valid_sets=[d_val],
    callbacks=callbacks,
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 306768, number of used features: 5
[LightGBM] [Info] Start training from score 934.864036
Training until validation scores don't improve for 10 rounds
[20]	valid_0's l1: 244.857
[40]	valid_0's l1: 174.855
[60]	valid_0's l1: 158.739
[80]	valid_0's l1: 153.323
[100]	valid_0's l1: 150.948
[120]	valid_0's l1: 150.463
Early stopping, best iteration is:
[112]	valid_0's l1: 150.297


# 추론 및 결과 제출

In [12]:
# Load the test keys and the submission template from the same data directory
# (data_path) used for the training CSV, instead of a separate relative path.
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

In [13]:
# Preview the first rows of the test keys.
test.head()

Unnamed: 0,일자|시간|구분
0,2019-01-01 01 A
1,2019-01-01 02 A
2,2019-01-01 03 A
3,2019-01-01 04 A
4,2019-01-01 05 A


In [14]:
# Preview the first rows of the submission template.
submission.head()

Unnamed: 0,일자|시간|구분,공급량
0,2019-01-01 01 A,0
1,2019-01-01 02 A,0
2,2019-01-01 03 A,0
3,2019-01-01 04 A,0
4,2019-01-01 05 A,0


In [15]:
# Break the space-separated composite key into its date, hour and label parts.
key_tokens = test['일자|시간|구분'].str.split(' ')
test['일자'] = key_tokens.str[0]
test['시간'] = key_tokens.str[1].astype(int)
test['구분'] = key_tokens.str[2]

In [16]:
# Parse the date strings, then derive the same calendar features as for the training data.
test['일자'] = pd.to_datetime(test['일자'])
test_dates = test['일자'].dt
for part in ('year', 'month', 'day', 'weekday'):
    test[part] = getattr(test_dates, part)

In [17]:
# Apply the training-time label encoding (d_map) to the test set.
# Guarded so that re-running the cell does not push the already-encoded integers
# through d_map again (which would replace every value with NaN), and checked so
# that a label missing from d_map fails loudly instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(test['구분']):
    unseen_labels = set(test['구분'].unique()) - set(d_map)
    if unseen_labels:
        raise ValueError(
            "labels in test['구분'] missing from d_map: {}".format(
                sorted(map(str, unseen_labels))
            )
        )
    test['구분'] = test['구분'].map(d_map)

In [18]:
# Select the model input columns, in the same order as used for training.
test_x = test.loc[:, features]

In [19]:
# Display the test feature matrix.
test_x

Unnamed: 0,구분,month,day,weekday,시간
0,0,1,1,1,1
1,0,1,1,1,2
2,0,1,1,1,3
3,0,1,1,1,4
4,0,1,1,1,5
...,...,...,...,...,...
15115,6,3,31,6,20
15116,6,3,31,6,21
15117,6,3,31,6,22
15118,6,3,31,6,23


In [20]:
# Predict with the best iteration found by early stopping; passing it explicitly
# matches what predict() selects when num_iteration is left at its default.
preds = model.predict(test_x, num_iteration=model.best_iteration)

In [21]:
# Overwrite the '공급량' column of the submission template with the predictions.
# NOTE(review): assumes submission rows are in the same order as test rows — confirm.
submission['공급량'] = preds

In [22]:
# Write the filled-in submission to the working directory, without the index column.
output_file = 'baseline.csv'
submission.to_csv(output_file, index=False)