In [3]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import scipy as sp
import scipy.stats as sps

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

In [6]:
# Load the two input tables: the electricity readings and the hourly info table.
elec, info = (
    pd.read_csv(csv_path)
    for csv_path in ('../00. Data/elec.csv', '../00. Data/info.csv')
)

In [7]:
# Column dtypes and non-null counts of the raw electricity readings.
elec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24672 entries, 0 to 24671
Data columns (total 4 columns):
Date        24672 non-null object
DateHour    24672 non-null object
Minute      24672 non-null object
Value       24672 non-null int64
dtypes: int64(1), object(3)
memory usage: 771.1+ KB


In [8]:
# Show the raw readings (pandas abbreviates the repr to the first/last rows).
elec

Unnamed: 0,Date,DateHour,Minute,Value
0,2021-01-01 00:15:00,2021-01-01 00:00:00,15분,62
1,2021-01-01 00:30:00,2021-01-01 00:00:00,30분,61
2,2021-01-01 00:45:00,2021-01-01 00:00:00,45분,61
3,2021-01-01 01:00:00,2021-01-01 00:00:00,60분,61
4,2021-01-01 01:15:00,2021-01-01 01:00:00,15분,96
...,...,...,...,...
24667,2021-09-14 23:00:00,2021-09-14 22:00:00,60분,114
24668,2021-09-14 23:15:00,2021-09-14 23:00:00,15분,117
24669,2021-09-14 23:30:00,2021-09-14 23:00:00,30분,119
24670,2021-09-14 23:45:00,2021-09-14 23:00:00,45분,112


In [10]:
# Column dtypes and non-null counts of the hourly info table
# (a few columns contain missing values).
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6168 entries, 0 to 6167
Data columns (total 9 columns):
Date        6168 non-null object
생산량         6168 non-null int64
기온          6168 non-null float64
풍속          6165 non-null float64
습도          6168 non-null int64
강수량         6167 non-null float64
전기요금(계절)    6168 non-null float64
공장인원        6151 non-null float64
인건비         6168 non-null float64
dtypes: float64(6), int64(2), object(1)
memory usage: 433.8+ KB


In [13]:
# The timestamp columns are read as plain strings; convert them to datetime64
# so the .dt accessor and the date comparisons used later work.
for frame, stamp_cols in ((elec, ['Date', 'DateHour']), (info, ['Date'])):
    for col in stamp_cols:
        frame[col] = pd.to_datetime(frame[col])

# 전처리

## 단계 1

In [19]:
# Reshape to one row per DateHour with one column per Minute label.
# pivot_table applies its default aggregation (mean) to each cell.
quarter_hour_wide = elec.pivot_table(values='Value', index='DateHour', columns='Minute')
elec1 = quarter_hour_wide.reset_index()
elec1

Minute,DateHour,15분,30분,45분,60분
0,2021-01-01 00:00:00,62,61,61,61
1,2021-01-01 01:00:00,96,93,116,113
2,2021-01-01 02:00:00,106,96,106,107
3,2021-01-01 03:00:00,92,110,110,109
4,2021-01-01 04:00:00,108,105,106,108
...,...,...,...,...,...
6163,2021-09-14 19:00:00,152,151,171,139
6164,2021-09-14 20:00:00,124,130,128,130
6165,2021-09-14 21:00:00,134,130,125,124
6166,2021-09-14 22:00:00,100,109,120,114


## 단계 2

In [21]:
# Calendar features derived from the hour-start timestamp.
# NOTE: 'DayName' stores the integer day of week (0 = Monday ... 6 = Sunday).
elec1['DayName'] = elec1['DateHour'].dt.dayofweek
elec1['Hour'] = elec1['DateHour'].dt.hour
# NOTE: 'AM' is 0 for hours before noon and 1 from noon onwards.
elec1['AM'] = np.where(elec1['Hour'] < 12, 0, 1)
# Saturday (5) / Sunday (6) flag, computed vectorised instead of a per-row lambda.
elec1['Weekend_yn'] = elec1['DayName'].isin([5, 6]).astype('int64')

holiday_list = ["2021-01-01", "2021-02-11", "2021-02-12", "2021-03-01", "2021-05-05", "2021-05-19", "2021-08-16"]
# Compare on the 'YYYY-MM-DD' text of each timestamp so every hour of a holiday is flagged.
elec1['Holiday_yn'] = np.where(elec1['DateHour'].dt.strftime('%Y-%m-%d').isin(holiday_list), 1, 0)

# Select the four quarter-hour columns by label for both aggregates, so neither
# depends on column positions or on how many columns were added before them.
quarter_cols = ['15분', '30분', '45분', '60분']
elec1['Avg'] = elec1[quarter_cols].mean(axis=1)
elec1['TotalHour'] = elec1[quarter_cols].sum(axis=1)
elec1

Minute,DateHour,15분,30분,45분,60분,DayName,Hour,AM,Weekend_yn,Holiday_yn,Avg,TotalHour
0,2021-01-01 00:00:00,62,61,61,61,4,0,0,0,1,61.25,245
1,2021-01-01 01:00:00,96,93,116,113,4,1,0,0,1,104.50,418
2,2021-01-01 02:00:00,106,96,106,107,4,2,0,0,1,103.75,415
3,2021-01-01 03:00:00,92,110,110,109,4,3,0,0,1,105.25,421
4,2021-01-01 04:00:00,108,105,106,108,4,4,0,0,1,106.75,427
...,...,...,...,...,...,...,...,...,...,...,...,...
6163,2021-09-14 19:00:00,152,151,171,139,1,19,1,0,0,153.25,613
6164,2021-09-14 20:00:00,124,130,128,130,1,20,1,0,0,128.00,512
6165,2021-09-14 21:00:00,134,130,125,124,1,21,1,0,0,128.25,513
6166,2021-09-14 22:00:00,100,109,120,114,1,22,1,0,0,110.75,443


## 단계 3

In [23]:
# Replace every missing value with 0.  fillna already returns a new DataFrame,
# so `info` stays untouched and the extra .copy() was redundant.
# NOTE(review): zero-filling affects the columns that have nulls in info.info()
# above — confirm 0 is the intended imputation for them.
info1 = info.fillna(0)
info1

Unnamed: 0,Date,생산량,기온,풍속,습도,강수량,전기요금(계절),공장인원,인건비
0,2021-01-01 00:00:00,0,-3.2,2.4,71,0.0,109.8,0.000000,1.5
1,2021-01-01 01:00:00,0,-4.5,1.5,77,0.0,109.8,0.000000,1.5
2,2021-01-01 02:00:00,0,-3.9,2.6,58,0.0,109.8,0.000000,1.5
3,2021-01-01 03:00:00,0,-4.1,2.6,56,0.0,109.8,0.000000,1.5
4,2021-01-01 04:00:00,0,-4.6,2.6,60,0.0,109.8,0.000000,1.5
...,...,...,...,...,...,...,...,...,...
6163,2021-09-14 19:00:00,1497,21.7,3.6,85,9.4,167.2,2.442088,1.5
6164,2021-09-14 20:00:00,45,22.2,4.2,78,9.4,167.2,0.087891,1.5
6165,2021-09-14 21:00:00,149,22.2,4.3,76,9.4,167.2,0.290448,1.5
6166,2021-09-14 22:00:00,66,22.0,2.5,79,9.4,167.2,0.148984,1.5


## 단계 4

In [24]:
# Join the hourly electricity features with the hourly info table on the
# hour-start timestamp.  validate='one_to_one' makes pandas raise a MergeError
# if either side has a duplicated hour, instead of silently multiplying rows.
basetable1 = pd.merge(
    left=elec1,
    right=info1,
    left_on='DateHour',
    right_on='Date',
    how='inner',
    validate='one_to_one',
).drop(columns=['Date'])  # 'Date' duplicates 'DateHour' after the join
basetable1

Unnamed: 0,DateHour,15분,30분,45분,60분,DayName,Hour,AM,Weekend_yn,Holiday_yn,Avg,TotalHour,생산량,기온,풍속,습도,강수량,전기요금(계절),공장인원,인건비
0,2021-01-01 00:00:00,62,61,61,61,4,0,0,0,1,61.25,245,0,-3.2,2.4,71,0.0,109.8,0.000000,1.5
1,2021-01-01 01:00:00,96,93,116,113,4,1,0,0,1,104.50,418,0,-4.5,1.5,77,0.0,109.8,0.000000,1.5
2,2021-01-01 02:00:00,106,96,106,107,4,2,0,0,1,103.75,415,0,-3.9,2.6,58,0.0,109.8,0.000000,1.5
3,2021-01-01 03:00:00,92,110,110,109,4,3,0,0,1,105.25,421,0,-4.1,2.6,56,0.0,109.8,0.000000,1.5
4,2021-01-01 04:00:00,108,105,106,108,4,4,0,0,1,106.75,427,0,-4.6,2.6,60,0.0,109.8,0.000000,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163,2021-09-14 19:00:00,152,151,171,139,1,19,1,0,0,153.25,613,1497,21.7,3.6,85,9.4,167.2,2.442088,1.5
6164,2021-09-14 20:00:00,124,130,128,130,1,20,1,0,0,128.00,512,45,22.2,4.2,78,9.4,167.2,0.087891,1.5
6165,2021-09-14 21:00:00,134,130,125,124,1,21,1,0,0,128.25,513,149,22.2,4.3,76,9.4,167.2,0.290448,1.5
6166,2021-09-14 22:00:00,100,109,120,114,1,22,1,0,0,110.75,443,66,22.0,2.5,79,9.4,167.2,0.148984,1.5


# 문제 1

## 문제 1-1

In [25]:
# Overall spread of hourly totals: largest minus smallest value.
total_hour = basetable1['TotalHour']
total_hour.max() - total_hour.min()

830

## 문제 1-2

In [26]:
# Inspect the timestamp column used for the per-day grouping below.
basetable1.DateHour

0      2021-01-01 00:00:00
1      2021-01-01 01:00:00
2      2021-01-01 02:00:00
3      2021-01-01 03:00:00
4      2021-01-01 04:00:00
               ...        
6163   2021-09-14 19:00:00
6164   2021-09-14 20:00:00
6165   2021-09-14 21:00:00
6166   2021-09-14 22:00:00
6167   2021-09-14 23:00:00
Name: DateHour, Length: 6168, dtype: datetime64[ns]

In [51]:
# For every calendar day, mark all hours whose total equals that day's maximum
# (a day with tied maxima contributes several hours), then count by hour.
daily_peak = basetable1.groupby(basetable1.DateHour.dt.date)['TotalHour'].transform('max')
is_daily_peak = basetable1['TotalHour'] == daily_peak
basetable1.loc[is_daily_peak, 'Hour'].value_counts()

8     57
11    48
13    31
9     28
4     26
23    16
0     15
14    14
10     9
2      9
16     8
17     5
6      5
21     5
12     4
1      4
7      3
15     2
3      2
22     1
18     1
Name: Hour, dtype: int64

In [28]:
# One peak hour per calendar day: idxmax returns the row label of the first
# maximum in each group, so ties are resolved towards the earliest hour.
# A single vectorised lookup replaces the per-group apply/loc call.
peak_rows = basetable1.groupby(basetable1.DateHour.dt.date)['TotalHour'].idxmax()
basetable1.loc[peak_rows, 'Hour'].value_counts()

8     57
11    42
9     28
13    28
4     25
0     15
14    12
2      9
10     8
16     7
6      5
17     5
1      4
12     3
21     3
3      2
22     1
15     1
7      1
23     1
dtype: int64

## 문제 1-3

In [38]:
# Largest within-day spread (daily max minus daily min) of the hourly totals.
daily_total = basetable1.groupby(basetable1.DateHour.dt.date)['TotalHour']
(daily_total.max() - daily_total.min()).max()

706

## 문제 1-4

In [42]:
# Keep only rows that are neither weekend nor holiday; copy so later work on
# `weekday` is independent of `basetable1`.
is_working_day = basetable1['Weekend_yn'].eq(0) & basetable1['Holiday_yn'].eq(0)
weekday = basetable1.loc[is_working_day].copy()

In [53]:
# Within-day spread of hourly totals on working days, largest first.
weekday_daily = weekday.groupby(weekday.DateHour.dt.date)['TotalHour']
weekday_range = weekday_daily.max() - weekday_daily.min()
weekday_range.sort_values(ascending=False)

DateHour
2021-07-19    706
2021-07-26    697
2021-08-09    692
2021-07-12    677
2021-01-21    669
             ... 
2021-08-03     12
2021-03-26     12
2021-08-05     11
2021-08-04     11
2021-08-02     11
Name: TotalHour, Length: 176, dtype: int64

In [52]:
# Day-of-week code of the day with the largest spread found above.
# The bare date string is compared as a timestamp, so only that day's
# 00:00 row matches.
is_peak_day_midnight = weekday['DateHour'] == '2021-07-19'
weekday.loc[is_peak_day_midnight, 'DayName']

4776    0
Name: DayName, dtype: int64

## 정답

In [49]:
# Answers to questions 1-1 .. 1-4, transcribed from the cell outputs above.
answers_q1 = (830, 8, 706, '2021-07-19', '월요일')
print(*answers_q1)

830 8 706 2021-07-19 월요일


# 문제 2

In [55]:
# Hourly averages on working days, split by the AM flag (0 = before noon, 1 = noon onwards).
avg_by_flag = {flag: weekday.loc[weekday['AM'] == flag, 'Avg'] for flag in (0, 1)}
am = avg_by_flag[0]
pm = avg_by_flag[1]

In [57]:
# Bartlett's test for equal variances; the result is (statistic, p-value)
# and only the p-value is needed.
pvalue = sps.bartlett(am, pm)[1]
pvalue

6.451706909079159e-17

In [67]:
# Two-sample t-test.  Equal variances are assumed only when Bartlett's test did
# not reject them at the 5% level; otherwise Welch's variant is used.
variances_look_equal = pvalue > 0.05
ttest_result = sps.ttest_ind(am, pm, equal_var=variances_look_equal)
ans = ttest_result.statistic
ans

-11.659445083355282

In [63]:
# Report the magnitude of the t statistic to three decimals.
abs_statistic = abs(ans)
round(abs_statistic, 3)

11.659

# 문제 3

In [159]:
# Split the raw series at 2021-09-01.  .copy() makes each part an independent
# frame, so the lag columns added in the following cells are written to
# train1/test1 themselves rather than to a slice of `elec` (the situation
# pandas reports as SettingWithCopyWarning).
train1 = elec.loc[ elec.Date < '2021-09-01', ['Date', 'Value'] ].copy()
test1 = elec.loc[ elec.Date >= '2021-09-01', ['Date', 'Value'] ].copy()

In [160]:
# Attach the next three readings to every row: shift(-k) pulls the value from
# k rows later, so 't-2', 't-1' and 't-0' hold the readings 1, 2 and 3 steps
# after the row's own 'Value'.
# NOTE(review): the loop in the next cell recomputes exactly these three
# columns for train1, so this cell is redundant.
train1['t-2'] = train1.Value.shift(-1)
train1['t-1'] = train1.Value.shift(-2)
train1['t-0'] = train1.Value.shift(-3)

In [161]:
# Columns 't-2', 't-1', 't-0' = the readings 1, 2 and 3 rows after each row.
for step in (1, 2, 3):
    train1[f't-{3 - step}'] = train1['Value'].shift(-step)

In [162]:
# Same look-ahead columns for the test part; built after the split, so no
# window reaches across the train/test boundary.
for step in (1, 2, 3):
    test1[f't-{3 - step}'] = test1['Value'].shift(-step)

In [163]:
# The row's own reading becomes the oldest lag and the furthest look-ahead
# column becomes the prediction target.
lag_renames_q3 = {'Value': 't-3', 't-0': 'target'}
train1, test1 = (part.rename(columns=lag_renames_q3) for part in (train1, test1))

In [164]:
# Drop the trailing rows whose look-ahead values ran past the end of the frame.
train1, test1 = train1.dropna(), test1.dropna()

In [165]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [166]:
# Feature matrix: the three consecutive readings that precede the target.
lag_cols_q3 = ['t-3', 't-2', 't-1']
x_train = train1.loc[:, lag_cols_q3]
x_test = test1.loc[:, lag_cols_q3]

In [167]:
# Response vector: the reading that follows the three lag features.
y_train, y_test = train1.loc[:, 'target'], test1.loc[:, 'target']

In [168]:
# Ordinary least squares on the three lag features.
lm1 = LinearRegression()
lm1 = lm1.fit(x_train, y_train)

In [169]:
# Mean absolute error on the held-out part, reported to two decimals.
y_hat = lm1.predict(x_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_hat)
round(mae, 2)

8.97

# 문제 4

In [170]:
# Same split as in question 3.  .copy() makes each part an independent frame,
# so the lag columns added in the next cell are written to train2/test2
# themselves rather than to a slice of `elec`.
train2 = elec.loc[ elec.Date < '2021-09-01', ['Date', 'Value'] ].copy()
test2 = elec.loc[ elec.Date >= '2021-09-01', ['Date', 'Value'] ].copy()

In [171]:
def _with_lookahead_columns(frame, n_steps):
    """Return `frame` plus `n_steps` columns holding the following readings.

    Column 't-{n_steps - k}' contains `frame['Value']` shifted up by k rows
    (k = 1 .. n_steps), i.e. the reading k rows after each row; the last
    column is 't-0'.  All shifted columns are built first and attached with a
    single concat instead of being inserted one at a time.
    """
    steps = range(1, n_steps + 1)
    shifted = pd.concat(
        [frame['Value'].shift(-step) for step in steps],
        axis=1,
        keys=[f't-{n_steps - step}' for step in steps],
    )
    return pd.concat([frame, shifted], axis=1)


# 95 look-ahead columns ('t-94' .. 't-0') for each part; built per part so no
# window reaches across the train/test boundary.
train2 = _with_lookahead_columns(train2, 95)
test2 = _with_lookahead_columns(test2, 95)

In [172]:
# Inspect the widened training frame (trailing rows contain NaN where the
# look-ahead ran past the end).
train2

Unnamed: 0,Date,Value,t-94,t-93,t-92,t-91,t-90,t-89,t-88,t-87,...,t-9,t-8,t-7,t-6,t-5,t-4,t-3,t-2,t-1,t-0
0,2021-01-01 00:15:00,62,61.0,61.0,61.0,96.0,93.0,116.0,113.0,106.0,...,22.0,27.0,26.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
1,2021-01-01 00:30:00,61,61.0,61.0,96.0,93.0,116.0,113.0,106.0,96.0,...,27.0,26.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,63.0
2,2021-01-01 00:45:00,61,61.0,96.0,93.0,116.0,113.0,106.0,96.0,106.0,...,26.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,63.0,63.0
3,2021-01-01 01:00:00,61,96.0,93.0,116.0,113.0,106.0,96.0,106.0,107.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,63.0,63.0,64.0
4,2021-01-01 01:15:00,96,93.0,116.0,113.0,106.0,96.0,106.0,107.0,92.0,...,22.0,22.0,22.0,22.0,22.0,22.0,63.0,63.0,64.0,63.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23322,2021-08-31 22:45:00,127,112.0,103.0,128.0,118.0,,,,,...,,,,,,,,,,
23323,2021-08-31 23:00:00,112,103.0,128.0,118.0,,,,,,...,,,,,,,,,,
23324,2021-08-31 23:15:00,103,128.0,118.0,,,,,,,...,,,,,,,,,,
23325,2021-08-31 23:30:00,128,118.0,,,,,,,,...,,,,,,,,,,


In [173]:
# The row's own reading becomes the oldest lag ('t-95') and the furthest
# look-ahead column becomes the prediction target.
lag_renames_q4 = {'Value': 't-95', 't-0': 'target'}
train2, test2 = (part.rename(columns=lag_renames_q4) for part in (train2, test2))

In [174]:
# Inspect the widened test frame after renaming.
test2

Unnamed: 0,Date,t-95,t-94,t-93,t-92,t-91,t-90,t-89,t-88,t-87,...,t-9,t-8,t-7,t-6,t-5,t-4,t-3,t-2,t-1,target
23327,2021-09-01 00:00:00,100,83.0,74.0,69.0,68.0,98.0,107.0,117.0,114.0,...,133.0,132.0,126.0,108.0,128.0,119.0,122.0,119.0,121.0,111.0
23328,2021-09-01 00:15:00,83,74.0,69.0,68.0,98.0,107.0,117.0,114.0,121.0,...,132.0,126.0,108.0,128.0,119.0,122.0,119.0,121.0,111.0,103.0
23329,2021-09-01 00:30:00,74,69.0,68.0,98.0,107.0,117.0,114.0,121.0,118.0,...,126.0,108.0,128.0,119.0,122.0,119.0,121.0,111.0,103.0,87.0
23330,2021-09-01 00:45:00,69,68.0,98.0,107.0,117.0,114.0,121.0,118.0,111.0,...,108.0,128.0,119.0,122.0,119.0,121.0,111.0,103.0,87.0,79.0
23331,2021-09-01 01:00:00,68,98.0,107.0,117.0,114.0,121.0,118.0,111.0,105.0,...,128.0,119.0,122.0,119.0,121.0,111.0,103.0,87.0,79.0,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24667,2021-09-14 23:00:00,114,117.0,119.0,112.0,91.0,,,,,...,,,,,,,,,,
24668,2021-09-14 23:15:00,117,119.0,112.0,91.0,,,,,,...,,,,,,,,,,
24669,2021-09-14 23:30:00,119,112.0,91.0,,,,,,,...,,,,,,,,,,
24670,2021-09-14 23:45:00,112,91.0,,,,,,,,...,,,,,,,,,,


In [175]:
# Keep only complete windows (rows where every look-ahead value exists).
train2, test2 = train2.dropna(), test2.dropna()

In [176]:
# Features: the 95 consecutive readings 't-95' .. 't-1'; response: 'target'.
x_train = train2.loc[:, 't-95':'t-1']
x_test = test2.loc[:, 't-95':'t-1']

y_train, y_test = train2.loc[:, 'target'], test2.loc[:, 'target']

## 회귀모형 적용 1

In [177]:
# Ordinary least squares on all 95 lag features.
lm2 = LinearRegression()
lm2 = lm2.fit(x_train, y_train)

In [179]:
# Held-out mean absolute error of the 95-lag linear model.
y_hat = lm2.predict(x_test)
mae1 = mean_absolute_error(y_true=y_test, y_pred=y_hat)
mae1

7.68969779872557

## PCA 적용

In [180]:
from sklearn.decomposition import PCA

In [181]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance (99% here).
pca = PCA( n_components=0.99, random_state=123 )

In [182]:
# Learn the components from the training lags only; the test lags are
# projected with the same fitted object further down.
pca.fit( x_train )

PCA(copy=True, iterated_power='auto', n_components=0.99, random_state=123,
    svd_solver='auto', tol=0.0, whiten=False)

In [183]:
# Number of components the fitted PCA actually kept.
pca.n_components_

40

In [184]:
# Project both lag matrices onto the components learned from the training part.
tr_x2, te_x2 = (pca.transform(lag_matrix) for lag_matrix in (x_train, x_test))

## 회귀모형 적용 2

In [185]:
# Ordinary least squares on the PCA scores instead of the raw lags.
lm3 = LinearRegression()
lm3 = lm3.fit(tr_x2, y_train)

In [186]:
# Held-out mean absolute error of the PCA + linear model.
y_hat = lm3.predict(te_x2)
mae2 = mean_absolute_error(y_true=y_test, y_pred=y_hat)
mae2

9.343359300942199

In [187]:
# Both errors to two decimals: raw-lag model first, PCA model second.
print(*(round(score, 2) for score in (mae1, mae2)))

7.69 9.34


# 문제 5

## SVM 적용

In [188]:
from sklearn.svm import SVR

In [189]:
# Support-vector regression on the PCA scores with C=100 and otherwise
# default settings.
# NOTE(review): the predictions recorded further down are the same value on
# every displayed row, so the model appears to output a near-constant — TODO
# confirm whether the task prescribes these settings or whether the inputs
# should be scaled / gamma tuned.
svr = SVR( C=100 ).fit( tr_x2, y_train )

In [190]:
# Held-out mean absolute error of the SVR; y_hat is reused for the fee
# calculation below.
y_hat = svr.predict(te_x2)
mae = mean_absolute_error(y_true=y_test, y_pred=y_hat)
mae

43.24670670897594

## 전기료 산출

In [191]:
# Truncate every timestamp to the start of its hour so it can be joined with
# the hourly `info1` table.  dt.floor does this directly, without formatting
# the timestamps to strings and parsing them back.
# NOTE(review): `Date` is the timestamp of the first lag (t-95) of each window,
# not of the `target` reading 95 steps later — confirm that the tariff at the
# window start is the one the fee calculation should use.
test2['Date'] = test2['Date'].dt.floor('h')

In [192]:
# Attach the hourly info (including the tariff) to every test window, then add
# the SVR predictions; the left join keeps the rows of test2 in order, so the
# positional assignment of y_hat lines up.
fee = test2.merge(info1, how='left', on='Date')
fee['pred'] = y_hat
fee

Unnamed: 0,Date,t-95,t-94,t-93,t-92,t-91,t-90,t-89,t-88,t-87,...,target,생산량,기온,풍속,습도,강수량,전기요금(계절),공장인원,인건비,pred
0,2021-09-01 00:00:00,100,83.0,74.0,69.0,68.0,98.0,107.0,117.0,114.0,...,111.0,178,21.8,1.7,94,0.0,167.2,0.605442,1.5,101.1657
1,2021-09-01 00:00:00,83,74.0,69.0,68.0,98.0,107.0,117.0,114.0,121.0,...,103.0,178,21.8,1.7,94,0.0,167.2,0.605442,1.5,101.1657
2,2021-09-01 00:00:00,74,69.0,68.0,98.0,107.0,117.0,114.0,121.0,118.0,...,87.0,178,21.8,1.7,94,0.0,167.2,0.605442,1.5,101.1657
3,2021-09-01 00:00:00,69,68.0,98.0,107.0,117.0,114.0,121.0,118.0,111.0,...,79.0,178,21.8,1.7,94,0.0,167.2,0.605442,1.5,101.1657
4,2021-09-01 01:00:00,68,98.0,107.0,117.0,114.0,121.0,118.0,111.0,105.0,...,75.0,1160,21.5,1.5,95,0.0,167.2,2.660550,1.5,101.1657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,2021-09-13 23:00:00,111,123.0,110.0,87.0,72.0,70.0,70.0,67.0,98.0,...,114.0,345,21.4,1.4,80,0.0,167.2,0.800464,1.5,101.1657
1246,2021-09-13 23:00:00,123,110.0,87.0,72.0,70.0,70.0,67.0,98.0,116.0,...,117.0,345,21.4,1.4,80,0.0,167.2,0.800464,1.5,101.1657
1247,2021-09-13 23:00:00,110,87.0,72.0,70.0,70.0,67.0,98.0,116.0,113.0,...,119.0,345,21.4,1.4,80,0.0,167.2,0.800464,1.5,101.1657
1248,2021-09-14 00:00:00,87,72.0,70.0,70.0,67.0,98.0,116.0,113.0,113.0,...,112.0,128,21.9,1.6,80,0.0,167.2,0.458781,1.5,101.1657


In [193]:
# Fee implied by the actual readings: reading times tariff, row by row.
tariff = fee['전기요금(계절)']
target_fee = fee['target'].mul(tariff)
target_fee

0       18559.2
1       17221.6
2       14546.4
3       13208.8
4       12540.0
         ...   
1245    19060.8
1246    19562.4
1247    19896.8
1248    18726.4
1249    15215.2
Length: 1250, dtype: float64

In [194]:
# Fee implied by the predicted readings: prediction times tariff, row by row.
tariff = fee['전기요금(계절)']
pred_fee = fee['pred'].mul(tariff)
pred_fee

0       16914.905078
1       16914.905078
2       16914.905078
3       16914.905078
4       16914.905078
            ...     
1245    16914.905078
1246    16914.905078
1247    16914.905078
1248    16914.905078
1249    16914.905078
Length: 1250, dtype: float64

In [195]:
# Total difference between actual and predicted fee, truncated to an integer.
fee_gap = target_fee - pred_fee
fee_ans = int(fee_gap.sum())
fee_ans

924410

In [196]:
# Question 5 answers: SVR error to one decimal and the total fee difference.
mae_rounded = round(mae, 1)
print(mae_rounded, fee_ans)

43.2 924410


# 문제 6

In [197]:
# Re-display the merged hourly table used as the starting point for question 6.
basetable1

Unnamed: 0,DateHour,15분,30분,45분,60분,DayName,Hour,AM,Weekend_yn,Holiday_yn,Avg,TotalHour,생산량,기온,풍속,습도,강수량,전기요금(계절),공장인원,인건비
0,2021-01-01 00:00:00,62,61,61,61,4,0,0,0,1,61.25,245,0,-3.2,2.4,71,0.0,109.8,0.000000,1.5
1,2021-01-01 01:00:00,96,93,116,113,4,1,0,0,1,104.50,418,0,-4.5,1.5,77,0.0,109.8,0.000000,1.5
2,2021-01-01 02:00:00,106,96,106,107,4,2,0,0,1,103.75,415,0,-3.9,2.6,58,0.0,109.8,0.000000,1.5
3,2021-01-01 03:00:00,92,110,110,109,4,3,0,0,1,105.25,421,0,-4.1,2.6,56,0.0,109.8,0.000000,1.5
4,2021-01-01 04:00:00,108,105,106,108,4,4,0,0,1,106.75,427,0,-4.6,2.6,60,0.0,109.8,0.000000,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163,2021-09-14 19:00:00,152,151,171,139,1,19,1,0,0,153.25,613,1497,21.7,3.6,85,9.4,167.2,2.442088,1.5
6164,2021-09-14 20:00:00,124,130,128,130,1,20,1,0,0,128.00,512,45,22.2,4.2,78,9.4,167.2,0.087891,1.5
6165,2021-09-14 21:00:00,134,130,125,124,1,21,1,0,0,128.25,513,149,22.2,4.3,76,9.4,167.2,0.290448,1.5
6166,2021-09-14 22:00:00,100,109,120,114,1,22,1,0,0,110.75,443,66,22.0,2.5,79,9.4,167.2,0.148984,1.5


In [198]:
# Keep only the hours in which something was produced (생산량 different from 0).
has_production = basetable1['생산량'].ne(0)
df = basetable1[has_production]
df

Unnamed: 0,DateHour,15분,30분,45분,60분,DayName,Hour,AM,Weekend_yn,Holiday_yn,Avg,TotalHour,생산량,기온,풍속,습도,강수량,전기요금(계절),공장인원,인건비
80,2021-01-04 08:00:00,154,153,159,159,0,8,0,0,0,156.25,625,165,-2.7,4.1,53,0.0,109.8,0.264000,1.5
81,2021-01-04 09:00:00,158,158,159,166,0,9,0,0,0,160.25,641,313,-1.5,2.4,48,0.0,109.8,0.488300,1.0
82,2021-01-04 10:00:00,127,158,173,168,0,10,0,0,0,156.50,626,2757,0.4,2.4,39,0.0,109.8,4.404153,1.0
83,2021-01-04 11:00:00,141,150,155,149,0,11,0,0,0,148.75,595,983,2.1,2.5,31,0.0,109.8,1.652101,1.0
84,2021-01-04 12:00:00,81,64,94,118,0,12,1,0,0,89.25,357,96,4.0,3.3,30,0.0,109.8,0.268908,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6163,2021-09-14 19:00:00,152,151,171,139,1,19,1,0,0,153.25,613,1497,21.7,3.6,85,9.4,167.2,2.442088,1.5
6164,2021-09-14 20:00:00,124,130,128,130,1,20,1,0,0,128.00,512,45,22.2,4.2,78,9.4,167.2,0.087891,1.5
6165,2021-09-14 21:00:00,134,130,125,124,1,21,1,0,0,128.25,513,149,22.2,4.3,76,9.4,167.2,0.290448,1.5
6166,2021-09-14 22:00:00,100,109,120,114,1,22,1,0,0,110.75,443,66,22.0,2.5,79,9.4,167.2,0.148984,1.5


In [200]:
# Time-based split at the same cut-off date as the earlier questions.
split_date = '2021-09-01'
train3 = df.loc[df['DateHour'] < split_date]
test3 = df.loc[df['DateHour'] >= split_date]

In [201]:
# Features are every column except the response (생산량) and the timestamp.
non_feature_cols = ['생산량', 'DateHour']
x_train3, x_test3 = (part.drop(columns=non_feature_cols) for part in (train3, test3))

y_train3, y_test3 = train3['생산량'], test3['생산량']

# Sanity check of the four shapes.
print(x_train3.shape, x_test3.shape, y_train3.shape, y_test3.shape)

(3277, 18) (234, 18) (3277,) (234,)


In [202]:
# Baseline: ordinary least squares on all features.
# NOTE: 'Avg' and 'TotalHour' are both computed from the four quarter-hour
# columns that are also in the feature set, so the features are exactly
# collinear; the very large coefficients in the table below reflect that.
baseModel = LinearRegression()
baseModel = baseModel.fit(x_train3, y_train3)

In [205]:
# Training error is printed; the held-out error is the cell's displayed value.
train_pred = baseModel.predict(x_train3)
print(mean_absolute_error(y_train3, train_pred))

y_hat = baseModel.predict(x_test3)
mean_absolute_error(y_test3, y_hat)

291.107621883913


242.450981641306

In [215]:
# Coefficient of every feature of the baseline model, smallest first.
base_coef_table = pd.DataFrame({'coef': baseModel.coef_, 'feature': x_train3.columns})
base_coef_table.sort_values(by='coef')

Unnamed: 0,coef,feature
10,-62171940000000.0,TotalHour
9,-15727400000000.0,Avg
5,-25.95862,Hour
4,-19.76842,DayName
11,-3.223022,기온
14,-0.2235718,강수량
13,-0.2196045,습도
15,2.367676,전기요금(계절)
12,5.535217,풍속
8,13.60444,Holiday_yn


In [216]:
from sklearn.linear_model import ElasticNet

In [248]:
# l1_ratio=1.0 makes the elastic-net penalty pure L1 (equivalent to a Lasso fit).
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell only runs on older releases — TODO confirm the
# pinned scikit-learn version before upgrading.
elastic = ElasticNet( l1_ratio=1.0, normalize=True, alpha=0.0001 ).fit( x_train3, y_train3 )

In [249]:
# Held-out mean absolute error of the penalised linear model.
y_hat = elastic.predict(x_test3)
mean_absolute_error(y_true=y_test3, y_pred=y_hat)

238.66187453469541

In [237]:
# Coefficient of every feature of the penalised model, smallest first.
elastic_coef_table = pd.DataFrame({'coef': elastic.coef_, 'feature': x_train3.columns})
elastic_coef_table.sort_values(by='coef')

Unnamed: 0,coef,feature
5,-25.974936,Hour
4,-19.610646,DayName
9,-3.873999,Avg
11,-3.213567,기온
1,-0.267677,30분
13,-0.228143,습도
14,-0.213495,강수량
10,-0.0,TotalHour
3,0.099054,60분
15,2.387617,전기요금(계절)


In [250]:
from xgboost import XGBRegressor

In [251]:
# Gradient-boosted trees: 3000 rounds of depth-5 trees, other settings default.
model = XGBRegressor(n_estimators=3000, max_depth=5)
model = model.fit(x_train3, y_train3)

In [252]:
# Training error is printed; the held-out error is the cell's displayed value.
train_pred = model.predict(x_train3)
print(mean_absolute_error(y_train3, train_pred))

y_hat = model.predict(x_test3)
mean_absolute_error(y_test3, y_hat)

0.04616957639170959


12.264791529402773

In [256]:
# Support-vector regression on three hand-picked features with C=10000.
# This rebinds `svr`, replacing the question-5 model.
svr = SVR(C=10000)
svr = svr.fit(x_train3.loc[:, ['공장인원', 'Avg', 'TotalHour']], y_train3)

In [257]:
# Same three columns the SVR was fitted on.
svr_inputs = ['공장인원', 'Avg', 'TotalHour']

# Training error is printed; the held-out error is the cell's displayed value.
y_hat = svr.predict(x_train3[svr_inputs])
print(mean_absolute_error(y_train3, y_hat))

y_hat = svr.predict(x_test3[svr_inputs])
mean_absolute_error(y_test3, y_hat)

0.6043732443698959


70.82578808399498