### 복합 화력발전소 발전량 예측
- Feature
    - AT (temperature) : 온도
    - V (Exhaust Vacuum) : 배기진공
    - AP (Ambient Pressure) : 주위압력
    - RH (Relative Humidity) : 상대습도
- Target
    - PE (net hourly eletrical energy output) : 시간당 전기 에너지 출력

#### 1. 데이터 로드

In [2]:
import pickle

In [3]:
columns = ["AT", "V", "AP", "RH", "PE"]

with open("datas/thermal_power_plant.plk", "rb") as file:
    datas = pickle.load(file)
    
datas

array([[   8.34,   40.77, 1010.84,   90.01,  480.48],
       [  23.64,   58.49, 1011.4 ,   74.2 ,  445.75],
       [  29.74,   56.9 , 1007.15,   41.91,  438.76],
       ...,
       [  15.99,   43.34, 1014.2 ,   78.66,  465.96],
       [  17.65,   59.87, 1018.58,   94.65,  450.93],
       [  23.68,   51.3 , 1011.86,   71.24,  451.67]])

In [25]:
idx = int(len(datas)*0.3)

In [26]:
train_AT = datas[:-idx,0]
train_V = datas[:-idx,1]
train_AP = datas[:-idx,2]
train_RH = datas[:-idx,3]

train_PE = datas[:-idx,4]


test_AT = datas[-idx:,0]
test_V = datas[-idx:,1]
test_AP = datas[-idx:,2]
test_RH = datas[-idx:,3]

test_PE = datas[-idx:,4]

#### 2. 각 컬럼별 상관계수 구하기

In [28]:
# 상관계수들 // corrcoef함수는 매트릭스 형태로 값을 반환해주기 때문에 [0,1]의 인덱싱을 해준다.
np.corrcoef(train_AT,train_PE)[0,1]
np.corrcoef(train_V,train_PE)[0,1]
np.corrcoef(train_AP,train_PE)[0,1]
np.corrcoef(train_RH,train_PE)[0,1]

0.3879109242581739

In [29]:
# 결정계수들 // 상관계수의 제곱
cor_AT = np.corrcoef(train_AT,train_PE)[0,1]**2
cor_AP = np.corrcoef(train_AP,train_PE)[0,1]**2
cor_V = np.corrcoef(train_V,train_PE)[0,1]**2
cor_RH = np.corrcoef(train_RH,train_PE)[0,1]**2
cor_AT,cor_V,cor_AP,cor_RH

(0.8972298219790403,
 0.7597928458067955,
 0.2632116141898373,
 0.15047488515883073)

#### 3. 각 컬럼별 회귀분석 모델 만들기

In [18]:
from sklearn.linear_model import LinearRegression

In [31]:
model_AT = LinearRegression().fit(train_AT.reshape(-1,1),train_PE)
model_V = LinearRegression().fit(train_V.reshape(-1,1),train_PE)
model_AP = LinearRegression().fit(train_AP.reshape(-1,1),train_PE)
model_RH = LinearRegression().fit(train_RH.reshape(-1,1),train_PE)

In [39]:
## predict에 값이 들어갈때도 기본적으로는 여러 변수(column)의 데이터가 들어간다
## 그렇기 때문에 하나의 변수인 데이터를 넣기때문에 reshape을 해줘야만 함
predict_AT = model_AT.predict(test_AT.reshape(-1,1))
predict_V = model_V.predict(test_V.reshape(-1,1))
predict_AP = model_AP.predict(test_AP.reshape(-1,1))
predict_RH = model_RH.predict(test_RH.reshape(-1,1))

#### 4. 각 컬럼별 모델 평가하기

In [40]:
mae_AT = np.sum(np.absolute(predict_AT - test_PE)) / len(test_PE)
mae_V = np.sum(np.absolute(predict_V - test_PE)) / len(test_PE)
mae_AP = np.sum(np.absolute(predict_AP - test_PE)) / len(test_PE)
mae_RH = np.sum(np.absolute(predict_RH - test_PE)) / len(test_PE)

mae_AT, mae_V, mae_AP, mae_RH

(4.287757845516818, 6.709245406136585, 12.011595029145708, 13.34736775715531)

#### 5. 모든 컬럼을 사용하여 회귀분석 모델 만들어서 평가하기

In [47]:
train_all = datas[:-idx,:-1]
test_all = datas[-idx:,:-1]

train_PE = datas[:-idx,4]
test_PE = datas[-idx:,4]

In [49]:
model_all = LinearRegression().fit(train_all,train_PE)

In [51]:
predict_all = model_all.predict(test_all)

In [52]:
all_mae = np.sum(np.absolute(predict_all - test_PE)) / len(test_PE)

In [53]:
all_mae

3.616714709526299