### 복합 화력발전소 발전량 예측
- Feature
    - AT (temperature) : 온도
    - V (Exhaust Vacuum) : 배기진공
    - AP (Ambient Pressure) : 주위압력
    - RH (Relative Humidity) : 상대습도
- Target
    - PE (net hourly electrical energy output) : 시간당 전기 에너지 출력

In [1]:
import pickle

import numpy as np
import pandas as pd

#### 1. 데이터 로드

In [2]:
# Column names for the loaded array: four features followed by the target (PE).
columns = ["AT", "V", "AP", "RH", "PE"]

# Read the pickled dataset; the cell output shows it is a 2-D numeric array.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("datas/thermal_power_plant.plk", "rb") as file:
    datas = pickle.load(file)

# Display the loaded array.
datas

array([[   8.34,   40.77, 1010.84,   90.01,  480.48],
       [  23.64,   58.49, 1011.4 ,   74.2 ,  445.75],
       [  29.74,   56.9 , 1007.15,   41.91,  438.76],
       ...,
       [  15.99,   43.34, 1014.2 ,   78.66,  465.96],
       [  17.65,   59.87, 1018.58,   94.65,  450.93],
       [  23.68,   51.3 , 1011.86,   71.24,  451.67]])

#### 2. 각 컬럼별 상관계수 구하기

In [12]:
# The last column is the target (PE); every other column is a feature.
target = datas[:, -1]

# Map each feature name to its column of the data array.
features = {name: datas[:, pos] for pos, name in enumerate(columns[:-1])}

In [18]:
# One row per feature: its correlation with the target and the squared
# correlation (stored as "deter"), both rounded to two decimals.
corr_df = pd.DataFrame(columns=["feature", "corr", "deter"])

for name, column in features.items():
    # corrcoef returns a 2x2 matrix; the off-diagonal entry is the
    # target/feature correlation.
    r = np.corrcoef(target, column)[0, 1]
    row = [name, np.round(r, 2), np.round(r ** 2, 2)]
    corr_df.loc[len(corr_df)] = row

corr_df

Unnamed: 0,feature,corr,deter
0,AT,-0.95,0.9
1,V,-0.87,0.76
2,AP,0.52,0.27
3,RH,0.39,0.15


#### 3. 각 컬럼별 회귀분석 모델 만들기

In [19]:
from sklearn.linear_model import LinearRegression

In [23]:
# Fit one single-feature linear regression per column, keyed by feature name.
# reshape(-1, 1) turns each 1-D column into the 2-D (n_samples, 1) input
# that fit() expects.
models = {
    name: LinearRegression().fit(column.reshape(-1, 1), target)
    for name, column in features.items()
}

In [26]:
# Show the model keys and one of the fitted estimators as a quick sanity check.
models.keys(), models["AT"]

(dict_keys(['AT', 'V', 'AP', 'RH']), LinearRegression())

#### 4. 각 컬럼별 모델 평가하기

In [36]:
from sklearn.metrics import mean_absolute_error

In [37]:
# One row per feature: the MAE of its single-feature model, rounded to
# two decimals.
score_df = pd.DataFrame(columns=["feature", "score"])

for name, fitted in models.items():
    # Predict with the same single-feature column the model was fitted on.
    predictions = fitted.predict(features[name].reshape(-1, 1))

    # Mean absolute error between actual and predicted values; equivalent to
    # np.sum(np.absolute(predictions - target)) / len(target).
    error = np.round(mean_absolute_error(target, predictions), 2)

    # Append the result as a new row of the data frame.
    score_df.loc[len(score_df)] = [name, error]

score_df

Unnamed: 0,feature,score
0,AT,4.29
1,V,6.58
2,AP,11.96
3,RH,13.19


#### 5. 모든 컬럼을 사용하여 회귀분석 모델 만들어서 평가하기
- 데이터셋 분리

In [39]:
# Fraction of rows used as the test set.
rate = 0.3

# Number of test rows, truncated to an integer.
idx = int(rate * len(datas))
idx

2870

In [42]:
# Positional split: the last `idx` rows form the test set and everything
# before them is the training set. All columns but the last are features;
# the last column is the target.
# NOTE(review): rows are not shuffled before splitting.
train_x, train_y = datas[:-idx, :-1], datas[:-idx, -1]
test_x, test_y = datas[-idx:, :-1], datas[-idx:, -1]

tuple(part.shape for part in (train_x, train_y, test_x, test_y))

((6698, 4), (6698,), (2870, 4), (2870,))

In [43]:
# Fit a linear regression on all four features of the training split.
model = LinearRegression()
model.fit(train_x, train_y)

In [46]:
# Mean absolute error on the held-out test rows, rounded to two decimals.
mae = mean_absolute_error(test_y, model.predict(test_x))
mae = np.round(mae, 2)
mae

3.62

#### 결정계수가 가장 높은 AT 빼고 모델 생성

In [51]:
# Rebuild the split using only columns 1-3 (V, AP, RH), i.e. without AT.
train_x, test_x = datas[:-idx, [1, 2, 3]], datas[-idx:, [1, 2, 3]]
train_y, test_y = datas[:-idx, -1], datas[-idx:, -1]

# Fit on the reduced feature set and score it with MAE on the test rows.
model = LinearRegression()
model.fit(train_x, train_y)
mae = mean_absolute_error(test_y, model.predict(test_x))
mae = np.round(mae, 2)
mae

5.98

#### 결정계수가 가장 작은 RH 빼고 모델 생성

In [52]:
# Rebuild the split using only columns 0-2 (AT, V, AP), i.e. without RH.
train_x = datas[:-idx, [0, 1, 2]]
train_y = datas[:-idx, -1]
test_x = datas[-idx:, [0, 1, 2]]
test_y = datas[-idx:, -1]

# Fit on the reduced feature set and score it with MAE on the test rows,
# rounded to two decimals.
model = LinearRegression().fit(train_x, train_y)
mae = np.round(mean_absolute_error(test_y, model.predict(test_x)), 2)
# Display the score (the original cell referenced the undefined name `maess`).
mae

3.88

In [53]:
# Print the correlation coefficients between columns

In [56]:
# List the available feature names.
features.keys()

dict_keys(['AT', 'V', 'AP', 'RH'])

In [60]:
# Correlation of AT with each of the other features, in the order V, AP, RH.
tuple(
    np.corrcoef(features["AT"], features[other])[0, 1]
    for other in ("V", "AP", "RH")
)

(0.8441067318678673, -0.5075493390955095, -0.5425346521044598)