<a href="https://colab.research.google.com/github/tinycaterpillar/Ai-project/blob/HwangJinIk/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### 베이스라인은 참고용 코드입니다. 여러분만의 고도화된 모델을 만들어 주세요.

## Hyperparameter Setting

In [57]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tunable hyperparameters.
hyperparam = dict(
    interval=5,  # number of consecutive days that make up one input window
)
# Fixed problem constants.
constant = dict(
    day_to_predict=2,  # number of days that have to be forecast
    total_day=200,
)

## Colab drive mount

In [58]:
# Mount Google Drive at /content/drive (Colab-only module); the dataset path
# used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 데이터 불러오기

In [59]:
# Directory (with trailing slash) that holds the competition CSV files.
Path = "/content/drive/MyDrive/Colab Notebooks/2022 Inha challenge/data/"
# Read the training table and the sample submission from that directory.
train_data, sample_submission = (
    pd.read_csv(Path + csv_name)
    for csv_name in ("train_data.csv", "sample_submission.csv")
)


## 데이터 확인

In [60]:
# Show the container type and the (rows, columns) shape of the raw table,
# separated by one blank line.
print(
    "train_data type",
    type(train_data),
    "",
    "train_data shape",
    train_data.shape,
    sep="\n",
)

train_data type
<class 'pandas.core.frame.DataFrame'>

train_data shape
(3859200, 13)


## 변수 의미
1. TurbID - 발전기 ID
2. Day - 날짜
3. Tmstamp - 시간(time stamp)
4. Wspd - 풍속
5. Wdir - 터빈이 바라보는 각도와 실제 바람 방향 각도 차이
6. Etmp - 외부 온도
7. Itmp - 터빈 내부 온도
8. Ndir - 터빈이 바라보는 방향 각도
9. Pab1, Pab2, Pab3 - 터빈당 3개의 날(blade)이 있으며, 각 날의 각도를 나타냄 (날마다 각도가 다름)
10. Prtv - 무효전력 : 에너지원을 필요로 하지 않는 전력
11. Patv - 유효전력 : 실제로 터빈을 돌리는 일을 하는 전력

In [61]:
# Preview the first 10 rows of the raw table.
train_data.head(10)

Unnamed: 0,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv
0,1,1,00:00,,,,,,,,,,
1,1,1,00:10,6.17,-3.99,30.73,41.8,25.92,1.0,1.0,1.0,-0.25,494.66
2,1,1,00:20,6.27,-2.18,30.6,41.63,20.91,1.0,1.0,1.0,-0.24,509.76
3,1,1,00:30,6.42,-0.73,30.52,41.52,20.91,1.0,1.0,1.0,-0.26,542.53
4,1,1,00:40,6.25,0.89,30.49,41.38,20.91,1.0,1.0,1.0,-0.23,509.36
5,1,1,00:50,6.1,-1.03,30.47,41.22,20.91,1.0,1.0,1.0,-0.27,482.21
6,1,1,01:00,6.77,1.07,30.31,41.19,20.91,1.0,1.0,1.0,-0.23,584.75
7,1,1,01:10,6.7,-2.8,30.24,41.0,20.91,1.0,1.0,1.0,-0.23,557.98
8,1,1,01:20,6.44,-3.46,30.13,40.91,20.91,1.0,1.0,1.0,-0.21,503.94
9,1,1,01:30,6.25,-3.15,29.97,40.72,20.91,1.0,1.0,1.0,-0.26,463.37


In [62]:
# Preview the last 10 rows of the raw table.
train_data.tail(10)

Unnamed: 0,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv
3859190,134,200,22:20,4.78,-1.14,9.53,11.13,163.8,0.01,0.01,0.01,-93.81,280.02
3859191,134,200,22:30,4.73,2.49,9.33,10.84,171.85,0.01,0.01,0.01,-100.28,287.8
3859192,134,200,22:40,4.52,-1.1,9.03,10.63,167.25,0.01,0.01,0.01,-85.8,237.5
3859193,134,200,22:50,4.28,2.55,8.93,10.41,164.55,0.01,0.01,0.01,-82.08,228.25
3859194,134,200,23:00,5.13,-0.35,8.86,10.33,171.95,0.01,0.01,0.01,-111.64,318.4
3859195,134,200,23:10,4.86,4.68,9.04,10.4,174.39,0.01,0.01,0.01,-108.13,308.65
3859196,134,200,23:20,4.46,-4.0,8.79,10.33,179.76,0.01,0.01,0.01,-93.64,221.6
3859197,134,200,23:30,4.23,-0.08,8.74,10.22,172.35,0.01,0.01,0.01,-111.2,220.6
3859198,134,200,23:40,4.36,-6.12,8.64,10.1,172.35,0.01,0.01,0.01,-108.04,222.15
3859199,134,200,23:50,4.79,-3.65,8.53,10.01,168.58,0.01,0.01,0.01,-119.76,264.17


## 데이터 전처리
1.   결측값(missing value) 채우기 : https://rfriend.tistory.com/262
2.   기타 코드 : https://wikidocs.net/154050



In [63]:
# Replace every missing value with 0.
# NOTE(review): after this step a missing reading cannot be told apart from a
# genuine 0 measurement -- confirm zero-filling is intended for all columns.
train_data = train_data.fillna(0)

# Collapse the three per-blade columns into a single feature: the row-wise
# median of Pab1, Pab2 and Pab3.
pab_columns = ['Pab1', 'Pab2', 'Pab3']
train_data['Pab'] = train_data[pab_columns].median(axis=1)
# Remove the individual blade columns now that 'Pab' replaces them. They are
# named explicitly and dropped without in-place mutation.
train_data = train_data.drop(columns=pab_columns)
# Reorder the columns (not strictly required). Plain column selection is used
# so that a misspelled name raises KeyError instead of silently producing an
# all-NaN column, which is what reindex would do.
train_data = train_data[['TurbID', 'Day', 'Tmstamp', 'Wspd', 'Wdir', 'Etmp', 'Itmp', 'Ndir', 'Pab', 'Prtv', 'Patv']]
train_data.head(10)

Unnamed: 0,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab,Prtv,Patv
0,1,1,00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,00:10,6.17,-3.99,30.73,41.8,25.92,1.0,-0.25,494.66
2,1,1,00:20,6.27,-2.18,30.6,41.63,20.91,1.0,-0.24,509.76
3,1,1,00:30,6.42,-0.73,30.52,41.52,20.91,1.0,-0.26,542.53
4,1,1,00:40,6.25,0.89,30.49,41.38,20.91,1.0,-0.23,509.36
5,1,1,00:50,6.1,-1.03,30.47,41.22,20.91,1.0,-0.27,482.21
6,1,1,01:00,6.77,1.07,30.31,41.19,20.91,1.0,-0.23,584.75
7,1,1,01:10,6.7,-2.8,30.24,41.0,20.91,1.0,-0.23,557.98
8,1,1,01:20,6.44,-3.46,30.13,40.91,20.91,1.0,-0.21,503.94
9,1,1,01:30,6.25,-3.15,29.97,40.72,20.91,1.0,-0.26,463.37


## make_train_data
모델에 넣을 'input data' 와 '정답 data' 를 나눔  
'input data'는 5일간의 데이터 값(말하자면 1,2,3,4,5 일의 데이터)  
'정답 data'는 5일 이후 2일간의 target data 값(말하자면 6, 7일의 target data)  
이때 5일은 hyperparameter, 2일은 상수(최종적으로 예측하고 싶은 날짜의 수로 고정)  
TurbID 는 구분하지 않고 모든 터빈에 대해 동일하게 적용함 -> 터빈을 구분해서 모델을 따로따로 만들자는 상혁이의 의견이 있었음

In [64]:
def make_train_data(data, interval=None, day_to_predict=None, total_day=None):
    """Build sliding-window (input, label) samples for every turbine.

    For each turbine and each start day ``j`` the input sample holds every
    row whose ``Day`` is in ``j .. j + interval - 1`` (all columns except
    ``TurbID``, ``Day`` and ``Tmstamp``), and the label holds the ``Patv``
    values of the rows belonging to the following ``day_to_predict`` days.
    Turbines are visited in ascending ``TurbID`` order and their samples are
    appended to one shared list, i.e. turbines are not kept apart.

    Args:
        data: DataFrame containing at least the columns ``TurbID``, ``Day``,
            ``Tmstamp`` and ``Patv``. Start days are enumerated from 1.
        interval: Number of days per input window. Defaults to
            ``hyperparam['interval']``.
        day_to_predict: Number of days per label window. Defaults to
            ``constant['day_to_predict']``.
        total_day: Last day number present in ``data``. Defaults to
            ``constant['total_day']``.

    Returns:
        A tuple ``(train_x, train_y)`` of two equally long lists of NumPy
        arrays: ``train_x[i]`` is the 2-D feature matrix of window ``i`` and
        ``train_y[i]`` the 1-D ``Patv`` vector of the days that follow it.
    """
    # Fall back to the notebook-level settings unless explicitly overridden.
    if interval is None:
        interval = hyperparam['interval']
    if day_to_predict is None:
        day_to_predict = constant['day_to_predict']
    if total_day is None:
        total_day = constant['total_day']

    # Last start day for which input window + label window still end on or
    # before total_day.
    last_start = total_day - interval - day_to_predict + 1

    train_x, train_y = [], []
    for turb_id in tqdm(sorted(pd.unique(data["TurbID"]))):
        turb_rows = data[data["TurbID"] == turb_id]

        # Loop-invariant work, done once per turbine instead of once per
        # window: strip the identifier columns and convert to NumPy.
        days = turb_rows["Day"]
        features = np.array(turb_rows.drop(["TurbID", "Day", "Tmstamp"], axis=1))
        target = np.array(turb_rows["Patv"])

        for start in range(1, last_start + 1):
            label_start = start + interval
            # Boolean row masks for the input days and for the label days.
            input_mask = days.isin(list(range(start, label_start))).to_numpy()
            label_mask = days.isin(
                list(range(label_start, label_start + day_to_predict))
            ).to_numpy()

            # Boolean indexing copies, so every sample owns its data.
            train_x.append(features[input_mask])
            train_y.append(target[label_mask])

    return train_x, train_y

In [65]:
# Build the sliding-window samples from the preprocessed table
# (one progress-bar step per turbine).
train_x, train_y = make_train_data(train_data)

100%|██████████| 134/134 [00:42<00:00,  3.13it/s]


In [66]:
# Shapes of the first input window and of its label vector, one per line.
print(train_x[0].shape, train_y[0].shape, sep="\n")

(720, 8)
(288,)


In [67]:
# Turn the per-window lists into dense arrays: train_x becomes
# (n_windows, rows_per_window, n_features), train_y (n_windows, n_labels).
# np.stack requires every window to have the same shape and raises a
# ValueError otherwise (or when the list is empty), so a turbine with
# missing rows is reported here instead of surfacing as a confusing
# reshape/object-array error.
train_x = np.stack(train_x)
train_y = np.stack(train_y)

# Validation set 나누기

In [68]:
from sklearn.model_selection import train_test_split

# train size : validation size = 4 : 1, random_state (seed) = 100
# NOTE(review): `train_data` held the preprocessed DataFrame until now and is
# rebound to the training-input array here; re-running earlier cells after
# this one will operate on the wrong object.
# NOTE(review): windows built from consecutive start days share most of their
# days, so a random split presumably puts near-duplicate windows on both
# sides -- consider a split by time or by turbine; confirm.
train_data, validation_data, train_labels, validation_labels = train_test_split(train_x, train_y, test_size=0.2, random_state=100)

In [69]:
# Shapes of the four split arrays, one per line: training inputs, training
# labels, validation inputs, validation labels.
print(
    *(
        part.shape
        for part in (train_data, train_labels, validation_data, validation_labels)
    ),
    sep="\n",
)

(20796, 720, 8)
(20796, 288)
(5200, 720, 8)
(5200, 288)


# data 저장

In [70]:
# Persist the four split arrays into the data directory given by `Path`.
# NOTE(review): the file names carry no extension; np.save presumably
# appends ".npy" -- confirm before loading them elsewhere.
np.save(Path+"train_data", train_data)
np.save(Path+"train_labels", train_labels)
np.save(Path+"validation_data", validation_data)
np.save(Path+"validation_labels", validation_labels)