<a href="https://colab.research.google.com/github/sexymetra/Ai-project/blob/LeeSangHyuck/%ED%81%B4%EB%9F%AC%EC%8A%A4%ED%84%B0%EB%A7%81%20%EB%91%90%EB%B2%88%EC%A7%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### 베이스라인은 참고용 코드입니다. 여러분만의 고도화된 모델을 만들어 주세요.

## Import

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameters
CFG = {
    'EPOCHS':10,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':128,
    'SEED':41
}

# Apply the configured seed. Without this, CFG['SEED'] was never used and every
# run drew different random numbers from the NumPy and PyTorch global generators.
np.random.seed(CFG['SEED'])
torch.manual_seed(CFG['SEED'])


### 데이터 불러오기

In [None]:
# Mount Google Drive so the CSV files stored there are reachable from the Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

# Training table and the submission template that will later be filled with predictions.
train_data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/data/data/train_data.csv",
)
sample_submission = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/data/data/sample_submission.csv",
)

Mounted at /content/drive


### 데이터 전처리

In [None]:
from sklearn.cluster import KMeans

# --- Missing values ---
# Interpolate linearly within each turbine's own rows. Interpolating the whole
# frame at once (as before) filled gaps at a turbine boundary from the
# neighbouring turbine's measurements. Only numeric measurement columns are
# interpolated; the identifier columns TurbID and Day are left untouched.
value_cols = [
    col for col in train_data.select_dtypes(include='number').columns
    if col not in ('TurbID', 'Day')
]
train_data[value_cols] = train_data.groupby('TurbID')[value_cols].transform(
    lambda series: series.interpolate(method='linear', limit_direction='forward')
)
# Anything still missing (e.g. gaps before a turbine's first valid value) becomes 0.
train_data = train_data.fillna(0)

# --- Wind clustering ---
# Cluster the columns from Wspd through Wdir into 12 groups. The seed is fixed
# because the cluster id is used as a model feature and unseeded KMeans can
# assign different ids on every run.
# NOTE(review): the columns are clustered unscaled — confirm this is intended.
cluster = train_data.loc[:,'Wspd':'Wdir']
points = cluster.values
kmeans = KMeans(n_clusters=12, random_state=CFG['SEED']).fit(points)
# Cluster stays a single integer-coded column (no one-hot encoding).
train_data['Cluster'] = kmeans.labels_

# --- Pab1..Pab3 ---
# Collapse the Pab1..Pab3 columns into their row-wise median, then drop the originals.
pab_cols = list(train_data.loc[:,'Pab1':'Pab3'].columns)
train_data['Pab'] = train_data[pab_cols].median(axis=1)
train_data = train_data.drop(columns=pab_cols)

# Select and order the model columns. Columns not listed here (including the
# Wspd..Wdir columns used for clustering) are dropped by this step.
train_data = train_data.reindex(columns = ['TurbID','Day','Tmstamp','Cluster','Etmp','Itmp','Ndir','Pab','Prtv','Patv'])

In [None]:
def make_train_data(data, input_days=5, label_days=2):
    """Slice every turbine's history into sliding (input, label) day windows.

    For each turbine and each start day, the rows of ``input_days``
    consecutive days (all columns except ``TurbID`` and ``Day``) form one
    input sample, and the ``Patv`` values of the following ``label_days``
    days form its label.

    Args:
        data: DataFrame with at least ``TurbID``, ``Day`` and ``Patv`` columns.
        input_days: Number of consecutive days per input window (default 5).
        label_days: Number of consecutive days per label window (default 2).

    Returns:
        ``(train_x, train_y)``: two equally long lists of NumPy arrays, one
        entry per (turbine, start day) pair.
    """
    train_x, train_y = [], []
    if len(data) == 0:
        return train_x, train_y

    # Derive the day range from the data instead of assuming days 1..200.
    first_day = int(data["Day"].min())
    last_day = int(data["Day"].max())
    # Last start day for which the whole input + label window still fits.
    last_start = last_day - (input_days + label_days) + 1

    for turb_id in tqdm(sorted(pd.unique(data["TurbID"]))):
        turb_data = data[data["TurbID"] == turb_id]
        # Hoisted out of the window loop: these do not depend on the start day.
        turb_days = turb_data["Day"]
        turb_features = turb_data.drop(["TurbID", "Day"], axis = 1)
        turb_target = turb_data["Patv"]

        for start in range(first_day, last_start + 1):
            label_start = start + input_days
            input_mask = turb_days.isin(list(range(start, label_start)))
            label_mask = turb_days.isin(list(range(label_start, label_start + label_days)))

            train_x.append(np.array(turb_features[input_mask]))
            train_y.append(np.array(turb_target[label_mask]))

    return train_x, train_y

In [None]:
# Feature 중 Tmstamp 정수로 변환
tms_list = list(pd.unique(train_data["Tmstamp"]))

train_data["Tmstamp"] = train_data["Tmstamp"].apply(lambda x : tms_list.index(x) + 1)

train_x, train_y = make_train_data(train_data)

100%|██████████| 134/134 [00:47<00:00,  2.83it/s]


In [None]:
# Stack the per-window arrays into single arrays; the feature array is shaped
# (number of windows, rows per window, features per row).
train_x = np.asarray(train_x).reshape((-1,) + tuple(train_x[0].shape[:2]))
train_y = np.asarray(train_y)

In [None]:
# Sanity check: display the shapes of the stacked feature and label arrays.
train_x.shape, train_y.shape

((25996, 720, 7), (25996, 288))

### Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset over indexable features ``X`` and optional labels ``Y``.

    With labels, indexing yields a ``(features, label)`` pair of tensors;
    with ``Y=None`` (inference) it yields the feature tensor alone.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        """Number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, index):
        """Return sample ``index`` converted with ``torch.Tensor``."""
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            return features
        return features, torch.Tensor(self.Y[index])

In [None]:
# Wrap the training arrays and serve them in shuffled mini-batches.
train_dataset = CustomDataset(train_x, train_y)
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=True,
    num_workers=0,
)

### 모델 선언

In [None]:
class BaseModel(nn.Module):
    """GRU encoder followed by a two-layer MLP head.

    The GRU reads a ``(batch, time, input_size)`` sequence; the output at the
    last time step is passed through the head to produce ``output_size``
    values per sample. The head ends in a ReLU, so predictions are never
    negative.

    Args:
        input_size: Number of features per time step (default 7).
        hidden_size: GRU hidden width (default 256).
        mlp_size: Width of the head's hidden layer (default 516).
        output_size: Number of predicted values per sample (default 288).
    """

    def __init__(self, input_size=7, hidden_size=256, mlp_size=516, output_size=288):
        super().__init__()
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, batch_first=True, bidirectional=False)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, mlp_size),
            nn.ReLU(),
            nn.Linear(mlp_size, output_size),
            nn.ReLU()
        )

    def forward(self, x):
        """Return the head's output for the GRU output at the last time step."""
        hidden, _ = self.gru(x)
        # batch_first=True, so index 1 is the time axis; keep only its last step.
        output = self.classifier(hidden[:,-1,:])
        return output

### 모델 학습

In [None]:
def train(model, optimizer, train_loader, device):
    """Fit ``model`` for ``CFG['EPOCHS']`` epochs and checkpoint the best epoch.

    The model is optimized with MSE loss; MAE is computed alongside as a
    reporting metric. After every epoch the mean training loss and MAE are
    printed, and whenever the mean training MAE improves the model's
    ``state_dict`` is written to ``./best_model.pth``.

    Note that "best" is judged on the training batches themselves; no
    held-out data is used here.

    Args:
        model: Module to train; it is moved to ``device``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable yielding ``(X, Y)`` batches.
        device: Device on which to run training.
    """
    model.to(device)
    criterion = nn.MSELoss().to(device)  # objective that is back-propagated
    metric = nn.L1Loss().to(device)      # MAE: reported and used to pick the checkpoint
    # Start from infinity so the first finite epoch MAE always produces a
    # checkpoint, whatever the scale of the targets.
    best_mae = float('inf')

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()

            output = model(X)
            loss = criterion(output, Y)
            # The metric is only observed, so keep it out of the autograd graph.
            with torch.no_grad():
                mae = metric(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())
            train_mae.append(mae.item())

        epoch_loss = np.mean(train_loss)
        epoch_mae = np.mean(train_mae)
        print(f'Epoch : [{epoch}] Train Loss : [{epoch_loss:.5f}] Train MAE : [{epoch_mae:.5f}]')

        if best_mae > epoch_mae:
            best_mae = epoch_mae
            torch.save(model.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
            print('Model Saved.')

In [None]:
# Build a fresh model and an Adam optimizer, then fit. train() switches the
# model to training mode at the start of every epoch, so no mode is set here.
model = BaseModel()
optimizer = optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
train(model, optimizer, train_loader, device)

100%|██████████| 204/204 [00:16<00:00, 12.45it/s]


Epoch : [1] Train Loss : [197894.37592] Train MAE : [330.32183]
Model Saved.


100%|██████████| 204/204 [00:15<00:00, 12.93it/s]


Epoch : [2] Train Loss : [166940.74931] Train MAE : [314.25334]
Model Saved.


100%|██████████| 204/204 [00:15<00:00, 12.77it/s]


Epoch : [3] Train Loss : [152314.75414] Train MAE : [295.97551]
Model Saved.


100%|██████████| 204/204 [00:16<00:00, 12.49it/s]


Epoch : [4] Train Loss : [140312.04833] Train MAE : [279.39560]
Model Saved.


100%|██████████| 204/204 [00:16<00:00, 12.34it/s]


Epoch : [5] Train Loss : [132562.78776] Train MAE : [269.22666]
Model Saved.


100%|██████████| 204/204 [00:16<00:00, 12.37it/s]


Epoch : [6] Train Loss : [127066.59222] Train MAE : [262.08284]
Model Saved.


100%|██████████| 204/204 [00:16<00:00, 12.38it/s]


Epoch : [7] Train Loss : [122225.73254] Train MAE : [255.38847]
Model Saved.


100%|██████████| 204/204 [00:16<00:00, 12.37it/s]


Epoch : [8] Train Loss : [117259.29488] Train MAE : [248.37426]
Model Saved.


100%|██████████| 204/204 [00:16<00:00, 12.38it/s]


Epoch : [9] Train Loss : [112015.83134] Train MAE : [240.61845]
Model Saved.


100%|██████████| 204/204 [00:16<00:00, 12.37it/s]

Epoch : [10] Train Loss : [108002.71641] Train MAE : [234.20626]
Model Saved.





### 201일, 202일 Patv 추론

In [None]:
# Days 196-200 serve as the model input for the forecast.
test_data_list = list(range(196, 201))

# Keep only those days and drop the same identifier columns as in training.
test_data = train_data[train_data["Day"].isin(test_data_list)].drop(["TurbID", "Day"], axis = 1)

# Reshape to the same (rows per window, features) layout as one training sample.
# NOTE(review): this assumes rows are ordered turbine by turbine — confirm.
test_data = np.array(test_data).reshape((-1,) + tuple(train_x[0].shape[:2]))

In [None]:
# Sanity check: display the shape of the inference input array.
test_data.shape

(134, 720, 7)

In [None]:
# No labels at inference time; keep the original order so predictions line up with the rows.
test_dataset = CustomDataset(test_data, None)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=0,
)

In [None]:
def predict(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and return all predictions.

    Args:
        model: Trained module; it is moved to ``device`` and set to eval mode.
        test_loader: Iterable yielding feature batches (no labels).
        device: Device on which to run inference.

    Returns:
        NumPy array holding the predictions of every batch, in loader order.
    """
    model.to(device)
    model.eval()
    collected = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in tqdm(iter(test_loader)):
            batch_pred = model(batch.to(device))
            collected.extend(batch_pred.cpu().tolist())

    return np.array(collected)

### Model Load

In [None]:
# Rebuild the architecture and restore the checkpoint written during training.
model = BaseModel()
# map_location lets a checkpoint saved from a GPU run load on a CPU-only runtime too.
best_checkpoint = torch.load('./best_model.pth', map_location=device)
model.load_state_dict(best_checkpoint)
model.eval()

BaseModel(
  (gru): GRU(7, 256, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=516, bias=True)
    (1): ReLU()
    (2): Linear(in_features=516, out_features=288, bias=True)
    (3): ReLU()
  )
)

In [None]:
# NOTE(review): Drive is already mounted by the data-loading cell near the top
# of the notebook, so this cell looks redundant — confirm it can be removed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Predict for every sample, then flatten to one long vector for the submission column.
preds = predict(model, test_loader, device).reshape(-1)

100%|██████████| 2/2 [00:00<00:00, 25.87it/s]


### Submit

In [None]:
# Fill the submission template with the flattened predictions and write it out.
# NOTE(review): the output name "culster.csv" looks like a typo of "cluster" — confirm before renaming.
sample_submission["Patv"] = preds
sample_submission.to_csv(path_or_buf="./culster.csv", index=False)

In [None]:
# Display the filled-in prediction column as a final visual check.
sample_submission["Patv"]

0        195.271637
1        230.793823
2        242.643936
3        256.346039
4        275.673859
            ...    
38587    285.322327
38588    279.712799
38589    278.105377
38590    289.322296
38591    292.971100
Name: Patv, Length: 38592, dtype: float64