In [1]:
import os
import datetime, time
import multiprocessing
from multiprocessing import cpu_count, Pool
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from pathlib import Path
from textwrap import dedent

import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import joblib # 모델을 저장하고 불러오는 역할
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import TensorDataset, Dataset, DataLoader

from utils import data_loader_v2
from utils import data_loader_v3

submit = True
model_name = "lstm"
# RandomForestClassifier, XGBClassifier, lstm
train_window = 198
epochs = 100
batch_size = 5
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Set Path

In [2]:
train_folder = '../data/train/'
test_folder = '../data/test/'
train_label_path = '../data/train_label.csv'
model_path = '../model/'+model_name+'_model.pkl'
submission_folder = '../submission/'

## Load Files

In [3]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [4]:
class MyDataset(Dataset):
    def __init__(self, files, folder='', train_label=None, event_time=10, nrows=60):
        self.files = files
        self.folder = folder
        self.train_label = train_label
        self.event_time = event_time
        self.nrows = nrows

    def __getitem__(self, idx):
        func_fixed = partial(
            data_loader_v3, 
            folder=self.folder, 
            train_label=self.train_label, 
            event_time=self.event_time, 
            nrows=self.nrows) 
        if __name__ == '__main__':
            pool = Pool(processes=multiprocessing.cpu_count())
            ts_list = list(pool.imap(func_fixed, self.files))
            pool.close()
            pool.join()
        data, label = [], []
        for d in ts_list:
            data.append(d[0].to(device))
            label.append(d[0].to(device))
        data = torch.cat(data)
        if type(label[0]) != type(None):
            label = torch.cat(label)
        print("data.size()", data.size())
        return data, label

    def __len__(self):
        return len(self.files)

In [5]:
trainset = MyDataset(
    files=train_list, 
    folder=train_folder, 
    train_label=train_label, 
    event_time=15, 
    nrows=25)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)

testset = MyDataset(
    files=test_list, 
    folder=test_folder, 
    event_time=10, 
    nrows=60)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=True)

### Model

In [6]:
class LSTM(nn.Module):
    def __init__(self, input_size=44, hidden_layer_size=100, output_size=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size),
                            torch.zeros(1,1,self.hidden_layer_size))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions[-1]

In [7]:
model = LSTM()
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
model.to(device)
print(model)

LSTM(
  (lstm): LSTM(44, 100)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)


## Train

In [8]:
%%time

now = datetime.datetime.now()

for i in range(epochs):
    for tr, label in trainloader:
        optimizer.zero_grad()
        model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size),
                        torch.zeros(1, 1, model.hidden_layer_size))
        
        y_pred = model(tr)

        single_loss = loss_function(y_pred, label)
        single_loss.backward()
        optimizer.step()

    if i%10 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')
print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')
print("train time:" + str(datetime.datetime.now()-now))

joblib.dump(model, model_path)

data.size() torch.Size([100, 5121])
data.size() torch.Size([100, 5121])
data.size() torch.Size([100, 5121])
data.size() torch.Size([100, 5121])
data.size() torch.Size([100, 5121])


RuntimeError: input.size(-1) must be equal to input_size. Expected 44, got 512100

## Prediction

In [60]:
model = joblib.load(model_path) 
if submit:
    pred = model.predict_proba(test)
    
    
    
else:
    pred = model.predict_proba(X_train)
    accuracy_score(X_train, pred)

FileNotFoundError: [Errno 2] No such file or directory: '../model/lstm_model.pkl'

## Submission

In [None]:
def return_now():
    now = time.localtime()
    return "%04d-%02d-%02d %02d-%02d-%02d" % (now.tm_year, now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec)

In [None]:
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('../submission_'+model_name+'_'+return_now()+'.csv', index=True) #제출 파일 만들기