In [1]:
import os
import datetime, time
import multiprocessing
from multiprocessing import cpu_count, Pool
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
import json
import logging
import logging.config
import random

import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import joblib # 모델을 저장하고 불러오는 역할
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import TensorDataset, Dataset, DataLoader

from utils import data_loader_v2
from utils import data_loader_v3
from utils import data_loader_v4
from logger.logger import LoggerAdapter

In [2]:
submit = True
model_name = "lstm" # RandomForestClassifier, XGBClassifier, lstm
train_window = 198
epochs = 100
batch_size = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

with open('./logger/logging.json', 'rt') as f:
    config = json.load(f)
logging.config.dictConfig(config)
logger = logging.getLogger("")
logger = LoggerAdapter("default", logger)

torch.set_default_tensor_type('torch.cuda.FloatTensor')

## Set Path

In [3]:
train_folder = '../data/train_all/'
test_folder = '../data/test_all/'
train_label_path = '../data/train_label.csv'
model_path = '../model/'+model_name+'_model.pkl'
submission_folder = '../submission/'

## Load Files

In [4]:
train_list = os.listdir(train_folder)
test_list = os.listdir(test_folder)
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
class MyDataset(Dataset):
    def __init__(self, files, folder='', train_label=None, event_time=10, nrows=60, batch=1):
        self.files = files
        self.folder = folder
        self.train_label = train_label
        self.event_time = event_time
        self.nrows = nrows
        self.batch = batch

    def __getitem__(self, idx):
        idx = random.randint(0,len(self.files)-1) if idx+batch_size > len(self.files) else idx
        getitem_logger = LoggerAdapter("__getitem__", logger)
        getitem_logger.info("Start")
        func_fixed = partial(
            data_loader_v4, 
            folder=self.folder, 
            train_label=self.train_label, 
            event_time=self.event_time, 
            nrows=self.nrows,
            batch=self.batch) 
        if __name__ == '__main__':
            pool = Pool(processes=multiprocessing.cpu_count())
            ts_list = list(pool.imap(func_fixed, self.files[idx:idx+batch_size]))
            pool.close()
            pool.join()
#         ts_list = data_loader_v4(
#             file_name=self.files[idx],
#             folder=self.folder, 
#             train_label=self.train_label, 
#             event_time=self.event_time, 
#             nrows=self.nrows,
#             batch=self.batch)
        getitem_logger.info("Data load comp")
        
#         ts_list[0] = ts_list[0].to(device)
#         ts_list[1] = ts_list[1].to(device)
#         getitem_logger.info("Move data to gpu comp")

        data = torch.cat(data)
        getitem_logger.info(data.size())
        label = torch.cat(label) if type(label[0]) != type(None) else None
        getitem_logger.info("Tensor concat comp")
        return ts_list[0], ts_list[1]

    def __len__(self):
        return len(self.files)

In [6]:
trainset = MyDataset(
    files=train_list, 
    folder=train_folder, 
    train_label=train_label, 
    event_time=15, 
    nrows=600,
    batch=10)
trainloader = DataLoader(trainset, batch_size=1, shuffle=True)

testset = MyDataset(
    files=test_list, 
    folder=test_folder, 
    event_time=10, 
    nrows=60)
testloader = DataLoader(testset, batch_size=1, shuffle=True)

### Model

In [7]:
class LSTM(nn.Module):
    def __init__(self, input_size=5121, hidden_layer_size=768, output_size=198):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size),
                            torch.zeros(1,1,self.hidden_layer_size))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions[-1]

In [8]:
model = LSTM()
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# model.to(device)
print(model)

LSTM(
  (lstm): LSTM(5121, 768)
  (linear): Linear(in_features=768, out_features=198, bias=True)
)


## Train

In [9]:
%%time

now = datetime.datetime.now()

for i in range(epochs):
    cnt = 0
    for tr, label in trainloader:
#         tr = torch.cat([t for t in tr])
#         label = torch.cat([l for l in label])
        tr = tr.squeeze()
        label = label.squeeze()
        logger.info(tr.size())
        
        optimizer.zero_grad()
#         model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size, device=device),
#                              torch.zeros(1, 1, model.hidden_layer_size, device=device))
        model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size),
                             torch.zeros(1, 1, model.hidden_layer_size))
        
        y_pred = model(tr)

        single_loss = loss_function(y_pred, label)
        single_loss.backward()
        optimizer.step()
        
        logger.info(str(cnt)+" trainloader end")
        cnt += 1

    if i%10 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')
print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')
print("train time:" + str(datetime.datetime.now()-now))

joblib.dump(model, model_path)

2020-02-11 16:00:08,433 - root - INFO - [default] [__getitem__] Start
2020-02-11 16:00:10,851 - root - INFO - [default] [__getitem__] Data load comp


UnboundLocalError: local variable 'data' referenced before assignment

## Prediction

In [60]:
model = joblib.load(model_path) 
if submit:
    pred = model.predict_proba(test)
    
    
    
else:
    pred = model.predict_proba(X_train)
    accuracy_score(X_train, pred)

FileNotFoundError: [Errno 2] No such file or directory: '../model/lstm_model.pkl'

## Submission

In [None]:
def return_now():
    now = time.localtime()
    return "%04d-%02d-%02d %02d-%02d-%02d" % (now.tm_year, now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec)

In [None]:
submission = pd.DataFrame(data=pred)
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
submission = submission.groupby('id').mean()
submission.to_csv('../submission_'+model_name+'_'+return_now()+'.csv', index=True) #제출 파일 만들기