In [1]:
import os
import datetime, time
import multiprocessing
from multiprocessing import cpu_count, Pool
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
import json
import logging
import logging.config
import random

import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import joblib # 모델을 저장하고 불러오는 역할
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import TensorDataset, Dataset, DataLoader
from torch.autograd import Variable

from utils import data_loader_v2
from utils import data_loader_v3
from utils import data_loader_v4
from logger.logger import LoggerAdapter

In [2]:
# --- Run configuration -------------------------------------------------------
submit = True
model_name = "lstm"  # options noted by the author: RandomForestClassifier, XGBClassifier, lstm
epochs = 30
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed every RNG this notebook draws from (Python, NumPy, PyTorch) so that a
# Restart & Run All gives comparable results.
# NOTE(review): worker processes started elsewhere may keep their own RNG state,
# so this does not guarantee bit-identical runs.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# --- Logging -----------------------------------------------------------------
# The logging layout is read from a JSON dictConfig file next to the notebook.
with open('./logger/logging.json', 'rt') as f:
    config = json.load(f)
logging.config.dictConfig(config)
# Wrap the root logger ("" is the root logger's name) in the project's adapter.
logger = logging.getLogger("")
logger = LoggerAdapter("", logger)

# torch.set_default_tensor_type('torch.cuda.FloatTensor')

## Set Paths

In [3]:
# Input locations, relative to the notebook's working directory.
train_folder = '../data/train_all/'
test_folder = '../data/test/'
train_label_path = '../data/train_label.csv'

# Output locations; the model file name is derived from `model_name`.
model_path = f'../model/{model_name}_model.pkl'
submission_folder = '../submission/'

## Load Files

In [4]:
# os.listdir returns entries in arbitrary order; sort them so that the file
# order (and therefore which files form an item) is the same on every run.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))
# The first CSV column is used as the label frame's index.
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
class MyDataset(Dataset):
    """Dataset whose items are built by running ``data_loader_v4`` over a group of files.

    When ``label`` is given, item ``idx`` covers ``files_num`` consecutive
    entries of ``files`` starting at ``idx``. When ``label`` is ``None`` every
    item covers all of ``files``.

    Args:
        files: Sequence of file names, handed to ``data_loader_v4`` one at a time.
        files_num: Number of consecutive files combined into one labelled item.
        folder, label, event_time, nrows, sclice, rand_row: Forwarded unchanged
            to ``data_loader_v4`` as keyword arguments.
    """

    def __init__(self, files, files_num=1, folder='', label=None, event_time=10, nrows=60, sclice=1, rand_row=False):
        self.files = files
        self.files_num = files_num
        self.folder = folder
        self.label = label
        self.event_time = event_time
        self.nrows = nrows
        self.sclice = sclice
        self.rand_row = rand_row

    def _select_files(self, idx):
        """Return the file names that make up item ``idx``."""
        if self.label is None:
            # Unlabelled (test) mode: every item is built from the full file list.
            return self.files
        # Last start position that still yields a full window of `files_num` files.
        last_start = max(len(self.files) - self.files_num, 0)
        if idx > last_start:
            # The window would run past the end: draw a start that always fits,
            # instead of any index (which could again produce a short window).
            idx = np.random.randint(last_start + 1)
        return self.files[idx:idx + self.files_num]

    def __getitem__(self, idx):
        """Load the files of item ``idx`` and concatenate their tensors.

        Returns:
            ``(data, labels)`` where ``data`` is ``torch.cat`` over element 0 of
            every loader result (moved to the module-level ``device``).
            ``labels`` is ``torch.cat`` over element 1 when every result has a
            label; otherwise it is a list in which missing labels are ``[-1]``.
        """
        selected = self._select_files(idx)
        load_one = partial(
            data_loader_v4,
            folder=self.folder,
            label=self.label,
            event_time=self.event_time,
            nrows=self.nrows,
            sclice=self.sclice,
            rand_row=self.rand_row)

        if __name__ == '__main__':
            # Load the files in parallel; the context manager releases the
            # worker processes even if a loader call raises.
            with Pool(processes=multiprocessing.cpu_count()) as pool:
                ts_list = list(pool.imap(load_one, selected))
        else:
            # Outside the main module no pool is started; load the files one by
            # one so the result list is always defined.
            ts_list = [load_one(file_name) for file_name in selected]

        data_list, label_list = [], []
        for d in ts_list:
            data_list.append(d[0].to(device))
            label_list.append(d[1].to(device) if d[1] is not None else [-1,])
        data_list = torch.cat(data_list)
        # Only concatenate labels when every file actually produced one.
        if all(d[1] is not None for d in ts_list):
            label_list = torch.cat(label_list)
        return data_list, label_list

    def __len__(self):
        """Number of file names held by the dataset."""
        return len(self.files)

In [6]:
# Training items: each item combines 10 consecutive training files.
trainset = MyDataset(
    files=train_list,
    files_num=10,
    folder=train_folder,
    label=train_label,
    event_time=15,
    nrows=600,
    sclice=100,
    rand_row=True)
# batch_size=1 because a single dataset item is already a concatenated group
# of files; shuffling picks a random group each time an iterator is created.
trainloader = DataLoader(trainset, batch_size=1, shuffle=True)

# Test items: no label is passed, so the dataset runs in unlabelled mode.
testset = MyDataset(
    files=test_list,
    folder=test_folder,
    event_time=10,
    nrows=60)
# Inference must see the data in a fixed order, so the test loader is not shuffled.
testloader = DataLoader(testset, batch_size=1, shuffle=False)

### Model

In [19]:
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM followed by a per-step linear layer.

    The recurrent state is kept on the module (``hidden_cell``) and carried
    from one ``forward`` call to the next, but always detached from the
    autograd graph so each call can be back-propagated independently.

    Args:
        input_size: Number of features per time step.
        hidden_layer_size: Hidden units per LSTM direction.
        output_size: Number of output scores per time step.
    """

    def __init__(self, input_size=5121, hidden_layer_size=100, output_size=198):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size, bidirectional=True)

        # Both directions are concatenated, hence hidden_layer_size * 2 inputs.
        self.linear = nn.Linear(hidden_layer_size * 2, output_size)

        self.inithidden()

    def forward(self, x):
        """Score every row of ``x``.

        Args:
            x: Tensor whose first dimension is the sequence length; it is
                viewed as ``(len(x), 1, -1)``, i.e. a batch of one sequence.

        Returns:
            Tensor of shape ``(len(x), output_size)``.
        """
        self.lstm.flatten_parameters()
        # Follow the input's device and dtype instead of relying on a global device.
        state = tuple(s.to(device=x.device, dtype=x.dtype) for s in self.hidden_cell)
        x, state = self.lstm(x.view(len(x), 1, -1), state)
        # Keep the values for the next call but cut the graph: otherwise every
        # backward pass would reach into all previous calls and need
        # retain_graph=True.
        self.hidden_cell = tuple(s.detach() for s in state)
        predictions = self.linear(x.view(len(x), -1))
        return predictions

    def inithidden(self):
        """Reset the carried state to fresh random ``(h, c)`` tensors.

        Each tensor has shape ``(2, 1, hidden_layer_size)``: two directions and
        a batch of one. ``self.h`` and ``self.c`` are kept as aliases for code
        that reads them.
        """
        shape = (2, 1, self.hidden_layer_size)
        self.hidden_cell = (torch.randn(*shape), torch.randn(*shape))
        self.h, self.c = self.hidden_cell

In [20]:
# Move the model to its device first, then build the optimizer on the moved
# parameters (the order recommended by the torch.optim documentation).
model = LSTM().to(device).float()
loss_function = nn.CrossEntropyLoss().to(device)
# NOTE(review): the training output recorded in this notebook shows the loss
# rising over the first epochs; lr=0.05 may be too high for Adam - confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
print(model)

LSTM(
  (lstm): LSTM(5121, 100, bidirectional=True)
  (linear): Linear(in_features=200, out_features=198, bias=True)
)


## Train

In [21]:
%%time

now = datetime.datetime.now()

for i in range(epochs):
    totalloss = 0
    train, label = next(iter(trainloader))
#     logger.info(train.size()) # torch.Size([1, 295, 5121])
#     logger.info(label.size()) # torch.Size([1, 295, 1])
    train = torch.squeeze(train) # torch.Size([295, 5121])
    label = torch.squeeze(label) # torch.Size([295])
    optimizer.zero_grad()
#     model.inithidden()
    pred = model(train)
#     logger.info(pred.size()) # torch.Size([59, 198])
    loss = loss_function(pred, label)
    totalloss += loss
    loss.backward(retain_graph=True)
    optimizer.step()

    print(f'epoch: {i:3} loss: {totalloss:10.8f}')

joblib.dump(model, model_path)

2020-02-12 20:13:23,511 - root - INFO - [] torch.Size([57, 1, 200])
2020-02-12 20:13:23,511 - root - INFO - [] torch.Size([57, 200])
epoch:   0 loss: 5.30244207
2020-02-12 20:13:27,255 - root - INFO - [] torch.Size([60, 1, 200])
2020-02-12 20:13:27,256 - root - INFO - [] torch.Size([60, 200])
epoch:   1 loss: 8.29773140
2020-02-12 20:13:30,852 - root - INFO - [] torch.Size([60, 1, 200])
2020-02-12 20:13:30,852 - root - INFO - [] torch.Size([60, 200])
epoch:   2 loss: 8.08042431
2020-02-12 20:13:34,648 - root - INFO - [] torch.Size([58, 1, 200])
2020-02-12 20:13:34,648 - root - INFO - [] torch.Size([58, 200])
epoch:   3 loss: 9.45738029
2020-02-12 20:13:38,574 - root - INFO - [] torch.Size([58, 1, 200])
2020-02-12 20:13:38,575 - root - INFO - [] torch.Size([58, 200])
epoch:   4 loss: 12.62230873
2020-02-12 20:13:42,440 - root - INFO - [] torch.Size([59, 1, 200])
2020-02-12 20:13:42,441 - root - INFO - [] torch.Size([59, 200])
epoch:   5 loss: 10.94775772
2020-02-12 20:13:46,245 - root -

['../model/lstm_model.pkl']

## Prediction

In [19]:
%%time

model = joblib.load(model_path) 
test, _ = next(iter(testloader))

test = torch.squeeze(test) # torch.Size([295, 5121])
pred_list = []
# for te in test:
optimizer.zero_grad()
pred = model(test)
pred_list.append(pred)
    
pred = torch.cat(pred_list)
_,ans = torch.max(pred,dim=1)

soft = F.softmax(pred)
# logger.info(pred.size())
# logger.info(pred)
# logger.info(ans.size())
# logger.info(ans)

# "%%time
# model = joblib.load(model_path) 

# from sklearn.metrics import classification_report
# correct = 0
# incorrect = 0
# rnn.eval()
# y_test = []
# prediction = []

# for batch in test_iter:
#     txt = batch.text
#     label = batch.label
#     y_test.append(label.data[0])

#     pred = rnn(txt)
#     _,ans = torch.max(pred,dim=1)
#     prediction.append(ans.data[0])
    
#     if ans.data[0] == label.data[0]:
#         correct += 1    
#     else:
#         incorrect += 1
    
# print ('correct : ', correct)
# print ('incorrect : ', incorrect)
# print(classification_report(torch.tensor(y_test), 
#                             torch.tensor(prediction), 
#                             digits=4, 
#                             target_names=['negative', 'positive']))

2020-02-12 18:40:46,002 - root - INFO - [] torch.Size([500, 1, 5121])
2020-02-12 18:40:46,244 - root - INFO - [] torch.Size([500, 198])
Wall time: 3.14 s


  from ipykernel import kernelapp as app


In [20]:
soft = soft.to('cpu')

In [23]:
soft

tensor([[0.0003, 0.0002, 0.0029,  ..., 0.0002, 0.0046, 0.0003],
        [0.0003, 0.0002, 0.0029,  ..., 0.0002, 0.0046, 0.0003],
        [0.0003, 0.0002, 0.0029,  ..., 0.0002, 0.0046, 0.0003],
        ...,
        [0.0003, 0.0002, 0.0029,  ..., 0.0002, 0.0046, 0.0003],
        [0.0003, 0.0002, 0.0029,  ..., 0.0002, 0.0046, 0.0003],
        [0.0003, 0.0002, 0.0029,  ..., 0.0002, 0.0046, 0.0003]],
       grad_fn=<CopyBackwards>)

In [None]:
# pandas expects array-like data: detach the tensor from autograd and copy it
# to host memory as a NumPy array before building the frame.
df = pd.DataFrame(data=pred.detach().cpu().numpy())
df.to_csv('../test_'+model_name+'.csv', index=True)

## Submission

In [None]:
def return_now():
    """Return the current local time as ``YYYY-MM-DD HH-MM-SS``.

    Hyphens are used in the time part as well, so no colon appears in the string.
    """
    t = time.localtime()
    date_part = f"{t.tm_year:04d}-{t.tm_mon:02d}-{t.tm_mday:02d}"
    clock_part = f"{t.tm_hour:02d}-{t.tm_min:02d}-{t.tm_sec:02d}"
    return date_part + " " + clock_part

In [None]:
# Build the submission table from the model output and write it to a
# timestamped CSV.
# NOTE(review): `pred` is a torch tensor produced by the prediction cell;
# passing it straight to pd.DataFrame may not give a numeric frame - confirm
# whether it should be converted to a NumPy array first.
submission = pd.DataFrame(data=pred)
# NOTE(review): `test` is assigned from torch.squeeze(...) in the prediction
# cell, so it is a tensor here, not a DataFrame; it presumably does not carry
# a row index of ids, and this line looks like it will fail. The real id
# source needs to be confirmed.
submission.index = test.index
submission.index.name = 'id'
submission = submission.sort_index()
# Rows that share an id are averaged into a single row per id.
submission = submission.groupby('id').mean()
# NOTE(review): `submission_folder` is defined earlier in the notebook but the
# file is written directly under '../' - confirm the intended location.
submission.to_csv('../submission_'+model_name+'_'+return_now()+'.csv', index=True) # write the submission file