In [2]:
# load modules and set configurations
import numpy as np
import pandas as pd

import os, copy, random, pickle, gc
from itertools import product
from tqdm import tqdm

pd.set_option('display.max_columns', None)

import torch

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

SEED = 42

# 3. Auto Encoder

In [2]:
import torch
from torch import nn, optim
import torch.nn.functional as F 
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader, TensorDataset

torch.set_default_dtype(torch.float32)

class Autoencoder(nn.Module):
    def __init__(self, input_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            torch.nn.Linear(input_size, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
        )

        self.decoder = nn.Sequential(
            torch.nn.Linear(32, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, input_size),
            torch.nn.ReLU()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
with open(f'data-dict-for_ae.pkl', 'rb') as f:
    data_dict = pickle.load(f)

In [3]:
# select data
ver = 1 # 1, 2, 3
low_esi = 1 # 0, 1, 'all'
data = data_dict[ver][low_esi]
n_var = data['trn']['X'].shape[1]

# make it as data loaders
# building data loader
data_loaders = {i:{} for i in ['trn', 'val_tr', 'val_th', 'tst']}
for i in tqdm(['trn', 'val_tr', 'val_th', 'tst']):
    tmp_X = torch.tensor(data[i]['X'])
    tmp_y = torch.tensor(data[i]['y'])
    tmp_ids = torch.tensor(data[i]['ids'])
    tmp_n_seq = torch.tensor(data[i]['n_seq'])
    
    batch_size = 256 # 256, 128, 64
    data_loaders[i] = DataLoader(dataset=TensorDataset(tmp_X, tmp_y, tmp_ids, tmp_n_seq), batch_size=batch_size if i=='trn' else tmp_X.shape[0] if i=='var_tr' else 1, shuffle=False)

100%|██████████| 4/4 [00:00<00:00, 88.75it/s]


In [18]:
# training setting
seed_everything(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

gc.collect()
torch.cuda.empty_cache()

model = Autoencoder(n_var)
model = model.to(device)

n_epochs=5000
factor=0.1
patience=100
min_lr=1e-6

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=factor, patience=patience, min_lr=min_lr, verbose=True)

criterion = nn.MSELoss(reduction='mean').to(device)

history = dict(train=[], val=[])

best_model_wts = copy.deepcopy(model.state_dict())
best_loss = float('inf')
early_stopping_counter = 0

In [20]:
# training
for epoch in range(1, n_epochs+1):
    model = model.train()

    train_losses = []
    for seq_true, _, _, _ in data_loaders['trn']:
        seq_true = seq_true.to(device)
        seq_pred = model(seq_true)
        loss = criterion(seq_pred, seq_true)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    val_losses = []
    model = model.eval()
    with torch.no_grad():
        for seq_true, _, _, _ in data_loaders['val_tr']:
            seq_true = seq_true.to(device)
            seq_pred = model(seq_true)
            loss = criterion(seq_pred, seq_true)
            val_losses.append(loss.item())

    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)

    history['train'].append(train_loss)
    history['val'].append(val_loss)

    print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')

    scheduler.step(val_loss)  # 검증 손실을 이용하여 학습률 조절

    print("Current learning rate:", optimizer.param_groups[0]['lr'])

    if val_loss < best_loss:
        best_loss = val_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print(f'Early stopping at epoch {epoch} due to no improvement in validation loss.')
            break
    
    print(f'early_stopping_counter: {early_stopping_counter} ')
    
model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), f'model_best-autoencoder-low_esi{ver}-{low_esi}.pth')

with open(f'model_history-autoencoder-low_esi{ver}-{low_esi}.pkl', 'wb') as f:
    pickle.dump(history, f)

Epoch 1: train loss 0.16069549501354244 val loss 0.15829475888812702
Current learning rate: 0.001
early_stopping_counter: 0 


In [None]:
# evaluation data
gc.collect()
torch.cuda.empty_cache()

seed_everything(SEED)
model = Autoencoder(n_var)
model = model.to(device)
model.load_state_dict(torch.load(f'model_best-autoencoder-low_esi{ver}-{low_esi}.pth'))
model = model.eval()

criterion = nn.MSELoss(reduction='mean').to(device)

# loss calculation
eval_split = 'val_th' #tst val_th
eval_data = []
with torch.no_grad():
    for seq_true, y, id, n_seq in tqdm(data_loaders[eval_split]):
        id = id.cpu().numpy().ravel()[0]
        y = y.cpu().numpy().ravel()[0]
        seq_true = seq_true.to(device)
        seq_pred = model(seq_true)
        loss=criterion(seq_pred, seq_true)
        
        eval_data.append([id, y, loss.item(), n_seq])

eval_data = pd.DataFrame(eval_data, columns=['id', 'true', 'score', 'n_seq'])
eval_data.to_csv(f"eval_data-low_esi{ver}-{low_esi}-autoencoder-{eval_split}.csv", index=False)

In [None]:
# evaluation data
gc.collect()
torch.cuda.empty_cache()

seed_everything(SEED)
model = Autoencoder(n_var)
model = model.to(device)
model.load_state_dict(torch.load(f'model_best-autoencoder-low_esi{ver}-{low_esi}.pth'))
model = model.eval()

criterion = nn.MSELoss(reduction='mean').to(device)

# loss calculation
eval_split = 'tst' #tst val_th
eval_data = []
with torch.no_grad():
    for seq_true, y, id, n_seq in tqdm(data_loaders[eval_split]):
        id = id.cpu().numpy().ravel()[0]
        y = y.cpu().numpy().ravel()[0]
        seq_true = seq_true.to(device)
        seq_pred = model(seq_true)
        loss=criterion(seq_pred, seq_true)
        
        eval_data.append([id, y, loss.item(), n_seq])

eval_data = pd.DataFrame(eval_data, columns=['id', 'true', 'score', 'n_seq'])
eval_data.to_csv(f"eval_data-low_esi{ver}-{low_esi}-autoencoder-{eval_split}.csv", index=False)

In [None]:
def conf_mat(true, pred):
    tp = ((pred == 1) & (true == 1)).sum()
    fp = ((pred == 1) & (true == 0)).sum()
    fn = ((pred == 0) & (true == 1)).sum()
    tn = ((pred == 0) & (true == 0)).sum()
    return tp, fp, fn, tn

eval_split = 'val_th' #tst val_th
eval_result = []
eval_data = pd.read_csv(f"eval_data-low_esi{ver}-{low_esi}-autoencoder-{eval_split}.csv")
scores = eval_data['score'].unique()

for s in tqdm(scores):
    eval_data['pred'] = np.where(eval_data['score']>=s, 1, 0)
    tmp = eval_data.groupby('id').agg({'true': lambda x: x.values[0], 'pred': 'max'}).reset_index()
    tp, fp, fn, tn = conf_mat(tmp['true'], tmp['pred'])

    eval_result.append([s, tp/(tp+fn), tp/(tp+fp), 2*tp/(fp+2*tp+fn)])

eval_result = pd.DataFrame(eval_result, columns=['score', 'rec', 'prec', 'f1'])
eval_result.to_csv(f'eval_result-low_esi{ver}-{low_esi}-autoencoder-{eval_split}.csv', index=False)

100%|██████████| 2652/2652 [00:29<00:00, 90.81it/s]


In [None]:
def conf_mat(true, pred):
    tp = ((pred == 1) & (true == 1)).sum()
    fp = ((pred == 1) & (true == 0)).sum()
    fn = ((pred == 0) & (true == 1)).sum()
    tn = ((pred == 0) & (true == 0)).sum()
    return tp, fp, fn, tn

eval_split = 'tst' #tst val_th
eval_result = []
eval_data = pd.read_csv(f"eval_data-low_esi{ver}-{low_esi}-autoencoder-{eval_split}.csv")
scores = eval_data['score'].unique()

for s in tqdm(scores):
    eval_data['pred'] = np.where(eval_data['score']>=s, 1, 0)
    tmp = eval_data.groupby('id').agg({'true': lambda x: x.values[0], 'pred': 'max'}).reset_index()
    tp, fp, fn, tn = conf_mat(tmp['true'], tmp['pred'])

    eval_result.append([s, tp/(tp+fn), tp/(tp+fp), 2*tp/(fp+2*tp+fn)])

eval_result = pd.DataFrame(eval_result, columns=['score', 'rec', 'prec', 'f1'])
eval_result.to_csv(f'eval_result-low_esi{ver}-{low_esi}-autoencoder-{eval_split}.csv', index=False)