In [1]:
import torch
import utils as ut
import numpy as np
import pandas as pd
import torch.nn as nn
from torch import optim
from time import time
from torch.utils.tensorboard import SummaryWriter
from rumex_model import RumexNet
from trainer import train, validate
from rumex_dataset import RumexDataset, train_loader, test_loader
# Everything runs on the CPU; the CUDA auto-detect variant is kept for reference.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device('cpu')

# Learning-rate pairs noted per backbone (they appear to be base_lr, max_lr;
# the densenet row matches the values set below):
#   resnet:     1e-4, 1e-3
#   mobilenet:  1e-4, 7e-3
#   densenet:   1e-4, 1e-3
#   mnasnet:    1e-3, 1e-2
#   shufflenet: 5e-3, 5e-2

# ---- run configuration ----
model_name = "densenet"
base_lr, max_lr = 1e-4, 1e-3
bs = 32
data_dir = "/u/21/hiremas1/unix/postdoc/rumex/data256_for_training/"
log_dir = f"logs/{model_name}"

# ---- training data: the 'train' and 'test1' folders are merged into one set ----
train_split = RumexDataset(f"{data_dir}train/", train_flag=True)
dste = RumexDataset(f"{data_dir}test1/", train_flag=False)
dstr = torch.utils.data.ConcatDataset([train_split, dste])
dltr = train_loader(dstr, bs)

# ---- validation data: the 'valid' folder, loaded through the test loader ----
dsva = RumexDataset(f"{data_dir}valid/", train_flag=False)
dlva = test_loader(dsva, bs)


In [2]:
# Build the model, loss, optimizer and cyclic learning-rate schedule, then
# train for `num_epochs` epochs. After every epoch the model is validated, the
# metrics are stored in `history` and written to TensorBoard, and a checkpoint
# is saved whenever the validation loss improves.
model = RumexNet(model_name)
# NOTE(review): reduction="none" returns one loss value per sample; presumably
# `train`/`validate` reduce it themselves -- confirm in trainer.py.
loss_fn = nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=max_lr)
scheduler = optim.lr_scheduler.CyclicLR(optimizer,
                                        step_size_up=500,
                                        cycle_momentum=False,
                                        base_lr=base_lr,
                                        max_lr=max_lr)
writer = SummaryWriter(log_dir=log_dir)
best_val_loss = np.inf
best_val_acc = 0.5
num_epochs = 20

# One row per epoch; columns: train loss, val loss, val acc, val f1, val auc.
# This array must exist before the loop: the loop body writes into it.
history = np.zeros((num_epochs, 5))

try:
    # range() keeps `ep` a plain Python int, so the checkpoint stores a
    # built-in integer rather than a NumPy scalar.
    for ep in range(num_epochs):
        start = time()

        #### fit model ##########
        loss = train(model, dltr, optimizer, scheduler, loss_fn, device)

        ##### Model Validation ##########
        predictions, metrics = validate(model, dlva, loss_fn, device)

        history[ep, 0] = loss             # training loss
        history[ep, 1] = metrics["loss"]  # validation loss
        history[ep, 2] = metrics["acc"]   # validation accuracy
        history[ep, 3] = metrics["f1"]    # validation F1
        history[ep, 4] = metrics["auc"]   # validation AUC

        ##### checkpoint saving and logging ##########
        # Keep only the checkpoint with the lowest validation loss so far.
        if metrics['loss'] < best_val_loss:
            best_val_loss = metrics['loss']
            ckpt_dict = {'ep': ep,
                         'state_dict': model.state_dict(),
                         'optim_dict': optimizer.state_dict(),
                         'predictions': predictions,
                         'metrics': metrics}
            ut.save_ckpt(ckpt_dict, log_dir)

        # tensorboard logging
        writer.add_scalar('train/loss', loss, ep)
        for key, value in metrics.items():
            writer.add_scalar('val/' + key, value, ep)

        et = time() - start
        # Each label is paired with its own metric: re -> recall, pre -> precision.
        print(f"ep:{ep}|et:{et:.3f}|loss_tr:{loss:.5f}|loss: {metrics['loss']:.5f}" +
              f"|acc:{metrics['acc']:.5f}|re:{metrics['recall']:.5f}" +
              f"|pre:{metrics['pre']:.5f}|f1:{metrics['f1']:.5f}|auc:{metrics['auc']:.5f}")
finally:
    # Flush and close the event file even if training aborts part-way.
    writer.close()

# Uncomment to persist the per-epoch history next to the TensorBoard logs.
# np.save(log_dir+"/history.npy", history)

OSError: [Errno 12] Cannot allocate memory

In [None]:
import matplotlib.pyplot as plt

# Plot the first three columns of `history` (training loss, validation loss,
# validation accuracy) against the epoch index. Each curve is labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history[:, 0], label="train loss")
ax.plot(history[:, 1], label="val loss")
ax.plot(history[:, 2], label="val acc")
ax.set_xlabel("epoch")
ax.set_title("Training history")
ax.legend()
plt.show()