In [1]:
print('..running')
import os
import torch
from torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler
import ssl
# SECURITY NOTE(review): this swaps the ssl module's default HTTPS context factory
# for the unverified one, so HTTPS requests that rely on that default stop checking
# server certificates for the rest of the kernel session.
# Presumably a workaround for a certificate error during a download -- TODO confirm,
# and prefer fixing the local CA bundle over disabling verification globally.
ssl._create_default_https_context = ssl._create_unverified_context
from util import cross_entropy_loss_fn
import models
import wandb
from train import evaluation, training 
from data import load_data

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory handed to training() and used to build the artifact path at the end
# of this cell.
result_dir = 'models'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of exists() + mkdir(); makedirs also copes with a nested result_dir.
os.makedirs(result_dir, exist_ok=True)
# Label for this run; it is passed to training() and forms the stem of the
# '<name>.model' artifact file. Change it when running the regularized variant.
name = 'made'

# --- data / model size ---
D = 784            # input dimension
M = 8_000          # hidden layer dimensionality
n_masks = 1        # forwarded to models.MADE(n_masks=...)

# --- optimisation ---
batch_size = 64
lr = 1e-3          # learning rate
num_epochs = 85
max_patience = 10  # early stopping
lam = 0.0          # forwarded to training(lam=...); presumably the regularization weight -- TODO confirm

# Collected settings for this run. 'lambda' is a reserved word, so it cannot be
# given as a keyword argument and is inserted last by key to keep the original
# key order.
hyperparameters = dict(
    D=D,
    M=M,
    lr=lr,
    n_masks=n_masks,
    num_epochs=num_epochs,
    max_patience=max_patience,
    batch_size=batch_size,
)
hyperparameters['lambda'] = lam

# Training procedure
#
# The W&B run is opened as a context manager so that it is always finished, even
# when data loading, training or evaluation raises. The hyperparameters dict is
# passed as the run config so the settings are recorded alongside the metrics
# (previously it was built but never handed to wandb).
with wandb.init(entity="rajpal906", config=hyperparameters) as run:
    train_data, val_data, test_data = load_data('mnist', binarize=True)

    # os.cpu_count() can return None when the count is undetermined; DataLoader
    # needs an int, so fall back to 0 (load in the main process).
    n_workers = os.cpu_count() or 0
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=n_workers)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=n_workers)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=n_workers)

    model = models.MADE(input_dim=D, hidden_dims=[M], n_masks=n_masks).to(device)

    # Optimise only the parameters that require gradients.
    optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=lr)

    # NOTE(review): step_size=100 exceeds num_epochs (85). If training() steps the
    # scheduler once per epoch the learning rate is never halved -- TODO confirm
    # how training() drives the scheduler.
    scheduler = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.5)

    nll_val = training(name=name, result_dir=result_dir, max_patience=max_patience, num_epochs=num_epochs,
                       model=model, loss_fn=cross_entropy_loss_fn, optimizer=optimizer, scheduler=scheduler,
                       training_loader=train_loader, val_loader=val_loader, device=device, lam=lam,
                       batch_size=batch_size)

    model.eval()
    # NOTE(review): this evaluates the weights held in memory after the last epoch.
    # The training log prints "saved!" only on some epochs, so the file logged as an
    # artifact below may hold different (earlier) weights than the ones tested here.
    # Consider evaluating the saved checkpoint instead -- TODO confirm how
    # training() writes it before changing this.
    test_val = evaluation(test_loader, cross_entropy_loss_fn, model_best=model)

    # NOTE(review): the rescale by batch_size assumes evaluation() returns a value
    # that is batch_size times smaller than the per-example loss; if so it is only
    # exact when every batch is full -- TODO verify against evaluation().
    wandb.log({"test_loss": test_val * batch_size})

    # Upload the model file that training() is expected to have written.
    run.log_artifact(os.path.join(result_dir, name + '.model'))

..running


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms2592586[0m ([33mrajpal906[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch: 0, train nll=92.279541015625, val nll=96.88436979166667
saved!
Epoch: 1, train nll=92.43114471435547, val nll=92.35324747721354
saved!
Epoch: 2, train nll=89.25775146484375, val nll=90.71231290690105
saved!
Epoch: 3, train nll=95.1533432006836, val nll=89.6805634765625
saved!
Epoch: 4, train nll=88.9970703125, val nll=88.95619685872396
saved!
Epoch: 5, train nll=89.03539276123047, val nll=88.43202758789063
saved!
Epoch: 6, train nll=86.34261322021484, val nll=87.87378849283854
saved!
Epoch: 7, train nll=87.66149139404297, val nll=87.81827270507813
saved!
Epoch: 8, train nll=89.79306030273438, val nll=87.61615120442708
saved!
Epoch: 9, train nll=89.3541259765625, val nll=87.41804622395833
saved!
Epoch: 10, train nll=89.66651153564453, val nll=87.30060815429688
saved!
Epoch: 11, train nll=82.56864166259766, val nll=87.13463956705729
saved!
Epoch: 12, train nll=85.81511688232422, val nll=86.9649921061198
saved!
Epoch: 13, train nll=84.5099105834961, val nll=87.02096459960937
Epoch:

wandb: Network error (ConnectionError), entering retry loop.


Epoch: 24, train nll=86.78034210205078, val nll=86.34636319986978
saved!
Epoch: 25, train nll=87.07500457763672, val nll=86.5243475748698
Epoch: 26, train nll=84.87444305419922, val nll=86.54276928710938
Epoch: 27, train nll=87.57623291015625, val nll=86.36139583333333
Epoch: 28, train nll=83.04195404052734, val nll=86.64837752278646
Epoch: 29, train nll=87.22904205322266, val nll=86.45001831054688
Epoch: 30, train nll=88.13463592529297, val nll=86.54351586914062
Epoch: 31, train nll=81.87834930419922, val nll=86.79252726236979
Epoch: 32, train nll=83.59268951416016, val nll=86.50314689127605
Epoch: 33, train nll=83.1767349243164, val nll=86.46618497721354
Epoch: 34, train nll=87.68682861328125, val nll=86.58639152018229
Epoch: 35, train nll=77.56515502929688, val nll=86.45420670572916
FINAL LOSS: nll=1.3521166007995606


VBox(children=(Label(value='95.741 MB of 95.741 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
test_loss,▁
train_loss,▇▇▆█▆▆▄▅▆▆▆▃▄▄▂▅▆▅▄▆▆▄▄▅▅▅▄▅▃▅▅▃▃▃▅▁
val_loss,█▅▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,35.0
test_loss,86.53546
train_loss,77.56516
val_loss,1.35085
