In [1]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# For Progress Bar
from tqdm import tqdm

# PyTorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import ray
from ray import tune, air
from src.utils import same_seed
from src.model import train_and_validate, validate
from src.hyper import NUM_WORKERS, const_config, config, scheduler, search_algo, NUM_SAMPLES

# Apply the configured random seed before any trial is launched.
same_seed(const_config['random_seed'])

# Shut down any Ray runtime left over from a previous execution of this cell
# before starting a fresh local one, so the cell can be re-run in the same
# kernel.
ray.shutdown()
ray.init(num_cpus=3, num_gpus=0)

# Hyperparameter search: minimise the "loss" metric reported by
# train_and_validate over NUM_SAMPLES trials, using the search algorithm and
# trial scheduler defined in src.hyper.
tuner = tune.Tuner(
    train_and_validate,
    param_space=config,
    tune_config=tune.TuneConfig(
        metric="loss",
        mode="min",
        scheduler=scheduler,
        search_alg=search_algo,
        num_samples=NUM_SAMPLES,
    ),
    run_config=air.RunConfig(
        name="test",
        local_dir='./checkpoints',
        checkpoint_config=air.CheckpointConfig(
            # Keep only the single best checkpoint per trial.
            num_to_keep=1,
            # AIR's CheckpointConfig takes the bare metric name plus an
            # explicit order; the "min-" prefix is the legacy tune.run
            # convention and is not part of this API.
            checkpoint_score_attribute='loss',
            checkpoint_score_order='min',
        ),
    ),
)

# Runs all trials (blocking) and returns the grid of per-trial results.
result = tuner.fit()

2022-09-20 22:15:06,597	INFO worker.py:1518 -- Started a local Ray instance.
  return ot.distributions.LogUniformDistribution(
[32m[I 2022-09-20 22:15:08,222][0m A new study created in memory with name: optuna[0m


Trial name,status,loc,batch_size,lr,iter,total time (s),loss
train_and_validate_a192eb02,TERMINATED,127.0.0.1:18008,8,1.2748e-05,300,172.252,2.55058
train_and_validate_a3ff2d57,TERMINATED,127.0.0.1:6168,8,0.0074838,300,170.039,
train_and_validate_a401c587,TERMINATED,127.0.0.1:20184,16,3.67669e-05,1,2.57645,48.5318
train_and_validate_a404d2e9,TERMINATED,127.0.0.1:20184,32,2.91681e-05,2,0.442528,38.9246
train_and_validate_a850df41,TERMINATED,127.0.0.1:20184,8,0.0019631,300,168.246,




Result for train_and_validate_a192eb02:
  date: 2022-09-20_22-15-14
  done: false
  experiment_id: e253311100a94eb8acd878ea094b759f
  hostname: DESKTOP-SNQ64UB
  iterations_since_restore: 1
  loss: 36.06066048846525
  node_ip: 127.0.0.1
  pid: 18008
  should_checkpoint: true
  time_since_restore: 2.4642159938812256
  time_this_iter_s: 2.4642159938812256
  time_total_s: 2.4642159938812256
  timestamp: 1663683314
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: a192eb02
  warmup_time: 0.004998207092285156
  
Result for train_and_validate_a401c587:
  date: 2022-09-20_22-15-19
  done: true
  experiment_id: 7f87d711b2a24273aab2b502c0806b8a
  hostname: DESKTOP-SNQ64UB
  iterations_since_restore: 1
  loss: 48.531800494474524
  node_ip: 127.0.0.1
  pid: 20184
  should_checkpoint: true
  time_since_restore: 2.576446056365967
  time_this_iter_s: 2.576446056365967
  time_total_s: 2.576446056365967
  timestamp: 1663683319
  timesteps_since_restore: 0
  training_iteration: 1
  trial



Result for train_and_validate_a3ff2d57:
  date: 2022-09-20_22-18-07
  done: true
  experiment_id: 96b3d791baeb4ce397b5ae957b783ff3
  hostname: DESKTOP-SNQ64UB
  iterations_since_restore: 300
  loss: .nan
  node_ip: 127.0.0.1
  pid: 6168
  should_checkpoint: true
  time_since_restore: 170.03913044929504
  time_this_iter_s: 0.44600367546081543
  time_total_s: 170.03913044929504
  timestamp: 1663683487
  timesteps_since_restore: 0
  training_iteration: 300
  trial_id: a3ff2d57
  warmup_time: 0.005997419357299805
  




Result for train_and_validate_a850df41:
  date: 2022-09-20_22-18-08
  done: true
  experiment_id: 7f87d711b2a24273aab2b502c0806b8a
  hostname: DESKTOP-SNQ64UB
  iterations_since_restore: 300
  loss: .nan
  node_ip: 127.0.0.1
  pid: 20184
  should_checkpoint: true
  time_since_restore: 168.24567770957947
  time_this_iter_s: 0.3820021152496338
  time_total_s: 168.24567770957947
  timestamp: 1663683488
  timesteps_since_restore: 0
  training_iteration: 300
  trial_id: a850df41
  warmup_time: 0.006003856658935547
  


2022-09-20 22:18:08,421	INFO tune.py:758 -- Total run time: 180.20 seconds (180.04 seconds for the tuning loop).


In [3]:
# Select the trial with the smallest "loss", judged on each trial's last
# reported value.
best_result = result.get_best_result(metric="loss", mode="min", scope="last")

# Report the winning hyperparameters, then the loss that trial ended on.
best_config = best_result.config
print("Best trial config: {}".format(best_config))
best_loss = best_result.metrics["loss"]
print("Best result error rate", best_loss)

Best trial config: {'batch_size': 8, 'lr': 1.2747985220298502e-05}
Best result error rate 2.5505837848519577


In [4]:
from src.model import My_Model

# Rebuild the network before restoring its weights.
# NOTE(review): 117 is presumably the input feature count the trials were
# trained with — confirm against My_Model / the data pipeline.
best_trained_model = My_Model(117)

# Materialise the best trial's checkpoint as a local directory.
best_checkpoint_dir = best_result.checkpoint.to_directory()

# The checkpoint file holds a (model_state, optimizer_state) pair; only the
# model weights are used below. map_location='cpu' lets the checkpoint load on
# a machine without a GPU even if it was saved from one; inference later in
# this notebook runs on the CPU anyway.
model_state, optimizer_state = torch.load(
    os.path.join(best_checkpoint_dir, 'checkpoint'),
    map_location='cpu',
)
best_trained_model.load_state_dict(model_state)

<All keys matched successfully>

In [5]:
from src.model import train_and_validate
from src.data import load_dataset, get_data_loaders
from src.utils import predict

# Load the dataset stored in ./covid.test.csv.
dataset = load_dataset('./covid.test.csv')

# Only a test dataset is passed in, so the first two returned loaders are
# discarded and the third is kept as the test loader.
_, _, test_loader = get_data_loaders(test_dataset=dataset)

# Run inference on the CPU with the restored best model and show the result.
inference_device = 'cpu'
cpu_model = best_trained_model.to(inference_device)
preds = predict(cpu_model, test_loader, device=inference_device)
print("preds: {}".format(preds))

preds: [ 8.890646   7.7740154  4.792796  ... 36.950333  37.123928  39.654457 ]
