In [1]:
# ==================================================================================================
# import packages
from pathlib import Path
import torch as th
from torch.utils.data import DataLoader
import time
from tqdm import tqdm # Instantly make your loops show a smart progress meter
import os
import math
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary
from PIL import Image
import uuid
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

# Import other scripts here 
from drought_impact_dataset import *
from drought_impact_sampler import *
# Import a config file for training 
from utils.train_config_rec import * #train_dataset_params, train_sampler_params, test_dataset_params, test_sampler_params, sim_params, model_params
from torch.utils.data import DataLoader
from utils.utils_pixel import *
from model import *

# MLFlow
import mlflow
import mlflow.sklearn
import logging
# Configure the root logger to emit records of level WARN and above only.
logging.basicConfig(level=logging.WARN)
# Module-level logger for messages emitted from this notebook.
logger = logging.getLogger(__name__)

# Seed torch's global random number generator with a fixed value.
th.manual_seed(1)

  from .autonotebook import tqdm as notebook_tqdm


<torch._C.Generator at 0x154934243a90>

In [2]:
# ==================================================================================================
# Get setup details from configuration file
#
# train_dataset_params, train_sampler_params and sim_params are provided by the star-import of
# utils.train_config_rec. Several values below are hard-coded to small numbers; the config lookup
# each one replaces is kept as a trailing comment. NOTE(review): these look like quick smoke-run
# overrides - confirm and restore the config lookups before a full training run.

# --- Dataset -----------------------------------------------------------------------------------
feature_set = train_dataset_params['feature_set']
remove_bands = train_dataset_params['remove_bands']
multiple_labels = train_dataset_params['multiple_labels']
ts_len = train_dataset_params['ts_len']
len_preds = 1 #train_dataset_params['len_preds']

# --- Training split ----------------------------------------------------------------------------
batch_size_tr = train_sampler_params['batch_size']
n_batch = 2 #sim_params['n_batches']
n_train = batch_size_tr*n_batch  # total number of training samples drawn per epoch

# --- Validation split --------------------------------------------------------------------------
batch_size_val = 10 #sim_params['batch_size_val']
n_batch_val = 2 #sim_params['n_batches_val']

# --- Test split --------------------------------------------------------------------------------
batch_size_te = 10 # sim_params['batch_size_te']
n_batch_te = 2 #sim_params['n_batches_te']

# --- Experiment settings -----------------------------------------------------------------------
sample_type = sim_params["sample_type"]
method = sim_params["method"] # direct vs oneshot
# One experiment identifier per split (the original cell assigned `exp` twice; once is enough).
exp = sim_params["exp"]
exp_val = sim_params["exp_val"]
exp_te = sim_params["exp_test"]

# Model Training

In [3]:
# Create folder where checkpoints for model will be saved
#
# One sub-folder per (method, learning rate, number of layers, hidden size) combination, so runs
# with different hyper-parameters do not mix their checkpoint files.
checkpoint_folder = f'checkpoints/{method}_{sim_params["learning_rate"]}_{model_params["num_layers"]}_{model_params["hidden_dim"]}/'

# os.makedirs also creates the parent 'checkpoints/' directory when it is missing (os.mkdir would
# raise FileNotFoundError there), and exist_ok=True makes the call a no-op when the folder is
# already present, so no separate existence check is needed.
os.makedirs(checkpoint_folder, exist_ok=True)

In [4]:
# Look for the most recent regular (non-"best") checkpoint of today's run in checkpoint_folder.
# The experiment tag embeds today's date, so only file names carrying that date can match.
now = datetime.now()
dt_string = '{}_{}_{}_{}_{}'.format(
    now.strftime("%d/%m/%Y"),
    method,
    sim_params["learning_rate"],
    model_params["num_layers"],
    model_params["hidden_dim"],
)

# File names cannot contain '/', hence the date separators are swapped for '_'.
checkpoint_file_prefix = 'checkpoint_{}_e'.format(dt_string.split(' ')[0].replace('/', '_'))

# Keep files that start with the prefix and are not flagged as "best" checkpoints.
checkpoints = list(
    filter(
        lambda name: name.startswith(checkpoint_file_prefix) and 'best' not in name,
        os.listdir(checkpoint_folder),
    )
)

# Order the candidates with the project's key function, then pick the last one.
sorted_checkpoints = list(checkpoints)
sorted_checkpoints.sort(key=get_ckpt_epoch_batch)

# Get last checkpoint (latest epoch); None means there is nothing to resume from.
checkpoint_file = sorted_checkpoints[-1] if sorted_checkpoints else None

In [5]:
# Display the checkpoint selected for resuming (None when no matching file was found).
checkpoint_file

In [6]:
# Build the model, loss and optimizer, then either resume from the latest checkpoint or start a
# fresh training run with its own MLflow experiment.
#
# Names defined for the following cells: model, criterion, optimizer, start_epoch, start_batch,
# epoch_loss, dt_string, experiment, client, mlflow_run_id, mlflow_run_id_val.

hidden_dim = model_params["hidden_dim"]
num_layers = model_params["num_layers"]
output_dim = model_params["output_dim"]
lr = sim_params["learning_rate"] # learning rate

# The architecture depends only on the configuration, so it is built once for both paths.
if method == 'dir': #direct
    model = LSTM_oneshot(input_dim=len(feature_set)-len(remove_bands), hidden_dim=hidden_dim, num_layers=num_layers, output_dim=output_dim)
elif method == 'rec': #recursive
    model = LSTM_recursive(input_dim=len(feature_set)-len(remove_bands), hidden_dim=hidden_dim, num_layers=num_layers, num_steps=sim_params["num_steps"])
else:
    # An unknown method used to fall through both `if`s and fail later with a NameError on `model`.
    raise ValueError(f"Unsupported method {method!r}: expected 'dir' or 'rec'")

criterion = select_loss_function(sim_params['loss_function'])
# Always create the optimizer on the parameters of the model built above, so that its updates
# are applied to the tensors that are actually trained.
optimizer = select_optimizer(sim_params["optimizer"], model.parameters(), lr, sim_params["momentum"])

#summary(model, (len(train_ds.feature_set)-len(remove_bands), 1, 1))

client = mlflow.tracking.MlflowClient() # Create a MlflowClient object

if checkpoint_file is not None:
    # ------------------------------------------------------------------------------------------
    # Resume: restore the training position, the weights and the optimizer state.
    checkpoint = th.load(checkpoint_folder+checkpoint_file)
    start_epoch = checkpoint['epoch']
    start_batch = checkpoint['batch']
    epoch_loss = checkpoint['epoch_loss']
    dt_string = checkpoint['experiment_name']

    model.load_state_dict(checkpoint['state_dict'])

    # The saving code is not visible from this notebook, so accept either a pickled optimizer
    # object or its state dict. In both cases only the *state* is copied into the optimizer
    # created above; a pickled optimizer object would still reference the parameter tensors of
    # the checkpoint rather than those of `model`.
    saved_optimizer = checkpoint['optimizer']
    if isinstance(saved_optimizer, dict):
        optimizer.load_state_dict(saved_optimizer)
    else:
        optimizer.load_state_dict(saved_optimizer.state_dict())

    print("=> loaded checkpoint '{}' (trained for {} epochs)".format(checkpoint_file, checkpoint['epoch']+1))

    # Get existing MLflow experiment
    experiment = mlflow.get_experiment_by_name(dt_string)
    if experiment is None:
        raise RuntimeError(f"MLflow experiment '{dt_string}' referenced by checkpoint '{checkpoint_file}' was not found")

    # Look up the train / val runs of that experiment by name. The first match returned by
    # search_runs is used, as before; a missing run now gives a clear error instead of an
    # IndexError.
    runs = client.search_runs([experiment.experiment_id])
    run_ids_by_name = {}
    for run in runs:
        run_ids_by_name.setdefault(run.info.run_name, run.info.run_id)
    missing_runs = [name for name in (f'train_{method}', f'val_{method}') if name not in run_ids_by_name]
    if missing_runs:
        raise RuntimeError(f"MLflow experiment '{dt_string}' has no run named {missing_runs}")
    mlflow_run_id = run_ids_by_name[f'train_{method}']
    mlflow_run_id_val = run_ids_by_name[f'val_{method}']

else:
    # ------------------------------------------------------------------------------------------
    # Fresh start: nothing to restore.
    start_epoch = 0
    start_batch = 0
    epoch_loss = 0

    # Experiment name: date + method + hyper-parameters. The method is part of the name so that
    # it lines up with the checkpoint-file prefix used when looking for a checkpoint to resume,
    # which also contains the method.
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y")+f'_{method}_{sim_params["learning_rate"]}_{model_params["num_layers"]}_{model_params["hidden_dim"]}'
    #dt_string = 'debug'

    # Reuse the experiment when it already exists (for example a second fresh start on the same
    # day); mlflow.create_experiment raises if the name is already taken.
    experiment = mlflow.get_experiment_by_name(dt_string)
    if experiment is None:
        mlflow.create_experiment(name=dt_string)
        experiment = mlflow.get_experiment_by_name(dt_string)

    # Create one run per split and log the static parameters. Each `with` block closes its run on
    # exit; the run IDs are kept so that later cells can log to them through `client`.
    with mlflow.start_run(experiment_id = experiment.experiment_id, run_name=f'train_{method}') as train_run:
        mlflow.log_param("n_samples training", n_train)
        mlflow.log_param("batch_size training", batch_size_tr)

    with mlflow.start_run(experiment_id = experiment.experiment_id, run_name=f'val_{method}') as val_run:
        mlflow.log_param(f"n_samples val", batch_size_val*n_batch_val)

    # Take the IDs straight from the runs that were just created rather than searching by name,
    # which would be ambiguous when the experiment already holds runs with the same names.
    mlflow_run_id = train_run.info.run_id
    mlflow_run_id_val = val_run.info.run_id

In [8]:
# Train and validate the model epoch by epoch, keep the checkpoint with the lowest validation
# loss, then log the trained model to MLflow.
total_tr_loss = 0
# NOTE(review): best_loss restarts at +inf even when resuming from a checkpoint, so the first
# epoch after a resume is always compared against +inf - confirm this is intended.
best_loss = np.inf
model.train()

for ix_epoch in tqdm(range(1)): #sim_params["num_epochs"])
    # Skip the epochs that were already completed before the checkpoint was written.
    if ix_epoch<start_epoch:
        continue

    print(f"Epoch {ix_epoch}\n---------")

    # Re-enter training mode at the start of every epoch, in case the validation pass of the
    # previous epoch left the model in eval mode (test_model is not visible from this notebook).
    model.train()

    # Train
    # NOTE(review): start_batch and epoch_loss are forwarded unchanged on every epoch; presumably
    # they only make sense for the first epoch after a resume - verify against train_model.
    epoch_loss = train_model(method=method, model=model, epoch=ix_epoch, loss_function=criterion, optimizer=optimizer, 
                             batch_size=batch_size_tr, n_batch=n_batch,
                             n_timesteps_in=ts_len, n_timesteps_out=len_preds, n_feats_in=len(feature_set)-len(remove_bands), n_feats_out=output_dim, 
                             remove_band=remove_bands, feature_set=feature_set, 
                             experiment=experiment, checkpoint_folder=checkpoint_folder, dt_string=dt_string, start_batch=start_batch, client=client, run_id=mlflow_run_id, epoch_loss=epoch_loss,
                             sample_type=sample_type, exp=exp, cp_idx=(0,1))

    total_tr_loss += epoch_loss

    # Validate
    # Track validation under the dedicated 'val_<method>' run; passing the training run ID here
    # left mlflow_run_id_val unused and sent the validation tracking to the training run.
    total_val_loss = test_model(method=method, model=model, epoch=ix_epoch, loss_function=criterion, 
                             batch_size=batch_size_val, n_batch=n_batch_val,
                             n_timesteps_in=ts_len, n_timesteps_out=len_preds, n_feats_in=len(feature_set)-len(remove_bands), n_feats_out=output_dim, 
                             remove_band=remove_bands, feature_set=feature_set, 
                             experiment=experiment, split='val', start_batch=start_batch, client=client, run_id=mlflow_run_id_val, checkpoint_folder=checkpoint_folder,
                             sample_type=sample_type, exp=exp_val, cp_idx=(0,1))

    # Compare this epoch's validation loss with the best one so far; the helper receives the
    # path to use for a "best" checkpoint and returns the updated best loss.
    best_loss = compare_model_for_checkpoint(total_val_loss, best_loss, model, ix_epoch, checkpoint_folder+'checkpoint_'+dt_string.split(' ')[0].replace('/', '_')+f'_e{ix_epoch}_b{n_batch}_best.pth.tar') 

# Store the trained model as an artifact of a separate 'trained' run.
# NOTE(review): `model` is a torch module but is logged with the scikit-learn flavor; consider
# mlflow.pytorch.log_model once it is confirmed that nothing loads it back via mlflow.sklearn.
with mlflow.start_run(experiment_id = experiment.experiment_id, run_name='trained'):
    mlflow.sklearn.log_model(model, "model")

  0%|                                                            | 0/1 [00:00<?, ?it/s]

Epoch 0
---------
Batch nbr 0. Average batch loss: 0.007998641580343246
Time for a batch: 0.19682884216308594 sec
Batch nbr 1. Average batch loss: 0.007397109270095825
Time for a batch: 0.15918207168579102 sec
val-ing model...


100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.93it/s]


Batch nbr 0. Average batch loss: 0.04717507362365723
Batch nbr 1. Average batch loss: 0.050290465354919434
=> Saving a new best


In [None]:
 ########################################################################
# TEST MODEL
# Evaluate the trained model on the test split and track the results under a dedicated
# 'test_<method>' MLflow run.
with mlflow.start_run(experiment_id = experiment.experiment_id, run_name=f'test_{method}') as test_run:
    mlflow.log_param(f"n_samples test", batch_size_te*n_batch_te)

# Take the ID straight from the run that was just created; searching by name would be ambiguous
# if this cell is executed more than once within the same experiment.
mlflow_run_id_test = test_run.info.run_id
client = mlflow.tracking.MlflowClient() # Create a MlflowClient object

# Pass the test run ID (the training run ID was passed here before) and keep the result under
# its own name so that the validation loss of the last epoch is not overwritten.
# ix_epoch still holds the index of the last epoch of the training loop.
total_test_loss = test_model(method=method, model=model, epoch=ix_epoch, loss_function=criterion, 
                             batch_size=batch_size_te, n_batch=n_batch_te,
                             n_timesteps_in=ts_len, n_timesteps_out=len_preds, n_feats_in=len(feature_set)-len(remove_bands), n_feats_out=output_dim, 
                             remove_band=remove_bands, feature_set=feature_set, 
                             experiment=experiment, split='test', start_batch=start_batch, client=client, run_id=mlflow_run_id_test, checkpoint_folder=checkpoint_folder,
                             sample_type=sample_type, exp=exp_te, cp_idx=(0,1))


In [14]:
# Mark the train / val / test runs as finished.
#
# mlflow.end_run() takes a run *status* string, not a run ID, and only affects the currently
# active run, so it cannot be used to close a specific run. MlflowClient.set_terminated() takes
# the run ID and sets the status (FINISHED by default) together with the end time.
# Requires the test cell to have been executed, since it defines mlflow_run_id_test.
client = mlflow.tracking.MlflowClient()
for finished_run_id in (mlflow_run_id, mlflow_run_id_val, mlflow_run_id_test):
    client.set_terminated(finished_run_id)

# Also close a run that may still be active in the fluent API (no-op when there is none).
mlflow.end_run()

NameError: name 'mlflow_run_id_test' is not defined