In [1]:
import os

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn, optim
from torch.nn import functional as F

import json
from mourga_variational.variational_rnn import VariationalRNN
#from temperature_scaling import _ECELoss

from config import Settings; settings = Settings()
from rnn_utils import DiagnosesDataset, split_dataset, MYCOLLATE
import torch.optim as optim

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load variational model

In [46]:
#hyperparameters
hyperparameters_path = 'data/models/golden-oath-84/hyper_parameters.json'
model_path = 'data/models/golden-oath-84/weights'
with open(hyperparameters_path,'r') as f:
    hyperparams = json.load(f)
    
# weights
weights = torch.load(model_path)

In [47]:
model = VariationalRNN(**hyperparams)
model.load_state_dict(weights)

<All keys matched successfully>

# Load validation split

In [48]:
grouping='ccs'
batch_size=64

dataset_folder = os.path.join(settings.data_base,settings.model_ready_dataset_folder,'diag_only')
dataset = DiagnosesDataset(os.path.join(dataset_folder,'dataset.json'),grouping)
print('dataset at',dataset_folder)

val_dataset = DiagnosesDataset(os.path.join(dataset_folder,'val_subset.json'),grouping)
print(len(val_dataset))
val_dataloader = DataLoader(val_dataset,batch_size=batch_size,collate_fn=MYCOLLATE(dataset)) #batch_size here is arbitrary and doesn't affect total validation speed

dataset at data/model_ready_dataset/diag_only
1125


# Use validation set to set the temperature

## Global Temperature

In [51]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
import numpy as np
import abc
class ModelWithGlobalTemperature(nn.Module):
    """
    A thin decorator, which wraps a model with temperature scaling
    model (nn.Module):
        A classification neural network
        NB: Output of the neural network should be the classification logits,
            NOT the softmax (or log softmax)!
    """
    def __init__(self, model):
        super(ModelWithGlobalTemperature, self).__init__()
        self.model = model
        self.temperature = nn.Parameter(torch.ones(1))

    def forward(self, input,**kwargs):
        logits = self.model(input,kwargs)
        return self.temperature_scale(logits)

    def temperature_scale(self, logits):
        """
        Perform temperature scaling on logits
        """
        # Expand temperature to match the size of logits
        temperature = self.temperature.unsqueeze(1).expand(logits.size(0), logits.size(1), logits.size(2))
        return logits / temperature
    
def train_global_T(model, dataloader):
        """
        dataloader: pytorch dataloader
            Should be a validation dataloader
        """
        criterion = nn.BCEWithLogitsLoss(reduction='none')
        optimizer = optim.Adam([model.temperature], lr=1e-1, weight_decay=0)
        #lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.1)
        epochs = 5

        train_losses = []
        model.train()
        for e in range(epochs):
            total_seqs = []
            total_loss = []
            for batch in dataloader:
                optimizer.zero_grad()

                history_sequences, target_sequences = batch['train_sequences'],batch['target_sequences']

                inputs = history_sequences['sequence']
                outs = model(inputs,mc_dropout=True)

                loss = criterion(outs, target_sequences['sequence'])

                # zero-out positions of the loss corresponding to padded inputs
                # if a sequence has all zeros it is considered to be a padding.
                # Comment: safer way to do this would be a solution using the lengths...
                sequences,lengths = pad_packed_sequence(inputs,batch_first=True)
                mask = ~sequences.any(dim=2).unsqueeze(2).repeat(1,1,sequences.shape[-1])

                loss.masked_fill_(mask, 0)

                loss = loss.sum() / (lengths.sum()*sequences.shape[-1])

                loss.backward()
                optimizer.step()

                total_loss.append(loss.item())

            epoch_train_loss = np.mean(total_loss)

            #lr_scheduler.step(epoch_train_loss)

            print("Epoch {:2d}, lr: {:.4f}, loss: {:.4f}, T: {:.4f}"
                  .format(e,
                          optimizer.param_groups[0]['lr'],
                          epoch_train_loss,
                          model.temperature.item()
                          ))
            
            
# define new model with added global temperature scaling
model_global_temperature = ModelWithGlobalTemperature(model)
# train temperature
train_global_T(model_global_temperature,val_dataloader)

Epoch  0, lr: 0.1000, loss: 0.1114, T: 1.0510
Epoch  1, lr: 0.1000, loss: 0.1112, T: 1.0860
Epoch  2, lr: 0.1000, loss: 0.1112, T: 1.0745
Epoch  3, lr: 0.1000, loss: 0.1113, T: 1.0853
Epoch  4, lr: 0.1000, loss: 0.1112, T: 1.0687


## Specific Temperature

In [81]:
a = torch.ones(2,3,3,requires_grad=True)
mask = torch.zeros(2,3,3,requires_grad=False)
mask[0,0,1] = 1
res = a * mask

In [86]:
model_st.temperature.shape[0]

272

In [49]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
import numpy as np
import abc

class ModelWithSpecificTemperature(nn.Module):
    """
    A thin decorator, which wraps a model with temperature scaling
    model (nn.Module):
        A classification neural network
        NB: Output of the neural network should be the classification logits,
            NOT the softmax (or log softmax)!
    """
    def __init__(self, model):
        super(ModelWithSpecificTemperature, self).__init__()
        self.model = model
        self.temperature = nn.Parameter(torch.ones(model.n_labels))

    def forward(self, input,**kwargs):
        logits = self.model(input,kwargs)
        return logits / self.temperature

    def temperature_scale(self, logits):
        """
        Perform temperature scaling on logits
        """
        # Expand temperature to match the size of logits
        return logits / temperature
    
    
def train_specific_temperature(model, dataloader, pos):
        """
        dataloader: pytorch dataloader
            Should be a validation dataloader
        """
        inputs = next(iter(dataloader))['train_sequences']['sequence']
        sequences,lengths = pad_packed_sequence(inputs,batch_first=True)
        mask_diagnostic = torch.zeros(sequences.shape)
        mask_diagnostic[:,:,pos] = 1
        mask_diagnostic = mask_diagnostic == 0 # True when value is 0
        del inputs,sequences,lengths
        
        criterion = nn.BCEWithLogitsLoss(reduction='none')
        optimizer = optim.Adam([model.temperature], lr=5e-3)
        #lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.1)
        epochs = 5

        train_losses = []
        model.train()
        for e in range(epochs):
            total_seqs = []
            total_loss = []
            for batch in dataloader:
                optimizer.zero_grad()

                history_sequences, target_sequences = batch['train_sequences'],batch['target_sequences']

                inputs = history_sequences['sequence']
                outs = model(inputs,mc_dropout=True)

                loss = criterion(outs, target_sequences['sequence'])

                # zero-out positions of the loss corresponding to padded inputs
                # if a sequence has all zeros it is considered to be a padding.
                # Comment: safer way to do this would be a solution using the lengths...
                sequences,lengths = pad_packed_sequence(inputs,batch_first=True)
                mask = ~sequences.any(dim=2).unsqueeze(2).repeat(1,1,sequences.shape[-1])

                loss.masked_fill_(mask, 0)
                
                # now mask out all the diagnoses except the one
                mask_diagnostic = torch.zeros(outs.shape)
                mask_diagnostic[:,:,pos] = 1
                mask_diagnostic = mask_diagnostic == 0 # True when value is 0
                loss.masked_fill_(mask_diagnostic,0)

                
                loss = loss.sum() / (lengths.sum()+loss.shape[0])

                loss.backward()
                optimizer.step()

                total_loss.append(loss.item())

            epoch_train_loss = np.mean(total_loss)

            #lr_scheduler.step(epoch_train_loss)

            print("Epoch {:2d}, lr: {:.4f}, loss: {:.4f}, T: {:.4f}"
                  .format(e,
                          optimizer.param_groups[0]['lr'],
                          epoch_train_loss,
                          model.temperature[pos].item()
                          ))
            
            
# define new model with added global temperature scaling
model_st = ModelWithSpecificTemperature(model)
# train temperature
train_specific_temperature(model_st,val_dataloader,88)

Epoch  0, lr: 0.0050, loss: 0.2449, T: 0.9786


KeyboardInterrupt: 

In [39]:
loss.sum() / (lengths.sum()+loss.shape[0])

tensor(0.0334, grad_fn=<DivBackward0>)

In [29]:
lengths

tensor([ 1,  1,  1,  1,  1, 33,  3,  1,  1,  1,  4,  1,  1,  1,  2,  1,  1,  1,
         1,  1,  1,  1,  5,  3,  1,  2,  1,  2,  3,  1,  1,  3,  1,  1,  1,  1,
         1,  1,  1,  1,  2,  1,  2,  2,  1,  1,  1,  1,  4,  1,  3,  1,  1,  1,
         2,  1,  1,  2,  6,  1,  1,  1,  1,  1])

In [11]:
#model_st = ModelWithSpecificTemperature(model)
for batch in val_dataloader:
    inputs = batch['train_sequences']['sequence']
    sequences,lengths = pad_packed_sequence(inputs,batch_first=True)
    a = sequences.any(dim=2).unsqueeze(2).repeat(1,1,sequences.shape[-1])
    break

In [13]:
a.requires_grad

False

In [41]:
model_global_temperature = ModelWithGlobalTemperature(model)

In [43]:
train_global_T(model_global_temperature,val_dataloader)

Epoch  0, lr: 0.1000, loss: 0.1113, T: 1.0640
Epoch  1, lr: 0.1000, loss: 0.1112, T: 1.0830
Epoch  2, lr: 0.1000, loss: 0.1114, T: 1.0791
Epoch  3, lr: 0.1000, loss: 0.1112, T: 1.0853
Epoch  4, lr: 0.1000, loss: 0.1112, T: 1.0812
Epoch  5, lr: 0.1000, loss: 0.1113, T: 1.0858


KeyboardInterrupt: 

In [5]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
import numpy as np

def train_temperature_global_T(model, dataloader):
    
    criterion = nn.BCEWithLogitsLoss(reduction='none')
    optimizer = optim.Adam([model.T], lr=1e-1, weight_decay=0)
    #lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.1)
    epochs = 30
    
    train_losses = []
    model.train()
    for e in range(epochs):
        total_seqs = []
        total_loss = []
        for batch in dataloader:
            optimizer.zero_grad()
            
            history_sequences, target_sequences = batch['train_sequences'],batch['target_sequences']

            inputs = history_sequences['sequence']
            outs = model(inputs,mc_dropout=True)

            loss = criterion(outs, target_sequences['sequence'])
            
            # zero-out positions of the loss corresponding to padded inputs
            # if a sequence has all zeros it is considered to be a padding.
            # Comment: safer way to do this would be a solution using the lengths...
            sequences,lengths = pad_packed_sequence(inputs,batch_first=True)
            mask = ~sequences.any(dim=2).unsqueeze(2).repeat(1,1,sequences.shape[-1])
            
            loss.masked_fill_(mask, 0)
        
            loss = loss.sum() / (lengths.sum()*sequences.shape[-1])
            
            loss.backward()
            optimizer.step()

            total_loss.append(loss.item())
            
        epoch_train_loss = np.mean(total_loss)
        
        #lr_scheduler.step(epoch_train_loss)
        
        print("Epoch {:2d}, lr: {:.4f}, loss: {:.4f}, T: {:.4f}"
              .format(e,
                      optimizer.param_groups[0]['lr'],
                      epoch_train_loss,
                      model.T.item()
                      ))

In [7]:
train_temperature_global_T(model,val_dataloader)

Epoch  0, lr: 0.1000, loss: 0.1113, T: 1.0523
Epoch  1, lr: 0.1000, loss: 0.1112, T: 1.0937
Epoch  2, lr: 0.1000, loss: 0.1114, T: 1.0761
Epoch  3, lr: 0.1000, loss: 0.1112, T: 1.0811
Epoch  4, lr: 0.1000, loss: 0.1112, T: 1.0729


KeyboardInterrupt: 

In [None]:
def train_temperature(model,val_dataloader):
    
    T = torch.Varia
    

for batch in val_dataloader:
    inputs = batch['train_sequences']['sequence']
    model(inputs,mc_dropout=True).shape
    break

## Wrapper temperature scaling

In [5]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence

In [78]:
criterion = torch.nn.BCEWithLogitsLoss(reduction='none')
sequences = None
total_loss = total_n = 0
with torch.no_grad():
    for batch in val_dataloader:

        inputs = batch['train_sequences']['sequence']

        logits = model(inputs)

        loss = criterion(logits, batch['target_sequences']['sequence'])

        # zero-out positions of the loss corresponding to padded inputs
        # if a sequence has all zeros it is considered to be a padding.
        # Comment: safer way to do this would be a solution using the lengths...
        sequences,lengths = pad_packed_sequence(inputs,batch_first=True)
        mask = ~sequences.any(dim=2).unsqueeze(2).repeat(1,1,sequences.shape[-1])
        _ = loss.masked_fill_(mask, 0);

        total_loss += loss.sum() 
        total_n += lengths.sum()*sequences.shape[-1]
        
        
        relevant_positions = [[i+idx*max(lengths) for i in range(e)] for idx,e in enumerate(lengths)]
        relevant_positions = [item for sublist in relevant_positions for item in sublist]
        
        logits_flattened = logits.view(1,-1,logits.size()[2]).squeeze(0)
            
        relevant_logits = logits_flattened[relevant_positions,:]
        relevant_sigmoids = torch.sigmoid(relevant_logits).detach().numpy().squeeze()
        break
        
        
        

bce_loss = (total_loss/total_n).item()
bce_loss


0.10571006685495377

In [79]:
logits_flattened.shape

torch.Size([2112, 272])

In [66]:
logits[2,0,:5]

tensor([-5.8265, -2.2606, -3.2920, -2.2174, -2.3171])

In [25]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence

In [58]:
logits[0,1,:5]

tensor([-4.1894, -4.0555, -4.1948, -4.1926, -4.1396])

In [55]:
logits_flattened[0,0,:5]

tensor([-6.2151, -2.1821, -2.6066, -3.3682, -8.2770])

In [56]:
relevant_logits[]

tensor([[[-6.2151, -2.1821, -2.6066,  ..., -5.6378, -5.9094, -4.4652],
         [-8.5842, -3.0011, -2.6911,  ..., -7.6248, -5.6858, -4.9581],
         [-6.3826, -2.2121, -3.3051,  ..., -5.5728, -4.4747, -3.1380],
         ...,
         [-7.7056, -3.8565, -3.5821,  ..., -6.5852, -6.4756, -4.4986],
         [-6.9114, -1.7634, -1.7422,  ..., -7.0475, -6.1005, -3.3008],
         [-3.6556, -2.4959, -2.9706,  ..., -8.0930, -6.3574, -4.4456]]])

In [49]:
relevant_logits

tensor([[[-6.2151, -2.1821, -2.6066,  ..., -5.6378, -5.9094, -4.4652],
         [-8.5842, -3.0011, -2.6911,  ..., -7.6248, -5.6858, -4.9581],
         [-6.3826, -2.2121, -3.3051,  ..., -5.5728, -4.4747, -3.1380],
         ...,
         [-7.7056, -3.8565, -3.5821,  ..., -6.5852, -6.4756, -4.4986],
         [-6.9114, -1.7634, -1.7422,  ..., -7.0475, -6.1005, -3.3008],
         [-3.6556, -2.4959, -2.9706,  ..., -8.0930, -6.3574, -4.4456]]])

In [48]:
ya

tensor([[-4.1440, -2.1221, -2.8844,  ..., -7.5836, -4.8592, -3.2502],
        [-5.1509, -0.0974, -1.2883,  ..., -6.9359, -5.3083, -3.4558],
        [ 0.6767, -2.0198, -2.8582,  ..., -7.3179, -5.9209, -3.4635],
        ...,
        [-3.6152, -1.8538, -2.3008,  ..., -9.1104, -5.1172, -3.0493],
        [-3.1273, -1.4662, -1.9435,  ..., -9.0815, -5.3970, -2.5897],
        [-3.4668, -1.6321, -1.9710,  ..., -9.1904, -5.4358, -2.5226]])

In [37]:
ya = pack_padded_sequence(logits,lengths=lengths,batch_first=True,enforce_sorted=False).data

In [108]:
class ModelWithTemperature(nn.Module):
    """
    A thin decorator, which wraps a model with temperature scaling
    model (nn.Module):
        A classification neural network
        NB: Output of the neural network should be the classification logits,
            NOT the softmax (or log softmax)!
    """
    def __init__(self, model):
        super(ModelWithTemperature, self).__init__()
        self.model = model
        self.temperature = nn.Parameter(torch.ones(1))

    def forward(self, input):
        logits = self.model(input)
        return self.temperature_scale(logits)

    def temperature_scale(self, logits):
        """
        Perform temperature scaling on logits
        """
        # Expand temperature to match the size of logits
        temperature = self.temperature.unsqueeze(1).expand(logits.size(0), logits.size(1))
        return logits / temperature

    # This function probably should live outside of this class, but whatever
    def set_temperature(self, valid_loader):
        """
        Tune the tempearature of the model (using the validation set).
        We're going to set it to optimize NLL.
        valid_loader (DataLoader): validation set loader
        """

        bce_criterion = nn.BCEWithLogitsLoss(reduction='none')
        ece_criterion = _ECELoss()

        # First: collect all the logits and labels for the validation set
        before_temperature_bce = list()
        with torch.no_grad():
            for batch in valid_loader:
                
                history_sequences, target_sequences = batch['train_sequences'],batch['target_sequences']
                
                logits = self.model(history_sequences['sequence'])
                return history_sequenceslogits,target_sequences['sequence']
                
                before_temperature_bce.append(bce_criterion(logits, target_sequences['sequence']))
                return before_temperature_bce
        # Calculate NLL and ECE before temperature scaling
                before_temperature_nll = nll_criterion(logits, labels).item()
        before_temperature_ece = ece_criterion(logits, labels).item()
        print('Before temperature - NLL: %.3f, ECE: %.3f' % (before_temperature_nll, before_temperature_ece))

        # Next: optimize the temperature w.r.t. NLL
        optimizer = optim.LBFGS([self.temperature], lr=0.01, max_iter=50)

        def eval():
            optimizer.zero_grad()
            loss = nll_criterion(self.temperature_scale(logits), labels)
            loss.backward()
            return loss
        optimizer.step(eval)

        # Calculate NLL and ECE after temperature scaling
        after_temperature_nll = nll_criterion(self.temperature_scale(logits), labels).item()
        after_temperature_ece = ece_criterion(self.temperature_scale(logits), labels).item()
        print('Optimal temperature: %.3f' % self.temperature.item())
        print('After temperature - NLL: %.3f, ECE: %.3f' % (after_temperature_nll, after_temperature_ece))

        return self

In [139]:
criterion = nn.BCEWithLogitsLoss(reduction='none')
with torch.no_grad():
    for batch in val_dataloader:
        inputs = batch['train_sequences']['sequence']
        targets = batch['target_sequences']['sequence']
        targets[0,15,:].sum()
        out = model(inputs)
        out.shape
        loss = criterion(out,targets)
        loss.shape
        break

tensor(0.)

torch.Size([64, 33, 272])

torch.Size([64, 33, 272])

In [213]:
lengths

tensor([ 1,  1,  1,  1,  1, 33,  3,  1,  1,  1,  4,  1,  1,  1,  2,  1,  1,  1,
         1,  1,  1,  1,  5,  3,  1,  2,  1,  2,  3,  1,  1,  3,  1,  1,  1,  1,
         1,  1,  1,  1,  2,  1,  2,  2,  1,  1,  1,  1,  4,  1,  3,  1,  1,  1,
         2,  1,  1,  2,  6,  1,  1,  1,  1,  1])

In [224]:
sequences.any(dim=2).unsqueeze(2).repeat(1,1,272)

tensor([[[ True,  True,  True,  ...,  True,  True,  True],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[ True,  True,  True,  ...,  True,  True,  True],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[ True,  True,  True,  ...,  True,  True,  True],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [

In [157]:
sequences,lengths = pad_packed_sequence(inputs,batch_first=True)

In [208]:
mask = ~sequences.any(dim=2).unsqueeze(2).repeat(1,1,272) # True when value corresponds to a padding input
sequences.masked_fill(mask, 0)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [175]:
torch.zeros((len(lengths),lengths.max()))

torch.Size([64, 33])

In [None]:
mask = torch.ones()

In [171]:
lengths

tensor([ 1,  1,  1,  1,  1, 33,  3,  1,  1,  1,  4,  1,  1,  1,  2,  1,  1,  1,
         1,  1,  1,  1,  5,  3,  1,  2,  1,  2,  3,  1,  1,  3,  1,  1,  1,  1,
         1,  1,  1,  1,  2,  1,  2,  2,  1,  1,  1,  1,  4,  1,  3,  1,  1,  1,
         2,  1,  1,  2,  6,  1,  1,  1,  1,  1])

In [170]:
batch['train_sequences'].keys()
batch['train_sequences']['original']

dict_keys(['sequence', 'original', 'pids'])

[[[121,
   153,
   99,
   100,
   129,
   101,
   4,
   106,
   108,
   49,
   29,
   113,
   53,
   54,
   249,
   59,
   157]],
 [[96, 98, 164, 101, 53, 94]],
 [[38, 238, 52, 153, 155, 60, 253, 62, 63]],
 [[96, 98, 163, 7, 106, 205, 117, 55]],
 [[129, 2, 130, 97, 259, 106, 81, 55, 151, 59, 157]],
 [[161, 99, 102, 238, 210, 117, 2616, 153, 156, 157],
  [97, 99, 210, 83, 59],
  [259, 99, 210, 90, 156, 158],
  [99, 210, 90, 59, 156, 158],
  [97, 2, 99, 4, 257, 200, 91, 237, 238, 210, 90, 59, 158],
  [2, 99, 3, 259, 237, 210, 90, 59, 156, 158],
  [2, 99, 3, 109, 210, 83, 156, 158],
  [97, 99, 259, 91, 210, 62, 59, 156, 158],
  [97, 258, 99, 238, 251, 2617, 210, 51, 118, 55, 62, 121, 250, 59, 156, 158],
  [257, 99, 259, 58, 167, 210, 55, 118, 95, 62, 90, 59, 156, 158, 159],
  [97,
   161,
   99,
   58,
   158,
   210,
   244,
   117,
   118,
   2620,
   55,
   2617,
   250,
   59,
   156,
   62,
   159],
  [97, 99, 259, 205, 210, 51, 118, 55, 62, 59, 156, 158, 159],
  [97, 99, 58, 205, 23

In [160]:
sequences.masked_fill_(mask, value)

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0.,
        0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [146]:
lengths

tensor([ 1,  1,  1,  1,  1, 33,  3,  1,  1,  1,  4,  1,  1,  1,  2,  1,  1,  1,
         1,  1,  1,  1,  5,  3,  1,  2,  1,  2,  3,  1,  1,  3,  1,  1,  1,  1,
         1,  1,  1,  1,  2,  1,  2,  2,  1,  1,  1,  1,  4,  1,  3,  1,  1,  1,
         2,  1,  1,  2,  6,  1,  1,  1,  1,  1])

In [156]:
pad_packed_sequence(inputs,batch_first=True)

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         ...,
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 

In [144]:
loss[0,1,:5]
loss[0,2,:5]

tensor([0.0150, 0.0172, 0.0150, 0.0150, 0.0158])

tensor([0.0150, 0.0172, 0.0150, 0.0150, 0.0158])

In [109]:
model_w_temperature = ModelWithTemperature(model)

In [110]:
res1,res2 = model_w_temperature.set_temperature(val_dataloader)

torch.Size([64, 33, 272])

In [104]:
res[0][6,30,:]

tensor([0.0150, 0.0172, 0.0150, 0.0150, 0.0158, 0.0161, 0.0157, 0.0142, 0.0152,
        0.0158, 0.0158, 0.0153, 0.0157, 0.0145, 0.0152, 0.0143, 0.0147, 0.0154,
        0.0145, 0.0153, 0.0159, 0.0155, 0.0146, 0.0146, 0.0143, 0.0149, 0.0143,
        0.0141, 0.0153, 0.0148, 0.0142, 0.0163, 0.0152, 0.0145, 0.0153, 0.0159,
        0.0151, 0.0142, 0.0156, 0.0146, 0.0147, 0.0157, 0.0141, 0.0159, 0.0149,
        0.0141, 0.0149, 0.0162, 0.0157, 0.0161, 0.0160, 0.0162, 0.0156, 0.0142,
        0.0165, 0.0156, 0.0161, 0.0172, 0.0157, 0.0145, 0.0153, 0.0157, 0.0145,
        0.0160, 0.0150, 0.0161, 0.0144, 0.0150, 0.0150, 0.0144, 0.0146, 0.0153,
        0.0147, 0.0144, 0.0147, 0.0153, 0.0146, 0.0143, 0.0161, 0.0142, 0.0145,
        0.0153, 0.0170, 0.0165, 0.0147, 0.0158, 0.0162, 0.0158, 0.0153, 0.0158,
        0.0149, 0.0154, 0.0146, 0.0166, 0.0148, 0.0162, 0.0150, 0.0161, 0.0146,
        0.0153, 0.0154, 0.0149, 0.0161, 0.0152, 0.0153, 0.0148, 0.0145, 0.0145,
        0.0141, 0.0156, 0.0147, 0.0148, 

In [72]:
a = iter(val_dataloader)
batch1 = next(a)
batch2 = next(a)

In [81]:
batch1['train_sequences']['sequence']

PackedSequence(data=tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), batch_sizes=tensor([64, 18, 10,  5,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]), sorted_indices=tensor([ 5, 58, 22, 48, 10, 23,  6, 50, 31, 28, 43, 25, 27, 14, 54, 57, 42, 40,
        44, 38, 45, 41, 39, 32, 46, 47, 49, 51, 52, 53, 55, 56, 59, 60, 61, 62,
        63, 18,  1,  2,  3,  4,  7,  8,  9, 11, 12, 13, 15, 16, 17, 37, 19, 20,
        21, 24, 26, 29, 30,  0, 33, 34, 35, 36]), unsorted_indices=tensor([59, 38, 39, 40, 41,  0,  6, 42, 43, 44,  4, 45, 46, 47, 13, 48, 49, 50,
        37, 52, 53, 54,  2,  5, 55, 11, 56, 12,  9, 57, 58,  8, 23, 60, 61, 62,
        63, 51, 19, 22, 17, 21, 16, 10, 18, 20, 24, 25,  3, 26,  7, 27, 28, 29,
 

In [12]:
e.keys()

dict_keys(['train_sequences', 'target_sequences', 'train_pids', 'target_pids'])

In [14]:
e['train_sequences'].keys()

dict_keys(['sequence', 'original', 'pids'])

In [45]:
e['train_sequences']['pids'][5]

'109'

In [33]:
logits_list[0][0][:,0]

tensor([-5.8721, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894,
        -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894,
        -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894,
        -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894, -4.1894,
        -4.1894])

In [26]:
print(logits_list[0].shape)
print(logits_list[1].shape)
print(logits_list[2].shape)

torch.Size([64, 33, 272])
torch.Size([64, 11, 272])
torch.Size([64, 7, 272])


In [21]:
logits_list = list()
with torch.no_grad():
    for batch in val_dataloader:
        print('hi')
        input = batch['train_sequences']['sequence']
        logits = model(input)#logits = self.model(input)
        logits_list.append(logits)
        #labels_list.append(label)
    print('im out')
    logits = torch.cat(logits_list)
    #labels = torch.cat(labels_list)

hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
im out


RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 33 but got size 11 for tensor number 1 in the list.

In [8]:
for e in val_dataloader:
    print(e)
    break

{'train_sequences': {'sequence': PackedSequence(data=tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 1.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), batch_sizes=tensor([64, 18, 10,  5,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]), sorted_indices=tensor([ 5, 58, 22, 48, 10, 23,  6, 50, 31, 28, 43, 25, 27, 14, 54, 57, 42, 40,
        44, 38, 45, 41, 39, 32, 46, 47, 49, 51, 52, 53, 55, 56, 59, 60, 61, 62,
        63, 18,  1,  2,  3,  4,  7,  8,  9, 11, 12, 13, 15, 16, 17, 37, 19, 20,
        21, 24, 26, 29, 30,  0, 33, 34, 35, 36]), unsorted_indices=tensor([59, 38, 39, 40, 41,  0,  6, 42, 43, 44,  4, 45, 46, 47, 13, 48, 49, 50,
        37, 52, 53, 54,  2,  5, 55, 11, 56, 12,  9, 57, 58,  8, 23, 60, 61, 62,
        63, 51, 19, 22, 17, 21, 16, 10, 18, 20, 