We will need to import some helper code, so we need to run this

In [1]:
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [14]:
import torch
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  #  torch.device('cpu') # 
from collections import OrderedDict
import numpy as np
from torch import autograd

# Encoders

For us an encoder takes a sequence of input vectors $\mathbf s_1^n$, each $I$-dimensional, and produces a sequence of output vectors $\mathbf t_1^n$, each $O$-dimensional and a summary vector $\mathbf h \in \mathbb R^O$:

\begin{equation}
    \mathbf t_1^n, \mathbf h = \text{encoder}(\mathbf s_1^n)
\end{equation}

In practice for a correct batched implementation, our encoders also take a mask matrix and a vector of lengths.

In [3]:
class Encoder(nn.Module):
    """
    For you to focus on DGMs and abstract away from certain architecture details, 
     we will be providing some helper classes.
     
    An encoder is one of them.
    """
    
    
    def __init__(self):
        super(Encoder, self).__init__()
        
    def forward(self, inputs, mask, lengths):
        """
        The inputs are batch-first tensors
        
        :param inputs: [B, T, d]
        :param mask: [B, T]
        :param lengths: [B]
        :returns: [B, T, d], [B, d]
            where the first tensor is the transformed input
            and the second tensor is a summary of all inputs
        """
        pass
        

In [4]:
class Passthrough(Encoder):
    """
    This encoder does not do anything, it simply passes the input forward and summarises 
        them via a sum.
    """
    
    def __init__(self):
        super(Passthrough, self).__init__()
        
    def forward(self, inputs, mask, lengths, **kwargs):
        # inputs: [B, T, d]
        # mask: [B, T]
        # lengths: [B]
        
        # [B, T, d], [B, d]
        return inputs, (inputs * mask.unsqueeze(-1).float()).sum(dim=1) 

    
class FFEncoder(Encoder):
    """
    A typical feed-forward NN with tanh hidden activations.
    """
    
    def __init__(self, input_size, output_size, 
                 activation=None, 
                 hidden_sizes=[], 
                 aggregator='sum',
                 dropout=0.5):
        """
        :param input_size: int
        :param output_size: int
        :param hidden_sizes: list of integers (dimensionality of hidden layers)
        :param aggregator: 'sum' or 'avg'
        :param dropout: dropout rate
        """
        super(FFEncoder, self).__init__()
        layers = []
        if hidden_sizes:                    
            for i, size in enumerate(hidden_sizes):
                layers.append(('dropout%d' % i, nn.Dropout(p=dropout)))
                layers.append(('linear%d' % i, nn.Linear(input_size, size)))
                layers.append(('tanh%d' % i, nn.Tanh()))
                input_size = size
        layers.append(('dropout', nn.Dropout(p=dropout)))
        layers.append(('linear', nn.Linear(input_size, output_size)))       
        self.layer = nn.Sequential(OrderedDict(layers))     
        self.activation = activation
        if not aggregator in ['sum', 'avg']:
            raise ValueError("I can only aggregate outputs using 'sum' or 'avg'")
        self.aggregator = aggregator
        
    def forward(self, x, mask, lengths):
        # [B, T, d]
        y = self.layer(x)
        if not self.activation is None:
            y = self.activation(y)
        # [B, d]
        s = (y * mask.unsqueeze(-1).float()).sum(dim=1)
        if self.aggregator == 'avg':
            s /= lengths.unsqueeze(-1).float()
        return y, s


from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTMEncoder(Encoder):
    """
    This module encodes a sequence into a single vector using an LSTM,
     it also returns the hidden states at each time step.
    """

    def __init__(self, in_features, hidden_size: int = 200,
                 batch_first: bool = True,
                 bidirectional: bool = True):
        """
        :param in_features:
        :param hidden_size:
        :param batch_first:
        :param bidirectional:
        """
        super(LSTMEncoder, self).__init__()
        self.lstm = nn.LSTM(in_features, hidden_size, batch_first=batch_first,
                            bidirectional=bidirectional)

    def forward(self, x, mask, lengths):
        """
        Encode sentence x
        :param x: sequence of word embeddings, shape [B, T, E]
        :param mask: byte mask that is 0 for invalid positions, shape [B, T]
        :param lengths: the lengths of each input sequence [B]
        :return:
        """

        packed_sequence = pack_padded_sequence(x, lengths, batch_first=True)
        outputs, (hx, cx) = self.lstm(packed_sequence)
        outputs, _ = pad_packed_sequence(outputs, batch_first=True)

        # classify from concatenation of final states
        if self.lstm.bidirectional:
            final = torch.cat([hx[-2], hx[-1]], dim=-1)
        else:  # classify from final state
            final = hx[-1]

        return outputs, final
    
    
def get_encoder(layer, in_features, hidden_size, bidirectional=True):
    """Returns the requested layer."""

    # TODO: make pass and average layers
    if layer == "pass":
        return Passthrough()
    elif layer == 'ff':
        return FFEncoder(in_features, 2 * hidden_size, hidden_sizes=[hidden_size], aggregator='sum')
    elif layer == "lstm":
        return LSTMEncoder(in_features, hidden_size,
                           bidirectional=bidirectional)
    else:
        raise ValueError("Unknown layer")

# Prior


Our prior is a Bernoulli with fixed parameter $0 < p_1 < 1$:

\begin{align}
Z_i & \sim \text{Bern}(p_1)
\end{align}

As we will be using Bernoulli priors and posteriors, it is a good idea to implement a Bernoulli class:

In [5]:
class Bernoulli:
    """
    This class encapsulates a collection of Bernoulli distributions. 
    Each Bernoulli is uniquely specified by p_1, where
        Bernoulli(X=x|p_1) = pow(p_1, x) + pow(1 - p_1, 1 - x)
    is the Bernoulli probability mass function (pmf).    
    """
    
    def __init__(self, logits=None, probs=None):
        """
        We can specify a Bernoulli distribution via a logit or a probability. 
         You need to specify at least one, and if you specify both, beware that
         in this implementation logits will be used.
         
        Recall that: probs = sigmoid(logits).
         
        :param logits: a tensor of logits (a logit is defined as log (p_1/p_0))
            where p_0 = 1 - p_1
        :param probs: a tensor of probabilities, each in (0, 1)
        
        """
        if probs is None and logits is None:
            raise ValueError('I need probabilities or logits')        
        if logits is None:
            self.probs = probs
        else:
            self.probs = torch.sigmoid(logits)
    
    def sample(self):
        """Returns a sample with the same shape as the parameters"""
        return torch.bernoulli(self.probs)
    
    def log_prob(self, x):
        """
        Assess the log probability of a sample. 
        :param x: either a single sample (0 or 1) or a tensor of samples with the same shape as the parameters.
        :returns: tensor with log probabilities with the same shape as parameters
            (if the input is a single sample we broadcast it to the shape of the parameters)
        """
        return x * torch.log(self.probs) + (1 - x) * torch.log(1. - self.probs)
    
    def kl(self, other: 'Bernoulli'):
        """
        Compute the KL divergence between two Bernoulli distributions (from self to other).
        
        :return: KL[self||other] with same shape parameters
        """
        p1 = self.probs
        p0 = 1. - self.probs
        q1 = other.probs
        q0 = 1. - other.probs        
        return p1 * (torch.log(p1) - torch.log(q1)) + p0 * (torch.log(p0) - torch.log(q0))

# Inference model


\begin{align}
Q(z|x) 
    &= \prod_{i=1}^{|x|} Q(z_i|x; \lambda) \\
    &= \prod_{i=1}^{|x|} \text{Bern}(z_i|g_i(x; \lambda)) 
\end{align}

where $g(x; \lambda)$ is a NN that maps from $x$ to $|x|$ Bernoulli parameters, each of which, is a probability value (thus $0 < g_i(x; \lambda) < 1$).

Note that though we could condition on $y$ for approximate posterior inference, we are opportunistically leaving it out. This way, $Q$ is directly available at test time for making predictions.

Here is an example design for $g$:
\begin{align}
\mathbf x_i &= \text{glove}(x_i) \\
\mathbf t_1^n, \mathbf h &= \text{encoder}(\mathbf x_1^n; \lambda_{\text{enc}}) \\
g_i(x; \lambda) &= \sigma(\text{dense}_1(\mathbf t_i; \lambda_{\text{output}}))
\end{align}
where
* $\text{glove}$ is a pre-trained embedding function
* $\text{dense}_1$ is a dense layer with a single output
* and $\sigma(\cdot)$ is the sigmoid function

Here we implement this product of Bernoulli distributions:

In [6]:
class ProductOfBernoullis(nn.Module):
    """
    This is an inference network that parameterises independent Bernoulli distributions.
    """

    def __init__(self,
                 embed:       nn.Embedding,
                 hidden_size: int = 200,
                 dropout:     float = 0.1,
                 layer:       str = "lstm"
                 ):

        super(ProductOfBernoullis, self).__init__()

        emb_size = embed.weight.shape[1]
        enc_size = hidden_size * 2

        self.embed_layer = nn.Sequential(embed, nn.Dropout(p=dropout))
        self.enc_layer = get_encoder(layer, emb_size, hidden_size)
        self.logit_layer = nn.Linear(enc_size, 1, bias=True)
        
        self.report_params()

    def report_params(self):
        count = 0
        for name, p in self.named_parameters():
            if p.requires_grad and "embed" not in name:
                count += np.prod(list(p.shape))
        print("{} #params: {}".format(self.__class__.__name__, count))

    def forward(self, x, mask) -> Bernoulli:
        """
        It takes a tensor of tokens (integers)
         and predicts a Bernoulli distribution for each position.
        
        :param x: [B, T]
        :param mask: [B, T]
        :returns: Bernoulli
        """

        # encode sentence
        # [B]
        lengths = mask.long().sum(1)
        # [B, T, E]
        emb = self.embed_layer(x)  
        # [B, T, d]
        h, _ = self.enc_layer(emb, mask, lengths)

        # compute parameters for Bernoulli p(z|x)
        # [B, T, 1] Bernoulli distributions
        logits = self.logit_layer(h)
        # [B, T]
        logits = logits.squeeze(-1)
        return Bernoulli(logits=logits)

# Classifier

The classifier encodes only a selection of the input, which we denote $x \odot z$, and parameterises a Categorical distribution over $5$ outcomes (sentiment levels):

\begin{align}
    Z_i & \sim \text{Bern}(p_1) \\
    Y|z,x &\sim \text{Cat}(f(x \odot z; \theta))
\end{align}

Here is an example design for $f$:

\begin{align}
\mathbf x_i &= z_i \, \text{glove}(x_i) \\
\mathbf t_1^n, \mathbf h &= \text{encoder}(\mathbf x_1^n; \theta_{\text{enc}}) \\
f(x \odot z; \theta) &= \text{softmax}(\text{dense}_5(\mathbf h; \theta_{\text{output}}))
\end{align}

where:
* $z_i$ either leaves $\mathbf x_i$ unchanged or turns it into a vector of zeros;
* the encoder only sees features from selected inputs, i.e. $x_i$ for which $z_i = 1$;
* $\text{dense}_5$ is a linear layer with $5$ outputs

In [7]:
class Categorical:
    
    def __init__(self, log_probs):
        # [B, K]: class probs
        self.log_probs = log_probs
        
    def log_prob(self, y):
        """
        :param y: [B] integers
        """
        return torch.gather(self.log_probs, 1, y.unsqueeze(-1))

In [8]:
class Classifier(nn.Module):
    """
    The Encoder takes an input text (and rationale z) and computes p(y|x,z)
    """

    def __init__(self,
                 embed:        nn.Embedding = None,
                 hidden_size:  int = 200,
                 output_size:  int = 1,
                 dropout:      float = 0.1,
                 layer:        str = "pass",
                 ):

        super(Classifier, self).__init__()

        emb_size = embed.weight.shape[1]
        enc_size = hidden_size * 2
        self.embed_layer = nn.Sequential(
            embed,
            nn.Dropout(p=dropout)
        )

        self.enc_layer = get_encoder(layer, emb_size, hidden_size)

        self.output_layer = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(enc_size, output_size),
            nn.LogSoftmax(dim=-1)
        )

        self.report_params()

    def report_params(self):
        count = 0
        for name, p in self.named_parameters():
            if p.requires_grad and "embed" not in name:
                count += np.prod(list(p.shape))
        print("{} #params: {}".format(self.__class__.__name__, count))

    def forward(self, x, mask, z=None) -> Categorical:

        rnn_mask = mask
        emb = self.embed_layer(x)

        # apply z to inputs
        if z is not None:
            # [B, T]
            rnn_mask = z > 0.
            # [B, T, 1]
            z_mask = z.unsqueeze(-1).float()
            # [B, T, E]
            emb = emb * z_mask

        lengths = mask.long().sum(1)

        # encode the sentence
        _, final = self.enc_layer(emb, rnn_mask, lengths)

        # predict sentiment from final state(s)
        log_probs = self.output_layer(final)        
        return Categorical(log_probs)

NVIL

In [9]:
from torch.nn.functional import softplus
#from discrete.util import get_z_stats


class RLModel(nn.Module):
    """
    Reimplementation of Lei et al. (2016). Rationalizing Neural Predictions
    for Stanford Sentiment.
    (Does classfication instead of regression.)

    Consists of:
    - Encoder that computes p(y | x, z)
    - Generator that computes p(z | x) independently or dependently with an RNN.
    """

    def __init__(self,
                 vocab:       object = None,
                 vocab_size:  int = 0,
                 emb_size:    int = 200,
                 hidden_size: int = 200,
                 output_size: int = 1,
                 prior_p1:    float = 0.1,
                 dropout:     float = 0.1,
                 layer_cls:   str = 'pass',
                 layer_inf:   str = 'lstm',
                 ):

        super(RLModel, self).__init__()

        self.vocab = vocab
        self.embed = embed = nn.Embedding(vocab_size, emb_size, padding_idx=1)

        # TODO: rename to obs_model
        self.cls_net = Classifier(
            embed=embed, hidden_size=hidden_size, output_size=output_size,
            dropout=dropout, layer=layer_cls)
        
        # TODO: rename to q_z
        self.inference_net = ProductOfBernoullis(
            embed=embed, hidden_size=hidden_size,
            dropout=dropout, layer=layer_inf)
        
        self.prior_p1 = prior_p1

    def predict(self, py, **kwargs):
        """
        Predict deterministically.
        :param x:
        :return: predictions, optional (dict with optional statistics)
        """
        assert not self.training, "should be in eval mode for prediction"
        return py.log_probs.argmax(-1)

    def forward(self, x):
        """
        Generate a sequence of zs with the Generator.
        Then predict with sentence x (zeroed out with z) using Encoder.

        :param x: [B, T] (that is, batch-major is assumed)
        :return:
        """
        mask = (x != 1)  # [B,T]

        qz = self.inference_net(x, mask)

        if self.training:  # sample
            # [B, T]
            z = qz.sample()
        else:  # deterministic
            # [B, T]
            # TODO: consider this
            z = (qz.probs >= 0.5).float()
            #z = q.sample()
            
        z = torch.where(mask, z, torch.zeros_like(z))
        
        py_xz = self.cls_net(x, mask, z)
        return py_xz, qz, z

    def get_loss(self, py, targets, 
                 q_z: Bernoulli, 
                 z, 
                 mask=None,
                 iter_i=0, 
                 kl_weight=1.0,
                 min_kl=0.0,
                 ll_mean=0.,
                 ll_std=1.,
                 **kwargs):
        """
        This computes the loss for the whole model.
        We stick to the variable names of the original code as much as
        possible.

        :param logits:
        :param targets:
        :param sparsity:
        :param coherent:
        :param mask:
        :param kwargs:
        :return:
        """
        assert mask is not None, "provide mask"

        lengths = mask.sum(1).float()
        batch_size = mask.size(0)
        terms = OrderedDict()

        # shape: [B]
        # log p(y|x,z) where z ~ q
        #one_hot_target = (targets.unsqueeze(-1) == torch.arange(5, device=device).reshape(1, 5)).float()            
        #ll = torch.sum(py.log_probs * one_hot_target, dim=-1)
        # [B]
        ll = py.log_prob(targets)
        
        # KL(q||p)
        # [B, T]
        #p_z = Bernoulli(probs=torch.full_like(q_z.probs, self.prior_p1))
        prior_p1 = np.random.beta(0.6, 0.6)
        p_z = Bernoulli(probs=torch.full_like(q_z.probs, prior_p1))
        kl = q_z.kl(p_z)
        kl = torch.where(mask, kl, torch.zeros_like(kl))
                
        # Compute the log density of the sample
        # [B, T]
        log_q_z = q_z.log_prob(z)
        log_q_z = torch.where(mask, log_q_z, torch.zeros_like(log_q_z))
        # We have independent Bernoullis, thus we just sum their log probabilities
        # [B]
        log_q_z = log_q_z.sum(1)
        
        # surrogate objective for score function estimator
        # [B]
        reward = (ll.detach() - torch.full_like(ll, ll_mean)) / torch.full_like(ll, ll_std)
        sf_surrogate = (reward * log_q_z)

        # Make terms in the ELBO
        # []
        ll = ll.mean()
        sf_surrogate = sf_surrogate.mean()
        # KL may require annealing and free-bits
        # [B]
        kl = kl.sum(dim=-1)
        kl_fb = torch.max(torch.full_like(kl, min_kl), kl)
        # []
        kl = kl.mean() 
        kl_fb = kl_fb.mean() 
        kl_fb = kl_fb * kl_weight
        
        terms['elbo'] = (ll - kl_fb).item()
        terms['ll'] = ll.item()
        terms['kl_fb'] = kl_fb.item()
        terms['kl'] = kl.item()
        terms['kl_weight'] = kl_weight
        terms['sf'] = sf_surrogate.item()
        terms['reward'] = reward.mean().item()
        terms['ll_mean'] = ll_mean
        terms['ll_std'] = ll_std
        terms['selected'] = (z.sum(1) / lengths).mean().item()
        terms['prior_p1'] = prior_p1
        terms['avg_p1'] = (torch.where(mask, q_z.probs, torch.zeros_like(q_z.probs)).sum() / mask.sum().float()).item()
            
        return - ll - sf_surrogate + kl_fb, terms

In [10]:
from collections import deque

class MovingStats:
    
    def __init__(self, memory=-1):
        self.data = deque([])
        self.memory = memory
        
    def append(self, value):
        if self.memory != 0:
            if self.memory > 0 and len(self.data) == self.memory:
                self.data.popleft()
            self.data.append(value)
        
    def mean(self):
        if len(self.data):
            return np.mean([x for x in self.data])
        else:
            return 0.
    
    def std(self):
        return 1.  # np.std(self.data) if len(self.data) > 1 else 1.
            

Training loop

In [11]:
from discrete.util import make_kv_string, get_minibatch, prepare_minibatch, print_parameters

In [15]:
from discrete.sstutil import examplereader, Vocabulary, load_glove
from collections import OrderedDict
import torch.optim
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
import time
from discrete.evaluate import evaluate
#from tensorboardX import SummaryWriter

cfg = dict()

cfg['num_iterations'] = -20  # use negative for epochs and positive for iterations
cfg['print_every'] = 100
cfg['eval_every'] = -1
cfg['batch_size'] = 25
cfg['eval_batch_size'] = 25
cfg['subphrases'] = False
cfg['min_phrase_length'] = 2
cfg['lowercase'] = True
cfg['word_vectors'] = 'data/sst/glove.840B.300d.filtered.txt'
cfg['fix_emb'] = True
cfg['embed_size'] = 300
cfg['hidden_size'] = 150
cfg['num_layers'] = 1
cfg['dropout'] = 0.5
cfg['layer_inf'] = 'pass'
cfg['layer_cls'] = 'pass'
cfg['save_path'] = 'data/results'
cfg['baseline_memory'] = 1000
cfg['prior_p1'] = 0.3
cfg['min_kl'] = 0.
cfg['kl_weight'] = 0.
cfg['kl_inc'] = 0.00001
# Optimisation options: leave as is
cfg['lr'] = 0.0002
cfg['weight_decay'] = 1e-5
cfg['lr_decay'] = 0.5
cfg['patience'] = 5
cfg['cooldown'] = 5
cfg['threshold'] = 1e-4
cfg['min_lr'] = 1e-5
cfg['max_grad_norm'] = 5.


print('# Configuration')
for k, v in cfg.items():
    print("{:20} : {:10}".format(k, v))
    
# Let's load the data into memory.
print("Loading data")
train_data = list(examplereader(
    "data/sst/train.txt",
    lower=cfg['lowercase'], 
    subphrases=cfg['subphrases'],
    min_length=cfg['min_phrase_length']))
dev_data = list(examplereader("data/sst/dev.txt", lower=cfg['lowercase']))
test_data = list(examplereader("data/sst/test.txt", lower=cfg['lowercase']))

print("train", len(train_data))
print("dev", len(dev_data))
print("test", len(test_data))

iters_per_epoch = len(train_data) // cfg["batch_size"]

if cfg["eval_every"] == -1:
    eval_every = iters_per_epoch
    print("Set eval_every to {}".format(iters_per_epoch))

if cfg["num_iterations"] < 0:
    num_iterations = iters_per_epoch * -1 * cfg["num_iterations"]
    print("Set num_iterations to {}".format(num_iterations))

print('\n# Example')
example = dev_data[0]
print("First dev example:", example)
print("First dev example tokens:", example.tokens)
print("First dev example label:", example.label)


def train():
    vocab = Vocabulary()  # populated by load_glove
    glove_path = cfg["word_vectors"]
    vectors = load_glove(glove_path, vocab)

    #writer = SummaryWriter(log_dir=cfg["save_path"])

    # Map the sentiment labels 0-4 to a more readable form (and the opposite)
    i2t = ["very negative", "negative", "neutral", "positive", "very positive"]
    t2i = OrderedDict({p: i for p, i in zip(i2t, range(len(i2t)))})


    print('\n# Constructing model')
    model = RLModel(
        vocab_size=len(vocab.w2i), 
        emb_size=cfg["embed_size"],
        hidden_size=cfg["hidden_size"], 
        output_size=len(t2i),
        prior_p1=cfg['prior_p1'],
        vocab=vocab, 
        dropout=cfg["dropout"], 
        layer_cls=cfg["layer_cls"],
        layer_inf=cfg["layer_inf"])

    print('\n# Loading embeddings')
    with torch.no_grad():
        model.embed.weight.data.copy_(torch.from_numpy(vectors))
        if cfg["fix_emb"]:
            print("fixed word embeddings")
            model.embed.weight.requires_grad = False
        model.embed.weight[1] = 0.  # padding zero


    optimizer = Adam(model.parameters(), lr=cfg["lr"],
                     weight_decay=cfg["weight_decay"])

    # lagrange optimizer (if there are lagrange lambdas to optimize)
    #if len(model.lagrange_parameters()) > 0:
    #    lagrange_optimizer = Adam(model.lagrange_parameters(), lr=cfg["lr"])
    #else:
    #    lagrange_optimizer = None

    scheduler = ReduceLROnPlateau(
        optimizer, mode="min", factor=cfg["lr_decay"], patience=cfg["patience"],
        verbose=True, cooldown=cfg["cooldown"], threshold=cfg["threshold"],
        min_lr=cfg["min_lr"])

    iter_i = 0
    train_loss = 0.
    print_num = 0
    start = time.time()
    losses = []
    accuracies = []
    best_eval = 1.0e9
    best_iter = 0

    model = model.to(device)

    # print model
    print(model)
    print_parameters(model)

    batch_size = cfg['batch_size']
    eval_batch_size = cfg['eval_batch_size']
    print_every = cfg['print_every']

    kl_inc = cfg['kl_inc']
    kl_weight = cfg['kl_weight']
    min_kl = cfg['min_kl']
    ll_moving_stats = MovingStats(cfg['baseline_memory'])

    while True:  # when we run out of examples, shuffle and continue
        for batch in get_minibatch(train_data, batch_size=batch_size, shuffle=True):

            epoch = iter_i // iters_per_epoch

            # forward pass
            model.train()
            x, targets, _ = prepare_minibatch(batch, model.vocab, device=device)

            # with autograd.detect_anomaly():

            py, q_z, z = model(x)

            mask = (x != 1)
            # "KL annealing"
            kl_weight += kl_inc
            if kl_weight > 1.:
                kl_weight = 1.0
                
                
            with autograd.detect_anomaly():
                loss, terms = model.get_loss(
                    py, 
                    targets, 
                    q_z=q_z,
                    z=z,
                    mask=mask, 
                    kl_weight=kl_weight,
                    min_kl=min_kl,
                    ll_mean=ll_moving_stats.mean(),
                    ll_std=ll_moving_stats.std(),
                    iter_i=iter_i)

            train_loss += loss.item()
            ll_moving_stats.append(terms['ll'])

            # backward pass
            model.zero_grad()  # erase previous gradients

            loss.backward()  # compute new gradients

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=cfg['max_grad_norm'])

            # update weights
            optimizer.step()

            print_num += 1
            iter_i += 1

            # print info
            if iter_i % print_every == 0:

                train_loss = train_loss / print_every
                #writer.add_scalar('data/train_loss', train_loss, iter_i)
                #for k, v in loss_optional.items():
                #    writer.add_scalar('data/'+k, v, iter_i)

                print_str = make_kv_string(terms)
                print("Epoch %r Iter %r loss=%.4f %s" %
                      (epoch, iter_i, train_loss, print_str))
                losses.append(train_loss)
                print_num = 0
                train_loss = 0.

            # evaluate
            if iter_i % eval_every == 0:

                dev_eval, rationales = evaluate(
                    model, dev_data, 
                    batch_size=eval_batch_size, 
                    device=device,
                    cfg=cfg, iter_i=iter_i)
                accuracies.append(dev_eval["acc"])

                #for k, v in dev_eval.items():
                #    writer.add_scalar('data/dev/'+k, v, iter_i)

                print("\n# epoch %r iter %r: dev %s" % (
                    epoch, iter_i, make_kv_string(dev_eval)))
                
                for exid in range(3):
                    print(' dev%d [gold=%d,pred=%d]:' % (exid, dev_data[exid].label, rationales[exid][1]),  
                          ' '.join(rationales[exid][0]))
                print()

                #test_eval = evaluate(
                #    model, test_data, batch_size=eval_batch_size, device=device,
                #    cfg=cfg, iter_i=iter_i)
                #for k, v in test_eval.items():
                #    writer.add_scalar('data/test/'+k, v, iter_i)

                #print("# epoch %r iter %r: tst %s" % (
                #    epoch, iter_i, make_kv_string(test_eval)))

                # adjust learning rate

                scheduler.step(dev_eval["loss"])

# Configuration
dropout              :        0.5
min_phrase_length    :          2
layer_cls            : pass      
max_grad_norm        :        5.0
min_lr               :      1e-05
cooldown             :          5
prior_p1             :        0.3
hidden_size          :        150
num_layers           :          1
baseline_memory      :       1000
lr                   :     0.0002
print_every          :        100
batch_size           :         25
eval_batch_size      :         25
word_vectors         : data/sst/glove.840B.300d.filtered.txt
lr_decay             :        0.5
min_kl               :        0.0
layer_inf            : pass      
num_iterations       :        -20
threshold            :     0.0001
kl_weight            :        0.0
subphrases           :          0
save_path            : data/results
weight_decay         :      1e-05
lowercase            :          1
embed_size           :        300
fix_emb              :          1
patience             :          5
eva

In [16]:
train()


# Constructing model
Classifier #params: 1505
ProductOfBernoullis #params: 301

# Loading embeddings
fixed word embeddings
RLModel(
  (embed): Embedding(20727, 300, padding_idx=1)
  (cls_net): Classifier(
    (embed_layer): Sequential(
      (0): Embedding(20727, 300, padding_idx=1)
      (1): Dropout(p=0.5)
    )
    (enc_layer): Passthrough()
    (output_layer): Sequential(
      (0): Dropout(p=0.5)
      (1): Linear(in_features=300, out_features=5, bias=True)
      (2): LogSoftmax()
    )
  )
  (inference_net): ProductOfBernoullis(
    (embed_layer): Sequential(
      (0): Embedding(20727, 300, padding_idx=1)
      (1): Dropout(p=0.5)
    )
    (enc_layer): Passthrough()
    (logit_layer): Linear(in_features=300, out_features=1, bias=True)
  )
)
embed.weight             [20727, 300] requires_grad=False
cls_net.output_layer.1.weight [5, 300]     requires_grad=True
cls_net.output_layer.1.bias [5]          requires_grad=True
inference_net.logit_layer.weight [1, 300]     requires_grad=

Epoch 6 Iter 2100 loss=2.0847 elbo -1.5017 ll -1.3368 kl_fb 0.1649 kl 7.8519 kl_weight 0.0210 sf -2.6152 reward 0.1998 ll_mean -1.5367 ll_std 1.0000 selected 0.5177 prior_p1 0.1376 avg_p1 0.5166
Epoch 6 Iter 2200 loss=2.1827 elbo -1.6340 ll -1.6161 kl_fb 0.0178 kl 0.8108 kl_weight 0.0220 sf 1.3173 reward -0.0877 ll_mean -1.5284 ll_std 1.0000 selected 0.5431 prior_p1 0.3864 avg_p1 0.5123
Epoch 6 Iter 2300 loss=2.0484 elbo -1.4581 ll -1.4240 kl_fb 0.0341 kl 1.4824 kl_weight 0.0230 sf -1.3011 reward 0.0993 ll_mean -1.5233 ll_std 1.0000 selected 0.4623 prior_p1 0.6959 avg_p1 0.5130

# epoch 6 iter 2387: dev loss -4.3817 elbo -13.6711 ll -1.4459 kl_fb 12.2252 kl 12.2252 kl_weight 1.0000 sf 18.0528 reward -1.4459 ll_mean 0.0000 ll_std 1.0000 selected 0.6400 prior_p1 0.5293 avg_p1 0.5117 acc 0.3669
 dev0 [gold=3,pred=1]: **it** **'s** **a** lovely **film** with lovely performances by buy and accorsi **.**
 dev1 [gold=2,pred=3]: **no** **one** goes **unindicted** here , **which** **is** probab

Epoch 12 Iter 4400 loss=2.0807 elbo -1.4444 ll -1.3191 kl_fb 0.1253 kl 2.8467 kl_weight 0.0440 sf -2.0775 reward 0.1589 ll_mean -1.4780 ll_std 1.0000 selected 0.4905 prior_p1 0.7648 avg_p1 0.5162

# epoch 12 iter 4433: dev loss -5.1760 elbo -12.2947 ll -1.4104 kl_fb 10.8843 kl 10.8843 kl_weight 1.0000 sf 17.4707 reward -1.4104 ll_mean 0.0000 ll_std 1.0000 selected 0.8283 prior_p1 0.5592 avg_p1 0.5219 acc 0.3815
 dev0 [gold=3,pred=1]: **it** **'s** **a** **lovely** **film** **with** **lovely** performances by **buy** **and** accorsi **.**
 dev1 [gold=2,pred=3]: **no** **one** **goes** **unindicted** here **,** **which** **is** **probably** **for** **the** **best** **.**
 dev2 [gold=3,pred=1]: **and** **if** you 're **not** nearly moved **to** tears by **a** couple **of** scenes **,** you 've **got** **ice** **water** **in** your veins **.**

Shuffling training data
Epoch 13 Iter 4500 loss=2.1096 elbo -2.0735 ll -1.3621 kl_fb 0.7115 kl 15.8104 kl_weight 0.0450 sf -1.3192 reward 0.1135 ll

Shuffling training data
Epoch 19 Iter 6500 loss=2.2537 elbo -3.7400 ll -1.5289 kl_fb 2.2111 kl 34.0168 kl_weight 0.0650 sf 0.7578 reward -0.0551 ll_mean -1.4738 ll_std 1.0000 selected 0.4726 prior_p1 0.9908 avg_p1 0.4892
Epoch 19 Iter 6600 loss=2.4892 elbo -1.9896 ll -1.3776 kl_fb 0.6120 kl 9.2722 kl_weight 0.0660 sf -1.2270 reward 0.0959 ll_mean -1.4735 ll_std 1.0000 selected 0.5122 prior_p1 0.8935 avg_p1 0.4928
Epoch 19 Iter 6700 loss=2.3402 elbo -2.9910 ll -1.5508 kl_fb 1.4403 kl 21.4968 kl_weight 0.0670 sf 0.9883 reward -0.0759 ll_mean -1.4749 ll_std 1.0000 selected 0.4866 prior_p1 0.0248 avg_p1 0.4907
Epoch 19 Iter 6800 loss=2.2215 elbo -2.0362 ll -1.5903 kl_fb 0.4459 kl 6.5571 kl_weight 0.0680 sf 1.5986 reward -0.1156 ll_mean -1.4748 ll_std 1.0000 selected 0.5129 prior_p1 0.1490 avg_p1 0.4937

# epoch 19 iter 6820: dev loss -6.6602 elbo -11.6585 ll -1.4160 kl_fb 10.2425 kl 10.2425 kl_weight 1.0000 sf 18.3187 reward -1.4160 ll_mean 0.0000 ll_std 1.0000 selected 0.3162 prior_p1 0.5

Epoch 25 Iter 8700 loss=2.3865 elbo -1.8503 ll -1.3377 kl_fb 0.5126 kl 5.8922 kl_weight 0.0870 sf -1.5905 reward 0.1317 ll_mean -1.4694 ll_std 1.0000 selected 0.5075 prior_p1 0.1580 avg_p1 0.5116
Epoch 25 Iter 8800 loss=2.4643 elbo -8.3842 ll -1.4520 kl_fb 6.9322 kl 78.7748 kl_weight 0.0880 sf -0.1934 reward 0.0150 ll_mean -1.4671 ll_std 1.0000 selected 0.5187 prior_p1 0.0001 avg_p1 0.5121

# epoch 25 iter 8866: dev loss -2.9886 elbo -14.8390 ll -1.3897 kl_fb 13.4493 kl 13.4493 kl_weight 1.0000 sf 17.8276 reward -1.3897 ll_mean 0.0000 ll_std 1.0000 selected 0.7920 prior_p1 0.4816 avg_p1 0.5103 acc 0.3906
 dev0 [gold=3,pred=1]: **it** **'s** **a** **lovely** film **with** **lovely** performances **by** **buy** **and** accorsi **.**
 dev1 [gold=2,pred=3]: **no** **one** **goes** **unindicted** here **,** **which** **is** probably **for** **the** **best** **.**
 dev2 [gold=3,pred=1]: **and** if you 're **not** **nearly** moved **to** tears **by** **a** couple **of** scenes **,** you 've *

Epoch 31 Iter 10900 loss=2.4430 elbo -5.0897 ll -1.4887 kl_fb 3.6010 kl 33.0369 kl_weight 0.1090 sf 0.3278 reward -0.0254 ll_mean -1.4633 ll_std 1.0000 selected 0.5185 prior_p1 0.0075 avg_p1 0.5005

# epoch 31 iter 10912: dev loss -8.6559 elbo -9.6850 ll -1.4050 kl_fb 8.2800 kl 8.2800 kl_weight 1.0000 sf 18.3409 reward -1.4050 ll_mean 0.0000 ll_std 1.0000 selected 0.3923 prior_p1 0.5389 avg_p1 0.5002 acc 0.3987
 dev0 [gold=3,pred=1]: it 's **a** **lovely** film with **lovely** performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one **goes** **unindicted** here , **which** is probably for the **best** .
 dev2 [gold=3,pred=1]: and if you 're **not** **nearly** moved **to** tears by **a** couple of scenes , you 've got **ice** **water** in your veins .

Shuffling training data
Epoch 32 Iter 11000 loss=2.1783 elbo -1.3947 ll -1.3790 kl_fb 0.0156 kl 0.1422 kl_weight 0.1100 sf -1.1954 reward 0.0860 ll_mean -1.4651 ll_std 1.0000 selected 0.5294 prior_p1 0.4474 avg_p1 0.5007
Epoch 

Shuffling training data
Epoch 38 Iter 13000 loss=2.7358 elbo -1.4680 ll -1.3794 kl_fb 0.0886 kl 0.6819 kl_weight 0.1300 sf -1.3301 reward 0.0882 ll_mean -1.4676 ll_std 1.0000 selected 0.5025 prior_p1 0.6155 avg_p1 0.4944
Epoch 38 Iter 13100 loss=2.4792 elbo -3.8651 ll -1.4529 kl_fb 2.4122 kl 18.4134 kl_weight 0.1310 sf -0.2189 reward 0.0145 ll_mean -1.4674 ll_std 1.0000 selected 0.4579 prior_p1 0.9492 avg_p1 0.4934
Epoch 38 Iter 13200 loss=2.9621 elbo -1.6334 ll -1.4051 kl_fb 0.2283 kl 1.7295 kl_weight 0.1320 sf -0.9717 reward 0.0630 ll_mean -1.4682 ll_std 1.0000 selected 0.5027 prior_p1 0.6818 avg_p1 0.4933

# epoch 38 iter 13299: dev loss -12.7194 elbo -6.3585 ll -1.4818 kl_fb 4.8767 kl 4.8767 kl_weight 1.0000 sf 19.0779 reward -1.4818 ll_mean 0.0000 ll_std 1.0000 selected 0.1641 prior_p1 0.4150 avg_p1 0.4908 acc 0.3497
 dev0 [gold=3,pred=1]: it 's a lovely film with lovely performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , which is probabl

Shuffling training data
Epoch 45 Iter 15400 loss=3.6265 elbo -1.7954 ll -1.2784 kl_fb 0.5170 kl 3.3571 kl_weight 0.1540 sf -2.4823 reward 0.1864 ll_mean -1.4648 ll_std 1.0000 selected 0.4972 prior_p1 0.2248 avg_p1 0.4935
Epoch 45 Iter 15500 loss=3.1655 elbo -4.0851 ll -1.3799 kl_fb 2.7052 kl 17.4527 kl_weight 0.1550 sf -0.9733 reward 0.0844 ll_mean -1.4644 ll_std 1.0000 selected 0.4423 prior_p1 0.9664 avg_p1 0.4928
Epoch 45 Iter 15600 loss=3.0056 elbo -1.5232 ll -1.5145 kl_fb 0.0087 kl 0.0559 kl_weight 0.1560 sf 0.6944 reward -0.0508 ll_mean -1.4637 ll_std 1.0000 selected 0.4636 prior_p1 0.4648 avg_p1 0.4934

# epoch 45 iter 15686: dev loss -7.6600 elbo -11.4069 ll -1.4661 kl_fb 9.9409 kl 9.9409 kl_weight 1.0000 sf 19.0669 reward -1.4661 ll_mean 0.0000 ll_std 1.0000 selected 0.2033 prior_p1 0.3942 avg_p1 0.4946 acc 0.3533
 dev0 [gold=3,pred=1]: it 's a lovely film with lovely performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , **which** is pro


# epoch 51 iter 17732: dev loss -7.0009 elbo -11.9418 ll -1.4546 kl_fb 10.4871 kl 10.4871 kl_weight 1.0000 sf 18.9426 reward -1.4546 ll_mean 0.0000 ll_std 1.0000 selected 0.2147 prior_p1 0.5016 avg_p1 0.4950 acc 0.3615
 dev0 [gold=3,pred=1]: it 's a lovely film with lovely performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , **which** is probably for the **best** .
 dev2 [gold=3,pred=1]: and if you 're not nearly moved to tears by a couple of scenes , you 've got **ice** **water** in your veins .

Shuffling training data
Epoch 52 Iter 17800 loss=3.3032 elbo -8.3631 ll -1.6030 kl_fb 6.7601 kl 37.9783 kl_weight 0.1780 sf 2.2548 reward -0.1427 ll_mean -1.4602 ll_std 1.0000 selected 0.5024 prior_p1 0.0085 avg_p1 0.4949
Epoch 52 Iter 17900 loss=3.4659 elbo -1.4911 ll -1.4421 kl_fb 0.0490 kl 0.2737 kl_weight 0.1790 sf -0.2574 reward 0.0197 ll_mean -1.4617 ll_std 1.0000 selected 0.5141 prior_p1 0.4143 avg_p1 0.4954
Epoch 52 Iter 18000 loss=3.3646 elb

Shuffling training data
Epoch 58 Iter 19900 loss=3.9244 elbo -3.3866 ll -1.2496 kl_fb 2.1370 kl 10.7387 kl_weight 0.1990 sf -3.1588 reward 0.2119 ll_mean -1.4615 ll_std 1.0000 selected 0.5031 prior_p1 0.1023 avg_p1 0.4988
Epoch 58 Iter 20000 loss=3.6962 elbo -3.0833 ll -1.4653 kl_fb 1.6180 kl 8.0900 kl_weight 0.2000 sf 0.0349 reward -0.0028 ll_mean -1.4625 ll_std 1.0000 selected 0.4652 prior_p1 0.1125 avg_p1 0.4954
Epoch 58 Iter 20100 loss=4.0284 elbo -5.1969 ll -1.4279 kl_fb 3.7690 kl 18.7514 kl_weight 0.2010 sf -0.4458 reward 0.0362 ll_mean -1.4641 ll_std 1.0000 selected 0.4769 prior_p1 0.0309 avg_p1 0.4962

# epoch 58 iter 20119: dev loss -7.6156 elbo -11.3154 ll -1.4492 kl_fb 9.8662 kl 9.8662 kl_weight 1.0000 sf 18.9310 reward -1.4492 ll_mean 0.0000 ll_std 1.0000 selected 0.2319 prior_p1 0.4824 avg_p1 0.4961 acc 0.3633
 dev0 [gold=3,pred=1]: it 's a **lovely** film with **lovely** performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , **which

Epoch 65 Iter 22200 loss=4.2729 elbo -5.4005 ll -1.6258 kl_fb 3.7747 kl 17.0030 kl_weight 0.2220 sf 2.2262 reward -0.1674 ll_mean -1.4584 ll_std 1.0000 selected 0.4981 prior_p1 0.9527 avg_p1 0.4920
Shuffling training data
Epoch 65 Iter 22300 loss=3.8027 elbo -1.5324 ll -1.4424 kl_fb 0.0900 kl 0.4036 kl_weight 0.2230 sf -0.2098 reward 0.0173 ll_mean -1.4597 ll_std 1.0000 selected 0.4258 prior_p1 0.5970 avg_p1 0.4933
Epoch 65 Iter 22400 loss=3.7849 elbo -1.5863 ll -1.5708 kl_fb 0.0155 kl 0.0690 kl_weight 0.2240 sf 1.3019 reward -0.1100 ll_mean -1.4608 ll_std 1.0000 selected 0.4689 prior_p1 0.5322 avg_p1 0.4938
Epoch 65 Iter 22500 loss=3.7627 elbo -1.9441 ll -1.4450 kl_fb 0.4991 kl 2.2182 kl_weight 0.2250 sf -0.2055 reward 0.0149 ll_mean -1.4599 ll_std 1.0000 selected 0.5183 prior_p1 0.7171 avg_p1 0.4938

# epoch 65 iter 22506: dev loss -6.4991 elbo -12.6782 ll -1.4734 kl_fb 11.2049 kl 11.2049 kl_weight 1.0000 sf 19.1773 reward -1.4734 ll_mean 0.0000 ll_std 1.0000 selected 0.1819 prior_p1

Epoch 72 Iter 24600 loss=3.7289 elbo -1.7070 ll -1.4859 kl_fb 0.2211 kl 0.8986 kl_weight 0.2460 sf 0.3285 reward -0.0224 ll_mean -1.4635 ll_std 1.0000 selected 0.4708 prior_p1 0.6344 avg_p1 0.4933
Shuffling training data
Epoch 72 Iter 24700 loss=4.3810 elbo -1.5332 ll -1.3364 kl_fb 0.1968 kl 0.7968 kl_weight 0.2470 sf -1.6223 reward 0.1259 ll_mean -1.4623 ll_std 1.0000 selected 0.5073 prior_p1 0.3533 avg_p1 0.4949
Epoch 72 Iter 24800 loss=3.4580 elbo -2.0120 ll -1.4074 kl_fb 0.6046 kl 2.4378 kl_weight 0.2480 sf -0.6961 reward 0.0558 ll_mean -1.4632 ll_std 1.0000 selected 0.5002 prior_p1 0.2521 avg_p1 0.4931

# epoch 72 iter 24893: dev loss -9.0610 elbo -10.1462 ll -1.4754 kl_fb 8.6708 kl 8.6708 kl_weight 1.0000 sf 19.2072 reward -1.4754 ll_mean 0.0000 ll_std 1.0000 selected 0.1820 prior_p1 0.4582 avg_p1 0.4941 acc 0.3415
 dev0 [gold=3,pred=1]: it 's a lovely film with lovely performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , which is probably


# epoch 78 iter 26939: dev loss -11.6278 elbo -6.9511 ll -1.4141 kl_fb 5.5371 kl 5.5371 kl_weight 1.0000 sf 18.5790 reward -1.4141 ll_mean 0.0000 ll_std 1.0000 selected 0.3708 prior_p1 0.4932 avg_p1 0.4998 acc 0.4015
 dev0 [gold=3,pred=1]: it 's a **lovely** film with **lovely** performances by **buy** **and** accorsi .
 dev1 [gold=2,pred=3]: **no** one goes **unindicted** here , **which** is probably for the **best** .
 dev2 [gold=3,pred=1]: **and** if you 're **not** **nearly** moved **to** tears by a couple of scenes , you 've **got** **ice** **water** in your veins .

Epoch 79 Iter 27000 loss=4.3184 elbo -1.8866 ll -1.3661 kl_fb 0.5205 kl 1.9280 kl_weight 0.2700 sf -1.2727 reward 0.0968 ll_mean -1.4628 ll_std 1.0000 selected 0.5380 prior_p1 0.7131 avg_p1 0.4993
Shuffling training data
Epoch 79 Iter 27100 loss=4.0786 elbo -2.2002 ll -1.8238 kl_fb 0.3763 kl 1.3887 kl_weight 0.2710 sf 4.9773 reward -0.3623 ll_mean -1.4615 ll_std 1.0000 selected 0.5029 prior_p1 0.3205 avg_p1 0.5001
Ep

Shuffling training data
Epoch 85 Iter 29100 loss=3.7976 elbo -1.7880 ll -1.5357 kl_fb 0.2522 kl 0.8668 kl_weight 0.2910 sf 1.0023 reward -0.0729 ll_mean -1.4628 ll_std 1.0000 selected 0.4858 prior_p1 0.3537 avg_p1 0.4964
Epoch 85 Iter 29200 loss=4.3479 elbo -13.9334 ll -1.3144 kl_fb 12.6190 kl 43.2158 kl_weight 0.2920 sf -1.9456 reward 0.1493 ll_mean -1.4637 ll_std 1.0000 selected 0.5534 prior_p1 0.9973 avg_p1 0.4970
Epoch 85 Iter 29300 loss=4.4674 elbo -4.7718 ll -1.3666 kl_fb 3.4052 kl 11.6218 kl_weight 0.2930 sf -1.1538 reward 0.0975 ll_mean -1.4641 ll_std 1.0000 selected 0.5010 prior_p1 0.0679 avg_p1 0.4966

# epoch 85 iter 29326: dev loss -7.0643 elbo -11.8296 ll -1.4411 kl_fb 10.3885 kl 10.3885 kl_weight 1.0000 sf 18.8939 reward -1.4411 ll_mean 0.0000 ll_std 1.0000 selected 0.2436 prior_p1 0.4709 avg_p1 0.4972 acc 0.3733
 dev0 [gold=3,pred=1]: it 's a **lovely** film with **lovely** performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , **w

Epoch 92 Iter 31400 loss=5.1858 elbo -2.6707 ll -1.5014 kl_fb 1.1693 kl 3.7240 kl_weight 0.3140 sf 0.5648 reward -0.0391 ll_mean -1.4623 ll_std 1.0000 selected 0.5207 prior_p1 0.7712 avg_p1 0.4975
Shuffling training data
Epoch 92 Iter 31500 loss=5.0097 elbo -1.4049 ll -1.3812 kl_fb 0.0237 kl 0.0753 kl_weight 0.3150 sf -1.0995 reward 0.0829 ll_mean -1.4640 ll_std 1.0000 selected 0.4604 prior_p1 0.4591 avg_p1 0.4987
Epoch 92 Iter 31600 loss=4.8026 elbo -1.3288 ll -1.3184 kl_fb 0.0104 kl 0.0328 kl_weight 0.3160 sf -1.8441 reward 0.1449 ll_mean -1.4633 ll_std 1.0000 selected 0.5081 prior_p1 0.4740 avg_p1 0.4950
Epoch 92 Iter 31700 loss=4.1476 elbo -8.9344 ll -1.5486 kl_fb 7.3858 kl 23.2991 kl_weight 0.3170 sf 0.9870 reward -0.0871 ll_mean -1.4615 ll_std 1.0000 selected 0.5132 prior_p1 0.9847 avg_p1 0.4961

# epoch 92 iter 31713: dev loss -3.9833 elbo -15.0333 ll -1.4513 kl_fb 13.5820 kl 13.5820 kl_weight 1.0000 sf 19.0166 reward -1.4513 ll_mean 0.0000 ll_std 1.0000 selected 0.2301 prior_p1


# epoch 98 iter 33759: dev loss -8.8822 elbo -10.1272 ll -1.4513 kl_fb 8.6759 kl 8.6759 kl_weight 1.0000 sf 19.0094 reward -1.4513 ll_mean 0.0000 ll_std 1.0000 selected 0.2220 prior_p1 0.4119 avg_p1 0.4962 acc 0.3660
 dev0 [gold=3,pred=1]: it 's a **lovely** film with **lovely** performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , **which** is probably for the **best** .
 dev2 [gold=3,pred=1]: and if you 're not nearly moved to tears by a couple of scenes , you 've got **ice** **water** in your veins .

Epoch 99 Iter 33800 loss=5.4842 elbo -9.7389 ll -1.3594 kl_fb 8.3795 kl 24.7915 kl_weight 0.3380 sf -1.4791 reward 0.1031 ll_mean -1.4625 ll_std 1.0000 selected 0.5026 prior_p1 0.9762 avg_p1 0.4976
Shuffling training data
Epoch 99 Iter 33900 loss=6.2157 elbo -4.0251 ll -1.3920 kl_fb 2.6330 kl 7.7670 kl_weight 0.3390 sf -0.8431 reward 0.0693 ll_mean -1.4614 ll_std 1.0000 selected 0.4487 prior_p1 0.8798 avg_p1 0.4949
Epoch 99 Iter 34000 loss=4.92

Epoch 105 Iter 35900 loss=5.6847 elbo -6.4695 ll -1.4881 kl_fb 4.9814 kl 13.8758 kl_weight 0.3590 sf 0.3266 reward -0.0243 ll_mean -1.4638 ll_std 1.0000 selected 0.4905 prior_p1 0.9360 avg_p1 0.4988
Shuffling training data
Epoch 105 Iter 36000 loss=4.9171 elbo -1.8421 ll -1.5171 kl_fb 0.3250 kl 0.9028 kl_weight 0.3600 sf 0.7894 reward -0.0537 ll_mean -1.4634 ll_std 1.0000 selected 0.5178 prior_p1 0.6411 avg_p1 0.4997
Epoch 105 Iter 36100 loss=5.8786 elbo -5.3338 ll -1.3759 kl_fb 3.9579 kl 10.9637 kl_weight 0.3610 sf -1.1516 reward 0.0862 ll_mean -1.4621 ll_std 1.0000 selected 0.4712 prior_p1 0.0874 avg_p1 0.4982

# epoch 105 iter 36146: dev loss -8.1157 elbo -10.7740 ll -1.4400 kl_fb 9.3340 kl 9.3340 kl_weight 1.0000 sf 18.8897 reward -1.4400 ll_mean 0.0000 ll_std 1.0000 selected 0.2360 prior_p1 0.4655 avg_p1 0.4970 acc 0.3806
 dev0 [gold=3,pred=1]: it 's a **lovely** film with **lovely** performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , **w

Epoch 112 Iter 38300 loss=4.6008 elbo -1.4855 ll -1.3932 kl_fb 0.0923 kl 0.2409 kl_weight 0.3830 sf -1.0022 reward 0.0687 ll_mean -1.4619 ll_std 1.0000 selected 0.5177 prior_p1 0.5713 avg_p1 0.4982
Shuffling training data
Epoch 112 Iter 38400 loss=5.6154 elbo -7.1746 ll -1.4669 kl_fb 5.7077 kl 14.8638 kl_weight 0.3840 sf 0.0715 reward -0.0059 ll_mean -1.4610 ll_std 1.0000 selected 0.5196 prior_p1 0.9507 avg_p1 0.4965
Epoch 112 Iter 38500 loss=6.0404 elbo -5.1848 ll -1.4659 kl_fb 3.7190 kl 9.6596 kl_weight 0.3850 sf 0.0745 reward -0.0053 ll_mean -1.4606 ll_std 1.0000 selected 0.4952 prior_p1 0.8899 avg_p1 0.4982

# epoch 112 iter 38533: dev loss -7.6422 elbo -11.2872 ll -1.4405 kl_fb 9.8467 kl 9.8467 kl_weight 1.0000 sf 18.9294 reward -1.4405 ll_mean 0.0000 ll_std 1.0000 selected 0.2437 prior_p1 0.5303 avg_p1 0.4976 acc 0.3769
 dev0 [gold=3,pred=1]: it 's a **lovely** film with **lovely** performances by buy and accorsi .
 dev1 [gold=2,pred=1]: **no** one goes **unindicted** here , **wh

Epoch 119 Iter 40600 loss=6.0622 elbo -2.4868 ll -1.4568 kl_fb 1.0300 kl 2.5368 kl_weight 0.4060 sf -0.0433 reward 0.0039 ll_mean -1.4607 ll_std 1.0000 selected 0.5255 prior_p1 0.7567 avg_p1 0.4974
Shuffling training data
Epoch 119 Iter 40700 loss=5.4523 elbo -5.1743 ll -1.4255 kl_fb 3.7488 kl 9.2108 kl_weight 0.4070 sf -0.4704 reward 0.0362 ll_mean -1.4617 ll_std 1.0000 selected 0.5163 prior_p1 0.8928 avg_p1 0.4952
Epoch 119 Iter 40800 loss=5.8021 elbo -1.4745 ll -1.4372 kl_fb 0.0374 kl 0.0916 kl_weight 0.4080 sf -0.2562 reward 0.0228 ll_mean -1.4600 ll_std 1.0000 selected 0.4987 prior_p1 0.4470 avg_p1 0.4959
Epoch 119 Iter 40900 loss=5.9218 elbo -9.8316 ll -1.4168 kl_fb 8.4148 kl 20.5740 kl_weight 0.4090 sf -0.5980 reward 0.0444 ll_mean -1.4612 ll_std 1.0000 selected 0.4984 prior_p1 0.9677 avg_p1 0.4958


RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorMath.cu:24