In [1]:
import torch
from torch.nn import Conv1d

In [2]:
'''
1D convolutional neural network
word_embed_size: number of word embedding features ('channels')
seq_len: number of time steps/words
out_channels: number of filters to apply
kernel size: filter window size
padding: amount of zero padding to apply to the sequence

max: max pooling over time
'''

batch_size = 16
word_embed_size = 4
seq_len = 7
input = torch.randn(batch_size, word_embed_size, seq_len)
conv1 = Conv1d(in_channels=word_embed_size,
               out_channels=4,
               kernel_size=3,
               #padding=1
               )
hidden1 = conv1(input)
hidden2 = torch.max(hidden1, dim=2)

In [3]:
print(input.shape)
print(hidden1.shape)
hidden2

torch.Size([16, 4, 7])
torch.Size([16, 4, 5])


torch.return_types.max(
values=tensor([[ 0.8073,  0.7392,  0.6410,  0.4973],
        [ 1.0437,  1.9393,  0.5352,  0.5954],
        [ 1.5676,  1.0931,  0.4885,  0.5392],
        [ 1.0613,  1.5563,  0.4245,  0.9570],
        [ 0.7449,  1.3366,  1.1261,  1.2232],
        [ 0.1474,  0.5716,  0.9041,  0.2372],
        [ 1.3320,  0.3865,  0.2761,  0.7339],
        [ 0.2837,  0.8003,  0.7493,  1.2046],
        [ 0.4842,  0.9318,  0.1813,  0.7009],
        [ 1.1912,  1.1614,  0.1312,  0.4134],
        [-0.0257,  0.7380,  0.8102,  0.0210],
        [ 1.5116,  2.2454,  0.5355,  0.3664],
        [ 1.0494,  0.9925,  1.0557,  0.1491],
        [ 0.9173,  1.2709,  0.5128,  0.4038],
        [ 1.0289,  1.5024,  0.9633,  0.6542],
        [ 0.5454,  2.0138,  1.3261,  0.5549]], grad_fn=<MaxBackward0>),
indices=tensor([[3, 4, 3, 2],
        [3, 2, 4, 1],
        [4, 0, 0, 3],
        [0, 3, 1, 3],
        [3, 4, 3, 0],
        [2, 4, 1, 3],
        [4, 2, 1, 3],
        [0, 3, 2, 4],
        [4, 1, 4, 0],
 

Applies a 1D convolution over an input signal composed of several input
planes.
In the simplest case, the output value of the layer with input size
$`(N, C_{\text{in}}, L)`$ and output $`(N, C_{\text{out}}, L_{\text{out}})`$ can be
precisely described as:
$
\text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) +
\sum_{k = 0}^{C_{in} - 1} \text{weight}(C_{\text{out}_j}, k)
\star \text{input}(N_i, k)
$
where :math:`\star` is the valid `cross-correlation`_ operator,
:math:`N` is a batch size, :math:`C` denotes a number of channels,
:math:`L` is a length of signal sequence.

In [4]:
from typing import List
import numpy as np
def generate_sent_masks(enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length, h = hidden size.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks

In [5]:
b = 5 # batch len
h = 4 # hidden size

source_lengths = torch.from_numpy(np.random.randint(4,20, (b,)))
enc_hiddens = torch.Tensor(b, max(source_lengths).item(), 2*h)
generate_sent_masks(enc_hiddens, source_lengths).shape

torch.Size([5, 18])

In [6]:
print(f"the original sentence lengths are: {(source_lengths)}")
print(f"the longest sentence in the batch is {max(source_lengths).item()} words")

the original sentence lengths are: tensor([15, 13,  5, 18,  7])
the longest sentence in the batch is 18 words


- initialize mask matrix to batch size x longest sentence
- row of mask matrix = position of the input sentence in the list
- columns of the row beyond the length of the sentence are set to 1 (masks), because we have padded them to the length of the longest sentence

In [7]:
generate_sent_masks(enc_hiddens, source_lengths)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

BLEU score:
$$ 
\dfrac{\sum_{\text{ngram} \in c}  min \Big(max_{i=1,...,k} \text{Count}_{r_{i}}(\text{ngram}), \text{Count}_c(\text{ngram})\Big)}{\sum_{\text{ngram}\in c} \text{Count}_c (\text{ngram})}
$$

In [16]:
from fractions import Fraction
from nltk.util import ngrams

In [9]:
def bleu(refs,
         hypo,
         weights=(0.25,0.25,0.25,0.25),
):
    '''
    @param refs: list of reference translations
    @param hypo: a hypothesis machine translation
    '''

def precision(refs, hypo, n):
    '''
    calculate the ngram precision.
    reference word is "exhausted" after
    the first matching hypothesis.
    '''
    # count ngrams in hypo
    hypo_counts = Counter(ngrams(hypo, n) if len(hypo) >= n else Counter()

    # max ngram count over all refs for ngram = n
    # iterate over each ngram in each reference
    max_counts = {}
    for ref in refs:
        ref_counts = Counter(ngrams(ref, n)) if len(ref) >= n else Counter()    
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), ref_counts[ngram])
    
    # min of ngrams in max ref, ngrams in hypo
    # iterate over ngrams in hypo
    min_max = {ngram: min(max_counts[ngram], count) for ngram, count in hypo_counts.items()}

    # sum over all the ngrams
    numerator = sum(min_max.values())
    denominator = max(1, sum(hypo_counts.values))

    return Fraction(numerator, denominator, _normalize=False)

def brevity_penalty(ref_lens, hypo_lens):
    '''
    if hypo is shorter than references, reduce score
    '''

SyntaxError: unexpected EOF while parsing (<ipython-input-9-e965b219ae05>, line 5)

In [36]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

In [37]:
s = ["el amor todo lo puede".split()]
r_1 = ["love can always find a way".split()]
r_2 =["love makes anything possible".split()]
c_1 = "the love can always do".split()
c_2 = "love can make anything possible".split()

In [87]:
from itertools import product
refs = [r_1, r_2]
hypos = [c_1, c_2]

for i in product(refs, hypos):
    a, b = i
    print(i)
    print(sentence_bleu(a, b, weights=(0.5, 0.5)))
#print(corpus_bleu(refs, hypos, weights=(0.5, 0.5)))

([['love', 'can', 'always', 'find', 'a', 'way']], ['the', 'love', 'can', 'always', 'do'])
0.448437301984003
([['love', 'can', 'always', 'find', 'a', 'way']], ['love', 'can', 'make', 'anything', 'possible'])
0.25890539701513365
([['love', 'makes', 'anything', 'possible']], ['the', 'love', 'can', 'always', 'do'])
6.6709427497276e-155
([['love', 'makes', 'anything', 'possible']], ['love', 'can', 'make', 'anything', 'possible'])
0.3872983346207417


In [92]:
from collections import defaultdict
scores = defaultdict(list)
for i, h in enumerate(hypos):
    for r in refs:
        scores[f'c_{i+1}'].append(sentence_bleu(r, h, weights=(0.5, 0.5)))
sum_scores = {k: sum(v) for k,v in scores.items()}
print(max(sum_scores))

c_2


In [83]:
print(len(set("love can always find a way".split()).intersection(set("the love can always do".split()))))
print(len(set("love can always find a way".split()).intersection(set("love can make anything possible".split()))))

3
2
