In [1]:
with open("/home/starfish/pattern_reg/data/english_to_french.txt", mode='r', encoding='utf') as f:
    lines = f.readlines(1000)

In [2]:
pairs = [line.split("\t") for line in lines]

In [3]:
from nltk import wordpunct_tokenize

In [4]:
def tokenize(text):
    text = text.lower()
    # split
    tokens = wordpunct_tokenize(text)
    # keep only words
    tokens = [token for token in tokens 
            if all(char.isalpha() for char in token)]
    return tokens

In [5]:
src, target = [tokenize(pairs[i][0]) for i in range(len(pairs))], [tokenize(pairs[i][1]) for i in range(len(pairs))]

In [2]:
a = 'com string-dsad'
a.split().split("-")

AttributeError: 'list' object has no attribute 'split'

In [6]:
# special tokens
padding_token = '<PAD>'
start_of_sequence_token = '<SOS>'
end_of_sequence_token = '<EOS>'
unknown_word_token = '<UNK>'
vac_size = 1000

In [7]:
from collections import Counter

In [8]:
def build_vac(src, padding_token, start_of_sequence_token, end_of_sequence_token, unknown_word_token, vac_size=30):
    flatten = lambda x: [sublst for lst in x for sublst in lst]
    token_set = flatten(src)
    commons = Counter(token_set).most_common(vac_size - 4)
    token2idx = {commons[i][0]: (i + 4) for i in range(len(commons))}
    token2idx.update({unknown_word_token:0, start_of_sequence_token:1, end_of_sequence_token:2, padding_token:3})
    idx2token = {value: key for key, value in token2idx.items()}

    return token2idx, idx2token

In [9]:
src_token2idx, src_idx2token = build_vac(src, padding_token, start_of_sequence_token, end_of_sequence_token, unknown_word_token)
tar_token2idx, tar_idx2token = build_vac(target, padding_token, start_of_sequence_token, end_of_sequence_token, unknown_word_token)

In [10]:
def replace_rare_tokens(src, token2idx, unknown_word_token):
    for i in range(len(src)):
        for j in range(len(src[i])):
            if src[i][j] not in token2idx:
                src[i][j] = unknown_word_token
    return src

In [11]:
src = replace_rare_tokens(src, src_token2idx, unknown_word_token)
target = replace_rare_tokens(target, tar_token2idx, unknown_word_token)

In [12]:
def remove_most_unk(threshold, src, target):
    idx = []
    src_ratio = [sentence.count("<UNK>") / len(sentence) for sentence in src]
    tar_ration = [sentence.count("<UNK>") / len(sentence) for sentence in target]
    for i in range(len(src)):
        if src_ratio[i] < threshold and tar_ration[i] < threshold:
            idx.append(i)
    src_out = [src[_] for _ in idx]
    target_out = [target[_] for _ in idx]
    return src_out, target_out
src, target = remove_most_unk(0.3, src, target)

In [13]:
def add_start_and_end(src, start_of_sequence_token, end_of_sequence_token):
    for i in range(len(src)):
        src[i] = [start_of_sequence_token] + src[i] + [end_of_sequence_token]
    return src 


In [14]:
src = add_start_and_end(src, start_of_sequence_token, end_of_sequence_token)
target = add_start_and_end(target, start_of_sequence_token, end_of_sequence_token)

In [15]:
src, target

([['<SOS>', 'go', '<EOS>'],
  ['<SOS>', 'run', '<EOS>'],
  ['<SOS>', 'run', '<EOS>'],
  ['<SOS>', 'wow', '<EOS>'],
  ['<SOS>', 'fire', '<EOS>'],
  ['<SOS>', 'i', 'won', '<EOS>'],
  ['<SOS>', 'cheers', '<EOS>'],
  ['<SOS>', 'cheers', '<EOS>'],
  ['<SOS>', 'got', 'it', '<EOS>'],
  ['<SOS>', 'got', 'it', '<EOS>'],
  ['<SOS>', 'got', 'it', '<EOS>'],
  ['<SOS>', 'got', 'it', '<EOS>'],
  ['<SOS>', 'i', 'm', 'ok', '<EOS>'],
  ['<SOS>', 'no', 'way', '<EOS>']],
 [['<SOS>', 'va', '<EOS>'],
  ['<SOS>', 'cours', '<EOS>'],
  ['<SOS>', 'courez', '<EOS>'],
  ['<SOS>', 'ça', 'alors', '<EOS>'],
  ['<SOS>', 'au', 'feu', '<EOS>'],
  ['<SOS>', 'je', 'l', 'ai', '<UNK>', '<EOS>'],
  ['<SOS>', 'santé', '<EOS>'],
  ['<SOS>', 'tchin', 'tchin', '<EOS>'],
  ['<SOS>', 'j', 'ai', 'pigé', '<EOS>'],
  ['<SOS>', 'compris', '<EOS>'],
  ['<SOS>', 'pigé', '<EOS>'],
  ['<SOS>', 'compris', '<EOS>'],
  ['<SOS>', 'ça', 'va', '<EOS>'],
  ['<SOS>', 'c', 'est', 'pas', '<UNK>', '<EOS>']])

In [16]:
def tokens_to_idx(src, token2idx):
    for i in range(len(src)):
        for j in range(len(src[i])):
            src[i][j] = token2idx[src[i][j]]
    return src
src = tokens_to_idx(src, src_token2idx)
target = tokens_to_idx(target, tar_token2idx) 


In [17]:
src

[[1, 23, 2],
 [1, 12, 2],
 [1, 12, 2],
 [1, 24, 2],
 [1, 25, 2],
 [1, 4, 14, 2],
 [1, 9, 2],
 [1, 9, 2],
 [1, 7, 8, 2],
 [1, 7, 8, 2],
 [1, 7, 8, 2],
 [1, 7, 8, 2],
 [1, 4, 11, 22, 2],
 [1, 5, 6, 2]]

In [43]:
def allocate_fn():
    

{'as': '1', 'aas': '22'}

In [33]:
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader


In [42]:
def collate_fn(batch):
    inputs = [torch.LongTensor(item[0]) for item in batch]
    targets = [torch.LongTensor(item[1]) for item in batch]
    
    # Pad sequencse so that they are all the same length (within one minibatch)
    padded_inputs = pad_sequence(inputs, padding_value=dataset.padding_token_value, batch_first=True)
    padded_targets = pad_sequence(targets, padding_value=dataset.padding_token_value, batch_first=True)
    
    # Sort by length for CUDA optimizations
    lengths = torch.LongTensor([len(x) for x in inputs])
    lengths, permutation = lengths.sort(dim=0, descending=True)

    # lengths is the len of src
    return padded_inputs[permutation], padded_targets[permutation], lengths


In [45]:
class TranslationData(Dataset):
    def __init__(self, src, target, padding_token_value):
        self.src = src
        self.target = target
        self.padding_token_value = padding_token_value
        
    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return self.src[idx], self.target[idx]

dataset = TranslationData(src, target, 3)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [46]:
next(iter(dataloader))

(tensor([[ 1,  4, 11, 22,  2],
         [ 1,  7,  8,  2,  3],
         [ 1,  5,  6,  2,  3],
         [ 1,  7,  8,  2,  3],
         [ 1,  7,  8,  2,  3],
         [ 1,  7,  8,  2,  3],
         [ 1,  4, 14,  2,  3],
         [ 1, 24,  2,  3,  3],
         [ 1, 12,  2,  3,  3],
         [ 1, 23,  2,  3,  3],
         [ 1, 25,  2,  3,  3],
         [ 1,  9,  2,  3,  3],
         [ 1, 12,  2,  3,  3],
         [ 1,  9,  2,  3,  3]]),
 tensor([[ 1,  9, 12,  2,  3,  3],
         [ 1, 19,  2,  3,  3,  3],
         [ 1, 10,  8, 23,  0,  2],
         [ 1, 19,  2,  3,  3,  3],
         [ 1,  5,  6, 18,  2,  3],
         [ 1, 18,  2,  3,  3,  3],
         [ 1,  4, 14,  6,  0,  2],
         [ 1,  9, 27,  2,  3,  3],
         [ 1, 25,  2,  3,  3,  3],
         [ 1, 12,  2,  3,  3,  3],
         [ 1, 28, 29,  2,  3,  3],
         [ 1, 17, 17,  2,  3,  3],
         [ 1, 26,  2,  3,  3,  3],
         [ 1, 16,  2,  3,  3,  3]]),
 tensor([5, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3]))

In [1]:
import torch
from torch import nn

In [7]:
model = nn.GRU(input_size=128, hidden_size=256, batch_first=True)

In [8]:
x = torch.randn(8, 1, 128)

In [9]:
init_hidden = torch.randn(1, 8, 256)

In [11]:
output, state = model(x, init_hidden)

In [12]:
output.shape

torch.Size([8, 1, 256])

In [13]:
state.shape

torch.Size([1, 8, 256])