In [1]:
# torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, Iterator, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from decoder import Decoder
from encoder import Encoder
from attention import Attention
from train import evaluation
# others
import argparse
from argparser import get_parser
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

USE_CUDA = torch.cuda.is_available()
# torch.cuda.current_device() fails when no CUDA device is present, so only
# query it when CUDA is available. -1 is the CPU device value this notebook
# already used in its alternative (previously commented-out) CPU configuration.
DEVICE = torch.cuda.current_device() if USE_CUDA else -1
# To force CPU on a CUDA machine, set USE_CUDA = False and DEVICE = -1 here.

config = get_parser()

# Source field: whitespace tokenisation, lower-cased, wrapped in <s> ... </s>.
# include_lengths=True makes each batch attribute a (tensor, lengths) pair.
SOURCE = Field(tokenize=str.split, use_vocab=True, init_token="<s>", eos_token="</s>", lower=True,
               include_lengths=True, batch_first=True)
# Target field: same preprocessing, but without the lengths.
TARGET = Field(tokenize=str.split, use_vocab=True, init_token="<s>", eos_token="</s>", lower=True,
               batch_first=True)

# Each TSV row is read as (source sentence, target sentence).
train_data, valid_data, test_data = TabularDataset.splits(
    path=config.PATH, format='tsv', train=config.TRAIN_FILE,
    validation=config.VALID_FILE, test=config.TEST_FILE,
    fields=[('so', SOURCE), ('ta', TARGET)])

# Vocabularies are built from the training split only.
SOURCE.build_vocab(train_data)
TARGET.build_vocab(train_data)

# Batches are sorted by source length within each batch.
test_loader = BucketIterator(test_data, batch_size=config.BATCH, device=DEVICE,
                             sort_key=lambda x: len(x.so), sort_within_batch=True, repeat=False)

# vocabulary sizes
V_so = len(SOURCE.vocab)
V_ta = len(TARGET.vocab)

print('train data: {}, valid_data: {}, test_data: {}'.format(len(train_data), len(valid_data), len(test_data)))
print('source language vocab: {}, target language vocab: {}'.format(V_so, V_ta))

train data: 47500, valid_data: 1250, test_data: 1250
source language vocab: 5737, target language vocab: 9989


In [6]:
def build(config):
    """Construct the encoder, the attention decoder and the loss function.

    Args:
        config: parsed options; reads EMBED, HIDDEN, NUM_HIDDEN and HIDDEN2.

    Returns:
        Tuple ``(enc, dec, loss_function)``. Both modules are moved to the GPU
        when ``USE_CUDA`` is true. The loss is a cross-entropy that ignores
        positions whose target is the ``<pad>`` index of the target vocabulary.
    """
    enc = Encoder(V_so, config.EMBED, config.HIDDEN, config.NUM_HIDDEN, bidrec=True)
    # The decoder generates target-language tokens, so its start-of-sequence
    # index must be looked up in the TARGET vocabulary (not the SOURCE one).
    # Decoder hidden size is 2*HIDDEN -- presumably to match the bidirectional
    # encoder's concatenated states; confirm against encoder.py.
    dec = Decoder(V_ta, config.EMBED, 2*config.HIDDEN, hidden_size2=config.HIDDEN2,
                  sos_idx=TARGET.vocab.stoi['<s>'], method='concat')
    if USE_CUDA:
        enc = enc.cuda()
        dec = dec.cuda()

    loss_function = nn.CrossEntropyLoss(ignore_index=TARGET.vocab.stoi['<pad>'])
    return enc, dec, loss_function

In [16]:
# Index of the saved checkpoint pair to restore.
i = 4
enc, dec, loss_function = build(config)
enc_model_path = './data/model/fra_eng{}.enc'.format(i)
dec_model_path = './data/model/fra_eng{}.dec'.format(i)
# map_location='cpu' lets checkpoints that were saved from GPU tensors be read
# on a machine without CUDA; load_state_dict then copies the values into the
# modules' own parameters, wherever those live.
# NOTE(review): the recorded run of this cell failed with a Decoder state_dict
# mismatch (missing "attention.v", attention.attn.weight 1200x2400 vs 1200x1200).
# Presumably the checkpoint was trained with a different attention setup than
# build() creates -- confirm before relying on this cell.
enc.load_state_dict(torch.load(enc_model_path, map_location='cpu'))
dec.load_state_dict(torch.load(dec_model_path, map_location='cpu'))

RuntimeError: Error(s) in loading state_dict for Decoder:
	Missing key(s) in state_dict: "attention.v". 
	While copying the parameter named "attention.attn.weight", whose dimensions in the model are torch.Size([1200, 2400]) and whose dimensions in the checkpoint are torch.Size([1200, 1200]).

In [7]:
# Rebuild encoder, decoder and loss via build(); no checkpoint is loaded in this cell.
enc, dec, loss_function = build(config)

In [8]:
# Take the first batch from the test iterator. next(iter(...)) fails right here
# if the loader is empty, instead of leaving `batch` undefined for later cells.
batch = next(iter(test_loader))

In [9]:
# SOURCE was declared with include_lengths=True, so batch.so unpacks into the
# source tensor and its lengths; batch.ta is the target tensor on its own.
targets = batch.ta
inputs, lengths = batch.so

# Reset the gradients of both modules before running them on this batch.
dec.zero_grad()
enc.zero_grad()

In [10]:
# Run the encoder on the source batch; the lengths are handed over as a plain Python list.
output, hidden = enc(inputs, lengths.tolist())


In [11]:
# Run the decoder from the encoder's hidden state and outputs. The last argument
# is targets.size(1), used as the maximum length; the second return value is discarded.
preds, _ = dec(hidden, output, lengths.tolist(), targets.size(1)) # max_len

In [13]:
# Inspect the shape of the decoder output.
preds.size()

torch.Size([768, 9989])

In [14]:
# Score the predictions against the targets flattened to one dimension.
loss = loss_function(preds, targets.view(-1))

In [15]:
# Display the loss value computed for this single test batch.
loss

tensor(9.1954, device='cuda:0')

In [4]:
import re
import unicodedata
def unicode_to_ascii(s):
    """Strip combining marks (accents) from ``s``.

    The string is decomposed with NFD normalisation and every character in
    Unicode category ``Mn`` (non-spacing mark) is dropped. All other
    characters are kept as they appear after decomposition.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [5]:
def normalize_string(s):
    """Normalise a sentence for whitespace tokenisation.

    Steps, in order: lower-case and strip the input, remove accents via
    ``unicode_to_ascii``, put spaces around each of ``, . ! ?``, replace every
    run of characters other than letters and those four punctuation marks with
    one space, collapse whitespace runs and strip the ends.

    Example: ``'it only costs!'`` becomes ``'it only costs !'``.
    """
    text = unicode_to_ascii(s.lower().strip())
    # (pattern, replacement) pairs, applied one after another.
    substitutions = (
        (r"([,.!?])", r" \1 "),
        (r"[^a-zA-Z,.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [16]:
# Sample sentence for trying out normalize_string.
s = 'it only costs!'

In [17]:
# Show the normalised form of the sample sentence.
normalize_string(s)

'it only costs !'

In [5]:
# Read the filtered eng-fra test file as UTF-8 and keep it as a list of lines
# (splitlines() drops the line terminators).
with open('./data/en_fa/eng-fra-filtered.test', 'r', encoding='utf-8') as file:
    data = file.read().splitlines()