In [1]:
import os
import sys
path = os.path.join(os.path.abspath(os.curdir), '../src')
sys.path.append(path)

# Train

In [2]:
import os
import json
import time
import torch
import argparse
import numpy as np
from multiprocessing import cpu_count
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from collections import OrderedDict, defaultdict

from ptb import PTB
from utils import idx2word, experiment_name, AttributeDict
from models.model_kwbase import SentenceVAE
from models.model_utils import to_var
from glob import glob

In [3]:
top_dir = os.path.abspath('..')
runs_dir = f'{top_dir}/runs'
top_dir, runs_dir

('/Users/s07309/gdrive/src/ca_dev/Sentence-VAE',
 '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs')

In [4]:
data_base_dir = f'{top_dir}/data/exp_20191220_kw2title'
data_name = 'ec.mediatag'
# data_name = 'hr.mediatag'
# data_name = 'finance.categorytag'
# data_name = 'finance.mediatag'
data_dir = f'{data_base_dir}/{data_name}'
data_dir

'/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/data/exp_20191220_kw2title/ec.mediatag'

In [5]:
log_dir, save_model_path = runs_dir, runs_dir
# log_dir = f'{runs_dir}/logs'
# save_model_path = f'{runs_dir}/models'
log_dir, save_model_path

('/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs',
 '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs')

In [6]:
def readlines(path):
    with open(path, 'r') as f:
        return [s.replace('\n', '') for s in f.readlines()]

def cal_max_file_lines(path):
    lines = readlines(path)
    line_lengths = [len(line.split(' ')) for line in lines]
    return max(line_lengths)
    
src_max_length = cal_max_file_lines(f'{data_dir}/src/ptb.train.txt')
tgt_max_length = cal_max_file_lines(f'{data_dir}/tgt/ptb.train.txt')
print(f'src max: {src_max_length}, tgt max: {tgt_max_length}')

src max: 5, tgt max: 53


In [7]:
args = {
    'data_dir': data_dir,
    'create_data': False,
    'max_sequence_length': tgt_max_length,
    'max_sequence_length_src': src_max_length,
    'min_occ': 1,
    'test': False,

    'epochs': 10,
    'batch_size': 32,
    'learning_rate': 0.001,

    'embedding_size': 300,
    'rnn_type': 'gru',
    'hidden_size': 256,
    'num_layers': 1,
    'bidirectional': False,
    'latent_size': 16,
    'word_dropout': 0,
    'embedding_dropout': 0.5,

    'anneal_function': 'logistic',
    'k': 0.0025,
    'x0': 2500,

    'print_every': 50,
    'tensorboard_logging': True,
    'logdir': log_dir,
    'save_model_path': save_model_path,
    'experiment_name': f'kw2copy_{data_name}',
}

args = AttributeDict(args)

args.rnn_type = args.rnn_type.lower()
args.anneal_function = args.anneal_function.lower()

assert args.rnn_type in ['rnn', 'lstm', 'gru']
assert args.anneal_function in ['logistic', 'linear']
assert 0 <= args.word_dropout <= 1
args

<AttrDict{'data_dir': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/data/exp_20191220_kw2title/ec.mediatag', 'create_data': False, 'max_sequence_length': 53, 'max_sequence_length_src': 5, 'min_occ': 1, 'test': False, 'epochs': 10, 'batch_size': 32, 'learning_rate': 0.001, 'embedding_size': 300, 'rnn_type': 'gru', 'hidden_size': 256, 'num_layers': 1, 'bidirectional': False, 'latent_size': 16, 'word_dropout': 0, 'embedding_dropout': 0.5, 'anneal_function': 'logistic', 'k': 0.0025, 'x0': 2500, 'print_every': 50, 'tensorboard_logging': True, 'logdir': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs', 'save_model_path': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs', 'experiment_name': 'kw2copy_ec.mediatag'}>

## load data

In [8]:
%%time
import itertools
splits = ['train', 'valid'] + (['test'] if args.test else [])
datasets = OrderedDict()
print(f'loading {args.data_dir}')
for split, src_tgt in itertools.product(splits, ['src', 'tgt']):
    key = (split, src_tgt)
    print(key)
    datasets[key] = PTB(
        data_dir=f'{args.data_dir}/{src_tgt}',
        split=split,
        create_data=args.create_data,
        max_sequence_length=args.max_sequence_length if src_tgt == 'tgt' else args.max_sequence_length_src,
        min_occ=args.min_occ
    )
    print(f'vocab: {datasets[key].vocab_size}, records: {len(datasets[key].data)}')

loading /Users/s07309/gdrive/src/ca_dev/Sentence-VAE/data/exp_20191220_kw2title/ec.mediatag
('train', 'src')
vocab: 2642, records: 39317
('train', 'tgt')
vocab: 10214, records: 39317
('valid', 'src')
vocab: 2642, records: 150
('valid', 'tgt')
vocab: 10214, records: 150
CPU times: user 570 ms, sys: 59.8 ms, total: 630 ms
Wall time: 632 ms


In [9]:
# 実際のデータ確認
def ids2text(id_list, ptb):
    return ' '.join([ptb.i2w[f'{i}'] for i in id_list])

_ptb_src = datasets[('train', 'src')]
_ptb_tgt = datasets[('train', 'tgt')]
index = str(101)
_sample_src, _sample_tgt = _ptb_src.data[index], _ptb_tgt[index]
print(f'■ src-input \n{ids2text(_sample_src["input"], _ptb_src)}')
print(f'■ src-target \n{ids2text(_sample_src["target"], _ptb_src)}')
print(f'■ tgt-input\n{ids2text(_sample_tgt["input"], _ptb_tgt)}')
print(f'■ tgt-target\n{ids2text(_sample_tgt["target"], _ptb_tgt)}')

■ src-input 
<sos> other <sgltag> 青汁 話題
■ src-target 
other <sgltag> 青汁 話題 <eos>
■ tgt-input
<sos> 美味し すぎ と 話題 の フルーツ 青汁 ♪ ごくごく 飲み たい ほど 本当 に おいしい ん です ! <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
■ tgt-target
美味し すぎ と 話題 の フルーツ 青汁 ♪ ごくごく 飲み たい ほど 本当 に おいしい ん です ! <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


## build model

In [10]:
from ptb import SOS_INDEX, EOS_INDEX, PAD_INDEX, UNK_INDEX

In [11]:
%load_ext autoreload

In [12]:
%autoreload
model = SentenceVAE(
    vocab_size=datasets[('train', 'src')].vocab_size,
    sos_idx=SOS_INDEX,
    eos_idx=EOS_INDEX,
    pad_idx=PAD_INDEX,
    unk_idx=UNK_INDEX,
    max_sequence_length=args.max_sequence_length,
    embedding_size=args.embedding_size,
    rnn_type=args.rnn_type,
    hidden_size=args.hidden_size,
    word_dropout=args.word_dropout,
    embedding_dropout=args.embedding_dropout,
    latent_size=args.latent_size,
    num_layers=args.num_layers,
    bidirectional=args.bidirectional,
    
    # bow loss
    # bow_hidden_size=256,
    use_bow_loss=False,
    
    # kw base
    out_vocab_size=datasets[('train', 'tgt')].vocab_size,
    )

if torch.cuda.is_available():
    model = model.cuda()

In [13]:
model

SentenceVAE(
  (embedding): Embedding(2642, 300)
  (embedding_dropout): Dropout(p=0.5, inplace=False)
  (decoder_embedding): Embedding(10214, 300)
  (encoder_rnn): GRU(300, 256, batch_first=True)
  (decoder_rnn): GRU(300, 256, batch_first=True)
  (hidden2mean): Linear(in_features=256, out_features=16, bias=True)
  (hidden2logv): Linear(in_features=256, out_features=16, bias=True)
  (latent2hidden): Linear(in_features=16, out_features=256, bias=True)
  (outputs2vocab): Linear(in_features=256, out_features=10214, bias=True)
)

## log

In [14]:
def get_meta_model_dict(model, args):
    meta_dict = {k:v for k, v in model.__dict__.items() if not k[0] == '_'}
    meta_dict.update(args.obj)
    return meta_dict

In [15]:
print(f'tensorboard logging: {args.tensorboard_logging}')
ts = time.strftime('%Y-%m-%d-%H%M%S', time.localtime())
exp_name = experiment_name(args,ts)

if args.tensorboard_logging:
    writer_path = os.path.join(args.logdir, exp_name)
    writer = SummaryWriter(writer_path)
    writer.add_text("model", str(model))
    writer.add_text("args", str(args))
    writer.add_text("ts", ts)
    print(f'▼tensorboard logging\n{writer_path}')
    
save_model_path = os.path.join(args.save_model_path, exp_name, 'models')
os.makedirs(save_model_path, exist_ok=True)
print(f'▼ model save\n{save_model_path}')

# メタパラメータ保存
with open(os.path.join(save_model_path, 'model_meta.json'), 'w') as f:
    meta_dict = get_meta_model_dict(model, args)
    meta_dict.pop('tensor')
    json.dump(meta_dict, f)

tensorboard logging: True
▼tensorboard logging
/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs/kw2copy_ec.mediatag_TS=2019-12-25-151634_BS=32_LR=0.001_EB=300_GRU_HS=256_L=1_BI=0_LS=16_WD=0_ANN=LOGISTIC_K=0.0025_X0=2500
▼ model save
/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs/kw2copy_ec.mediatag_TS=2019-12-25-151634_BS=32_LR=0.001_EB=300_GRU_HS=256_L=1_BI=0_LS=16_WD=0_ANN=LOGISTIC_K=0.0025_X0=2500/models


## optimizer

In [16]:
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
step = 0

In [17]:
datasets.keys()

odict_keys([('train', 'src'), ('train', 'tgt'), ('valid', 'src'), ('valid', 'tgt')])

In [18]:
ae_datasets = {split: dataset for (split, src_tgt), dataset in datasets.items() if src_tgt == 'tgt'}
print(ids2text(ae_datasets['train'][0]['input'], ae_datasets['train']))
print(ids2text(ae_datasets['train'][0]['target'], ae_datasets['train']))

<sos> 息 スッキリ ! 口臭 サプリ が 凄い <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
息 スッキリ ! 口臭 サプリ が 凄い <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [19]:
_datasets = {}
for split in splits:
    src_dataset = datasets[(split, 'src')]
    tgt_dataset = datasets[(split, 'tgt')]
    assert len(src_dataset) == len(tgt_dataset)
    dataset = []
    for i in range(len(src_dataset)):
        src_set, tgt_set = src_dataset[i], tgt_dataset[i]
        _data = {}
        _data.update({f'src_{k}': v for k,v in src_set.items()})
        _data.update({f'tgt_{k}': v for k,v in tgt_set.items()})
        dataset.append(_data)
    _datasets[split] = dataset

In [20]:
_datasets.keys()

dict_keys(['train', 'valid'])

In [21]:
_datasets['train'][0]

{'src_input': array([2, 4, 5, 6, 7]),
 'src_target': array([4, 5, 6, 7, 3]),
 'src_length': 5,
 'tgt_input': array([ 2,  4,  5,  6,  7,  8,  9, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]),
 'tgt_target': array([ 4,  5,  6,  7,  8,  9, 10,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]),
 'tgt_length': 8}

In [22]:
train_target_ptb = datasets[('train', 'tgt')]
train_target_ptb

<ptb.PTB at 0x1a23c07c50>

In [23]:
from utils import ids2ptext

In [24]:
# %pdb on
for epoch in range(args.epochs):

    for split in splits:
        
        data_loader = DataLoader(
            dataset=_datasets[split],
            batch_size=args.batch_size,
            shuffle=split=='train',
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available()
        )

        tracker = defaultdict(tensor)

        # Enable/Disable Dropout
        if split == 'train':
            model.train()
        else:
            model.eval()

        for iteration, batch in enumerate(data_loader):
            
            batch_size = batch['src_input'].size(0)
            
            for k, v in batch.items():
                if torch.is_tensor(v):
                    batch[k] = to_var(v)
            
            # loss calculation
            cal_dict = model(batch['src_input'], batch['src_length'], batch['tgt_input'], batch['tgt_length'])
            logp, mean, logv, z = cal_dict['logp'], cal_dict['mean'], cal_dict['logv'], cal_dict['z']
            
            loss_dict = model.loss(logp, batch['tgt_target'], batch['tgt_length'], mean, logv, args.anneal_function, step, args.k, args.x0, bow_input=z)
            loss, NLL_loss, KL_weight, KL_loss, avg_bow_loss = loss_dict['loss'], loss_dict['NLL_loss'], loss_dict['KL_weight'], loss_dict['KL_loss'], loss_dict.get('avg_bow_loss')

            # backward + optimization
            if split == 'train':
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                step += 1

            # bookkeepeing
            tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data.view(1)))

            if args.tensorboard_logging:
                writer.add_scalar("%s/ELBO"%split.upper(), loss.data.item(), epoch*len(data_loader) + iteration)
                writer.add_scalar("%s/NLL Loss"%split.upper(), NLL_loss.data.item()/batch_size, epoch*len(data_loader) + iteration)
                writer.add_scalar("%s/KL Loss"%split.upper(), KL_loss.data.item()/batch_size, epoch*len(data_loader) + iteration)
                writer.add_scalar("%s/KL Weight"%split.upper(), KL_weight, epoch*len(data_loader) + iteration)
                if avg_bow_loss is not None:
                    writer.add_scalar("%s/BOW Loss"%split.upper(), avg_bow_loss, epoch*len(data_loader) + iteration)

            if iteration % args.print_every == 0 or iteration+1 == len(data_loader):
                print_text = "%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"%(split.upper(), iteration, len(data_loader)-1, loss.data.item(), NLL_loss.data.item()/batch_size, KL_loss.data.item()/batch_size, KL_weight)
                if avg_bow_loss is not None:
                    print_text += ', BOW Loss %9.4f,'%(avg_bow_loss)
                print(print_text)

            if split == 'valid':
                if 'target_sents' not in tracker:
                    tracker['target_sents'] = list()
                tracker['target_sents'] += idx2word(batch['tgt_target'].data, i2w=train_target_ptb.get_i2w(), pad_idx=PAD_INDEX)
                tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

        print("%s Epoch %02d/%i, Mean ELBO %9.4f"%(split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO'])))

        if args.tensorboard_logging:
            writer.add_scalar("%s-Epoch/ELBO"%split.upper(), torch.mean(tracker['ELBO']), epoch)


        # NOTE: dump. tensorboardで管理すればいいと思う
        # save a dump of all sentences and the encoded latent space
        # if split == 'valid':
        #    dump = {'target_sents':tracker['target_sents'], 'z':tracker['z'].tolist()}
        #    if not os.path.exists(os.path.join('dumps', ts)):
        #        os.makedirs('dumps/'+ts)
        #    with open(os.path.join('dumps/'+ts+'/valid_E%i.json'%epoch), 'w') as dump_file:
        #        json.dump(dump,dump_file)

        
        # TODO: テストデータのログ
        # if split == 'test':
            # 1. テストデータ全体に対してデコード
            # 2. 各種指標の計測
            # 3. デコードサンプルをtensorboard TEXTに保存
            # decoded_ids, _ = model.inference(z=z)
            # decoded_texts = [ids2ptext(text_ids.squeeze(), train_target_ptb.i2w) for text_ids in decoded_ids]

        # save checkpoint
        if split == 'train':
            checkpoint_path = os.path.join(save_model_path, f"model_E{epoch}.pytorch")
            torch.save(model.state_dict(), checkpoint_path)
            print("Model saved at %s"%checkpoint_path)

TRAIN Batch 0000/1228, Loss  169.8769, NLL-Loss  169.8762, KL-Loss    0.3722, KL-Weight  0.002
TRAIN Batch 0050/1228, Loss  114.7346, NLL-Loss  114.6901, KL-Loss   20.3569, KL-Weight  0.002
TRAIN Batch 0100/1228, Loss  109.6969, NLL-Loss  109.6292, KL-Loss   27.3648, KL-Weight  0.002
TRAIN Batch 0150/1228, Loss  120.6653, NLL-Loss  120.5855, KL-Loss   28.4718, KL-Weight  0.003


Traceback (most recent call last):
  File "/Users/s07309/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/Users/s07309/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/s07309/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/Users/s07309/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: 

- TEST時にやりたいこと
    - 評価指標による計測
    - デコードをTEXTに保存
- 評価指標の追加：TrainingのTEST時に評価
    - BLEU
    - Distinct-1, 2, full
    - Train_contains_decode