# POSベースモデル
![Group 112](https://user-images.githubusercontent.com/17490886/71764668-24c34700-2f2e-11ea-94d2-e4e82c3bffd4.png)

In [1]:
# ! gsutil -m rsync -d -r gs://kawamoto-ramiel/experiments_v3_pos_20200104/data ../data/eccos_v2/

In [2]:
import init

# Train

In [3]:
import os
import json
import time
import torch
import argparse
import numpy as np
from multiprocessing import cpu_count
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from collections import OrderedDict, defaultdict

from ptb import PTB
from utils import idx2word, experiment_name, AttributeDict
from models.model_pos import POSVAE
from models.model_utils import to_var
from glob import glob

In [4]:
top_dir = os.path.abspath('..')
runs_dir = f'{top_dir}/runs'
print(f'top_dir: {top_dir}\nruns_dir: {runs_dir}')

top_dir: /Users/s07309/gdrive/src/ca_dev/Sentence-VAE
runs_dir: /Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs


In [5]:
data_base_dir = f'{top_dir}/data'
data_name = 'eccos_v2'
data_dir = f'{data_base_dir}/{data_name}'
data_dir

'/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/data/eccos_v2'

In [6]:
log_dir, save_model_path = runs_dir, runs_dir
log_dir, save_model_path

('/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs',
 '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs')

In [7]:
def readlines(path):
    with open(path, 'r') as f:
        return [s.replace('\n', '') for s in f.readlines()]

def cal_max_file_lines(path):
    lines = readlines(path)
    line_lengths = [len(line.split(' ')) for line in lines]
    return max(line_lengths)
    
src_max_length = cal_max_file_lines(f'{data_dir}/src/ptb.train.txt')
tgt_max_length = cal_max_file_lines(f'{data_dir}/tgt/ptb.train.txt')
pos_max_length = cal_max_file_lines(f'{data_dir}/pos/ptb.train.txt')
print(f'src max: {src_max_length}, tgt max: {tgt_max_length}, pos max: {pos_max_length}')

src max: 51, tgt max: 51, pos max: 48


In [8]:
args = {
    'data_dir': data_dir,
    'create_data': False,
    'src_max_sequence_length': src_max_length,
    'tgt_max_sequence_length': tgt_max_length,
    'pos_max_sequence_length': pos_max_length,
    
    'min_occ': 1,
    'test': True,

    'epochs': 10,
    'batch_size': 32,
    'learning_rate': 0.001,
    
    'embedding_size': 300,
    'pos_embedding_size': 20,
    'rnn_type': 'gru',
    'hidden_size': 256,
    'num_layers': 1,
    'bidirectional': False,
    'latent_size': 16,
    'word_dropout': 0,
    'embedding_dropout': 0.5,

    'anneal_function': 'logistic',
    'k': 0.0025,
    'x0': 2500,

    'print_every': 50,
    'tensorboard_logging': True,
    'logdir': log_dir,
    'save_model_path': save_model_path,
    'experiment_name': f'posvae_{data_name}',
    
    'debug': False,
}

args = AttributeDict(args)

args.rnn_type = args.rnn_type.lower()
args.anneal_function = args.anneal_function.lower()

assert args.rnn_type in ['rnn', 'lstm', 'gru']
assert args.anneal_function in ['logistic', 'linear']
assert 0 <= args.word_dropout <= 1
args

<AttrDict{'data_dir': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/data/eccos_v2', 'create_data': False, 'src_max_sequence_length': 51, 'tgt_max_sequence_length': 51, 'pos_max_sequence_length': 48, 'min_occ': 1, 'test': True, 'epochs': 10, 'batch_size': 32, 'learning_rate': 0.001, 'embedding_size': 300, 'pos_embedding_size': 20, 'rnn_type': 'gru', 'hidden_size': 256, 'num_layers': 1, 'bidirectional': False, 'latent_size': 16, 'word_dropout': 0, 'embedding_dropout': 0.5, 'anneal_function': 'logistic', 'k': 0.0025, 'x0': 2500, 'print_every': 50, 'tensorboard_logging': True, 'logdir': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs', 'save_model_path': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs', 'experiment_name': 'posvae_eccos_v2', 'debug': False}>

## load data

In [9]:
%%time
import itertools
splits = ['train', 'valid'] + (['test'] if args.test else [])
data_types = ['src', 'tgt', 'pos']
datasets = OrderedDict()
print(f'loading {args.data_dir}')
for split, src_tgt in itertools.product(splits, data_types):
    key = (split, src_tgt)
    print(key)
    datasets[key] = PTB(
        data_dir=f'{args.data_dir}/{src_tgt}',
        split=split,
        create_data=args.create_data,
        max_sequence_length=args.obj[f'{src_tgt}_max_sequence_length'],
        min_occ=args.min_occ
    )
    print(f'vocab: {datasets[key].vocab_size}, records: {len(datasets[key].data)}')

loading /Users/s07309/gdrive/src/ca_dev/Sentence-VAE/data/eccos_v2
('train', 'src')
vocab: 10293, records: 44596
('train', 'tgt')
vocab: 10293, records: 44596
('train', 'pos')
vocab: 16, records: 44596
('valid', 'src')
vocab: 10293, records: 2477
('valid', 'tgt')
vocab: 10293, records: 2477
('valid', 'pos')
vocab: 16, records: 2477
('test', 'src')
vocab: 10293, records: 2479
('test', 'tgt')
vocab: 10293, records: 2479
('test', 'pos')
vocab: 16, records: 2479
CPU times: user 1.72 s, sys: 116 ms, total: 1.83 s
Wall time: 1.84 s


In [10]:
# 実際のデータ確認
def ids2text(id_list, ptb):
    return ' '.join([ptb.i2w[f'{i}'] for i in id_list])

_ptb_src = datasets[('train', 'src')]
_ptb_pos = datasets[('train', 'pos')]
_ptb_tgt = datasets[('train', 'tgt')]
index = str(100)
_sample_src, _sample_tgt, _sample_pos = _ptb_src[index], _ptb_tgt[index], _ptb_pos[index]
print(f'------- src --------')
print(f'■ src-input \n{ids2text(_sample_src["input"], _ptb_src)}')
print(f'■ src-target \n{ids2text(_sample_src["target"], _ptb_src)}')
print(f'------- pos --------')
print(f'■ pos-input \n{ids2text(_sample_pos["input"], _ptb_pos)}')
print(f'■ pos-target \n{ids2text(_sample_pos["target"], _ptb_pos)}')
print(f'------- tgt --------')
print(f'■ tgt-input\n{ids2text(_sample_tgt["input"], _ptb_tgt)}')
print(f'■ tgt-target\n{ids2text(_sample_tgt["target"], _ptb_tgt)}')

------- src --------
■ src-input 
<sos> 特別 な ケア を ルルルン で ! <sep> web 限定 の セット で しっかり お 顔 の 隅々 に まで アプローチ <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
■ src-target 
特別 な ケア を ルルルン で ! <sep> web 限定 の セット で しっかり お 顔 の 隅々 に まで アプローチ <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
------- pos --------
■ pos-input 
<sos> 名詞 助動詞 名詞 助詞 名詞 助詞 記号 名詞 名詞 助詞 名詞 助詞 副詞 接頭詞 名詞 助詞 名詞 助詞 助詞 名詞 <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
■ pos-target 
名詞 助動詞 名詞 助詞 名詞 助詞 記号 名詞 名詞 助詞 名詞 助詞 副詞 接頭詞 名詞 助詞 名詞 助詞 助詞 名詞 <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p

## build model

In [11]:
from ptb import SOS_INDEX, EOS_INDEX, PAD_INDEX, UNK_INDEX

In [12]:
len(datasets[('train', 'src')].w2i), len(datasets[('valid', 'src')].w2i), len(datasets[('test', 'src')].w2i)

(10293, 10293, 10293)

In [13]:
len(datasets[('train', 'tgt')].w2i), len(datasets[('valid', 'tgt')].w2i), len(datasets[('test', 'tgt')].w2i)

(10293, 10293, 10293)

In [14]:
len(datasets[('train', 'pos')].w2i), len(datasets[('valid', 'pos')].w2i), len(datasets[('test', 'pos')].w2i)

(16, 16, 16)

In [15]:
vocab = {
    'text': {'w2i': datasets[('train', 'src')].w2i, }
}

In [16]:
args

<AttrDict{'data_dir': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/data/eccos_v2', 'create_data': False, 'src_max_sequence_length': 51, 'tgt_max_sequence_length': 51, 'pos_max_sequence_length': 48, 'min_occ': 1, 'test': True, 'epochs': 10, 'batch_size': 32, 'learning_rate': 0.001, 'embedding_size': 300, 'pos_embedding_size': 20, 'rnn_type': 'gru', 'hidden_size': 256, 'num_layers': 1, 'bidirectional': False, 'latent_size': 16, 'word_dropout': 0, 'embedding_dropout': 0.5, 'anneal_function': 'logistic', 'k': 0.0025, 'x0': 2500, 'print_every': 50, 'tensorboard_logging': True, 'logdir': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs', 'save_model_path': '/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs', 'experiment_name': 'posvae_eccos_v2', 'debug': False}>

In [17]:
model = POSVAE(
    vocab_size=datasets[('train', 'src')].vocab_size,
    pos_vocab_size=datasets[('train', 'pos')].vocab_size,
    embedding_size=args.embedding_size,
    pos_embedding_size=args.pos_embedding_size,
    
    rnn_type=args.rnn_type,
    hidden_size=args.hidden_size,
    word_dropout=args.word_dropout,
    embedding_dropout=args.embedding_dropout,
    latent_size=args.latent_size,
    num_layers=args.num_layers,
    bidirectional=args.bidirectional,
    
    tgt_max_sequence_length=args.tgt_max_sequence_length,
    pos_max_sequence_length=args.pos_max_sequence_length,
)

if torch.cuda.is_available():
    model = model.cuda()

In [18]:
model

POSVAE(
  (embedding): Embedding(10293, 300)
  (embedding_dropout): Dropout(p=0.5, inplace=False)
  (pos_embedding): Embedding(16, 20)
  (encoder_rnn): GRU(300, 256, batch_first=True)
  (pos_decoder_rnn): GRU(20, 256, batch_first=True)
  (hidden2mean): Linear(in_features=256, out_features=16, bias=True)
  (hidden2logv): Linear(in_features=256, out_features=16, bias=True)
  (latent2pos_decoder_hidden): Linear(in_features=16, out_features=256, bias=True)
  (outputs2pos): Linear(in_features=256, out_features=16, bias=True)
  (pos_encoder_rnn): GRU(20, 256, batch_first=True)
  (text_decoder_rnn): GRU(300, 256, batch_first=True)
  (latent2pos_encoder_hidden): Linear(in_features=16, out_features=256, bias=True)
  (outputs2vocab): Linear(in_features=256, out_features=10293, bias=True)
)

## log

In [19]:
def cstr(obj):
    return f'```{obj}```'

In [20]:
def str_dict(_dict):
    return '  \n'.join([f'{k}: {v}' for k,v in _dict.items()])

In [21]:
def get_meta_model_dict(model, args):
    meta_dict = {k:v for k, v in model.__dict__.items() if not k[0] == '_'}
    meta_dict.update(args.obj)
    return meta_dict

In [22]:
print(f'tensorboard logging: {args.tensorboard_logging}')
ts = time.strftime('%Y-%m-%d-%H%M%S', time.localtime())
exp_name = experiment_name(args,ts)

if args.tensorboard_logging:
    writer_path = os.path.join(args.logdir, exp_name)
    writer = SummaryWriter(writer_path)
    writer.add_text("model", cstr(model.__repr__().replace('\n', '  \n')))
    writer.add_text("args", cstr(str_dict(args.obj)))
    writer.add_text("ts", ts)
    print(f'▼tensorboard logging\n{writer_path}')
    
save_model_path = os.path.join(args.save_model_path, exp_name, 'models')
os.makedirs(save_model_path, exist_ok=True)
print(f'▼ model save\n{save_model_path}')

# メタパラメータ保存
with open(os.path.join(save_model_path, 'model_meta.json'), 'w') as f:
    meta_dict = get_meta_model_dict(model, args)
    json.dump(meta_dict, f)

tensorboard logging: True
▼tensorboard logging
/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs/posvae_eccos_v2_TS=2020-01-06-005355_BS=32_LR=0.001_EB=300_GRU_HS=256_L=1_BI=0_LS=16_WD=0_ANN=LOGISTIC_K=0.0025_X0=2500
▼ model save
/Users/s07309/gdrive/src/ca_dev/Sentence-VAE/runs/posvae_eccos_v2_TS=2020-01-06-005355_BS=32_LR=0.001_EB=300_GRU_HS=256_L=1_BI=0_LS=16_WD=0_ANN=LOGISTIC_K=0.0025_X0=2500/models


## optimizer

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
LongTensor = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
step = 0

In [24]:
datasets.keys()

odict_keys([('train', 'src'), ('train', 'tgt'), ('train', 'pos'), ('valid', 'src'), ('valid', 'tgt'), ('valid', 'pos'), ('test', 'src'), ('test', 'tgt'), ('test', 'pos')])

In [25]:
ae_datasets = {split: dataset for (split, src_tgt), dataset in datasets.items() if src_tgt == 'tgt'}
print(ids2text(ae_datasets['train'][0]['input'], ae_datasets['train']))
print(ids2text(ae_datasets['train'][0]['target'], ae_datasets['train']))

<sos> 水 なし ! <sep> 美容 成分 しか 入っ て ない ! <sep> セラミド <num> 倍 ジェル が やばい 笑 <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
水 なし ! <sep> 美容 成分 しか 入っ て ない ! <sep> セラミド <num> 倍 ジェル が やばい 笑 <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [26]:
data_types

['src', 'tgt', 'pos']

In [27]:
# datasets[('train', 'src')].data['1']

In [28]:
datasets[('train', 'src')]['1']

{'input': array([ 2, 21, 22, 23, 24, 25,  9, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 'target': array([21, 22, 23, 24, 25,  9, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
         3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 'length': 18}

In [29]:
# ループ内で扱う用に変形
_datasets = {}
for split in splits:
    dataset = []
    print(f"{split}: {len(datasets[(split, 'src')])}")
    for i in range(len(datasets[(split, 'src')])):
        _data = {}
        for data_type in data_types:
            d = datasets[(split, data_type)][f'{i}']
            _data.update({f'{data_type}_{k}': v for k, v in d.items()})
            _data.update({f'{data_type}_raw': d['input'][1:]})
            _data.update({f'{data_type}_raw_length': d['length'] - 1})
            
        dataset.append(_data)
    _datasets[split] = dataset
    if args.debug:
        _data_limit = 300
        _datasets[split] = dataset[:_data_limit]
        print(f'debug → {_data_limit}')

train: 44596
valid: 2477
test: 2479


In [30]:
train_target_ptb = datasets[('train', 'tgt')]
train_target_ptb

<ptb.PTB at 0x1a2e447048>

In [31]:
from utils import ids2ptext
from metric import write_tensorboard_valid_metric, remove_pad_index

# Main

In [32]:
for epoch in range(args.epochs):

    for split in splits:
        
        data_loader = DataLoader(
            dataset=_datasets[split],
            batch_size=args.batch_size,
            shuffle=split=='train',
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available()
        )

        tracker = defaultdict(Tensor)

        # Enable/Disable Dropout
        if split == 'train':
            model.train()
        else:
            model.eval()

        for iteration, batch in enumerate(data_loader):
            batch_size = batch['src_input'].size(0)
            
            for k, v in batch.items():
                if torch.is_tensor(v):
                    batch[k] = to_var(v)
            
            # model output
            b = label_dict = batch
            out_dict = model(b['src_input'], b['src_length'], b['pos_input'], b['pos_length'])
            z = out_dict['z']
            # loss calculation
            loss_dict = model.loss(out_dict, label_dict, 100, args)
            loss = loss_dict['loss']

            # backward + optimization
            if split == 'train':
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                step += 1

            # for log
            loss_value_dict = {}
            for _name, _loss in loss_dict.items():
                loss_value = _loss.detach().item() if 'detach' in dir(_loss) else _loss
                if _name not in ['loss', 'KL_weight']:
                    loss_value /= batch_size
                loss_value_dict[_name] = loss_value
                
            # bookkeepeing
            tracker['Loss'] = torch.cat((tracker['Loss'], loss.detach().view(1)))

            if args.tensorboard_logging:
                _ = [writer.add_scalar(f'{split.upper()}/{name}', value, epoch*len(data_loader) + iteration) for name, value in loss_value_dict.items()]

            if iteration % args.print_every == 0 or iteration+1 == len(data_loader):
                print_text = f'{split.upper()} Batch {iteration}/{len(data_loader)-1}'
                for k, v in loss_value_dict.items():
                    print_text += f', {k} {v:9.4f}'
                print(print_text)

            if split == 'valid':
                def add_tokens_tracker(id_key, token_key, ids, dataset_key, sep=''):
                    tracker[id_key] = torch.cat((tracker.get(id_key, LongTensor()), ids.detach()), dim=0)
                    tracker[token_key] = tracker.get(token_key, []) + [ids2ptext(text_ids, datasets[dataset_key].i2w, sep=sep) for text_ids in ids]
                
                add_tokens_tracker('target_text_ids', 'target_texts', batch['tgt_target'].data, ('train', 'tgt'))
                add_tokens_tracker('target_pos_ids', 'target_poses', batch['pos_target'].data, ('train', 'pos'), sep=' ')
                tracker['z'] = torch.cat((tracker['z'], z.detach()), dim=0)
                with torch.no_grad():
                    # Inference
                    pos_decoded_ids = model.pos_inference(z=z) # z → POS
                    pos_decoded_input, pos_decoded_length = model.target2input(pos_decoded_ids)  # POS → POS Input
                    text_decoded_ids = model.text_inference(pos_decoded_input, pos_decoded_length, z) # POS Input → Text
                    add_tokens_tracker('decoded_text_ids', 'decoded_texts',  text_decoded_ids, ('train', 'tgt'))
                    add_tokens_tracker('decoded_pos_ids', 'decoded_poses',  pos_decoded_ids, ('train', 'pos'), sep=' ')

        print("%s Epoch %02d/%i, Mean Loss %9.4f"%(split.upper(), epoch, args.epochs, torch.mean(tracker['Loss'])))

        if args.tensorboard_logging:
            writer.add_scalar("%s-Epoch/Loss"%split.upper(), torch.mean(tracker['Loss']), epoch)
        
        if split == 'valid':
            decoded_id_list = remove_pad_index(tracker['decoded_text_ids'])
            valid_tgt_id_list = remove_pad_index(tracker['target_text_ids'])
            train_tgt_id_list = remove_pad_index([d['tgt_target'] for d in _datasets['train']]) # コピー率用
            write_tensorboard_valid_metric(writer, valid_tgt_id_list, decoded_id_list, train_tgt_id_list, datasets[('train', 'tgt')].i2w, split, epoch)

        # save checkpoint
        if split == 'train':
            checkpoint_path = os.path.join(save_model_path, f"model_E{epoch}.pytorch")
            torch.save(model.state_dict(), checkpoint_path)
            print("Model saved at %s"%checkpoint_path)

TRAIN Batch 0/1393, loss  210.4560, POS_NLL_loss   45.1309, TEXT_NLL_loss  165.3242, KL_weight    0.0025, KL_loss    0.3491


KeyboardInterrupt: 