# Inference

In [14]:
import os
import json
import torch
import argparse
import numpy as np

from model_gumbel import SentenceVAE
from utils import to_var, idx2word, interpolate, AttributeDict

In [15]:
import pandas as pd
pd.set_option("display.max_colwidth", 500) # 1セルに500文字入る
pd.set_option("display.max_rows", 100) # 100行表示できる

In [124]:
model_dict = {
#     'cvae_bowloss': './bin/2019-Dec-01-08:47:43/E9.pytorch',
#     'cvae_latent200': './bin/2019-Dec-01-10:27:08/E9.pytorch',
#     'cvae_bowloss_latent200': './bin/2019-Dec-01-10:25:05/E9.pytorch',
#     'cvae_bowloss_latent200_epoch100': './bin/2019-Dec-02-11:02:29/E99.pytorch',
#     'vae': './bin/2019-Dec-03-05:31:25/E9.pytorch',
#     'vae_gumbel_latent200_tau0.1_epoch100': './bin/2019-Dec-03-12:12:33/E99.pytorch',
    'vae_gumbel_latent16_tau0.1_epoch20': './bin/2019-Dec-05-10:01:18/E19.pytorch',
    'vae_gumbel_latent200_tau0.1_epoch20': './bin/2019-Dec-05-10:01:32/E19.pytorch',
    'vae_gumbel_latent16_tau0.5_epoch20': './bin/2019-Dec-05-10:01:34/E19.pytorch',
    'vae_gumbel_latent16_tau1_epoch20': './bin/2019-Dec-05-10:19:22/E19.pytorch',
    'vae_gumbel_latent16_tau5_epoch20': './bin/2019-Dec-05-10:20:10/E19.pytorch',
    'vae_gumbel_latent16_tau30_epoch20': './bin/2019-Dec-05-17:48:20/E19.pytorch',
}
tau_dict = {
#     'vae_gumbel_latent200_tau0.1_epoch100': 0.1,
    'vae_gumbel_latent16_tau0.1_epoch20': 0.1,
    'vae_gumbel_latent200_tau0.1_epoch20': 0.1,
    'vae_gumbel_latent16_tau0.5_epoch20': 0.5,
    'vae_gumbel_latent16_tau1_epoch20': 1,
    'vae_gumbel_latent16_tau5_epoch20': 5,
    'vae_gumbel_latent16_tau30_epoch20': 30,
}

In [17]:
args = {
    'num_samples': 10,
    'max_sequence_length': 50,
    'embedding_size': 300,
    'rnn_type': 'gru',
    'hidden_size': 256,
    'word_dropout': 0,
    'embedding_dropout': 0.5,
    'num_layers': 1,
    'bidirectional': False
}
args = AttributeDict(args)
args.rnn_type = args.rnn_type.lower()
assert args.rnn_type in ['rnn', 'lstm', 'gru']
assert 0 <= args.word_dropout <= 1

## utils

In [18]:
def to_tensor(arr_like, cuda=True):
    tensor = torch.Tensor(arr_like)
    return tensor if not cuda else tensor.cuda()

## load vocab

In [19]:
with open('./data/eccos/src/ptb.vocab.json', 'r') as file:
        src_vocab = json.load(file)
src_w2i, src_i2w = src_vocab['w2i'], src_vocab['i2w']

In [20]:
with open('./data/eccos/tgt/ptb.vocab.json', 'r') as file:
        tgt_vocab = json.load(file)
tgt_w2i, tgt_i2w = tgt_vocab['w2i'], tgt_vocab['i2w']

In [21]:
len(src_w2i), len(tgt_w2i)

(5619, 12106)

In [22]:
# args.obj['cond_embedding_size'] = 300
# args.obj['cond_hidden_size'] = 256

## load model

In [23]:
from ptb import SOS_INDEX, EOS_INDEX, PAD_INDEX, UNK_INDEX

In [24]:
# _dict = torch.load(model_dict['vae_gumbel'])
# model_shapes = {k: v.shape for k,v in _dict.items()}
# model_shapes

In [25]:
def load_model(name):
    path = model_dict[name]
    _dict = torch.load(path)
    model_shapes = {k: v.shape for k,v in _dict.items()}
    ext_kwargs = {}
    
    # BOW Loss
    bow_hidden_shape = model_shapes.get('latent2bow.0.weight')
    use_bow_loss = bow_hidden_shape is not None
    print(f'BOW Loss: {use_bow_loss}')
    if use_bow_loss:
        ext_kwargs['bow_hidden_size'] = bow_hidden_shape[0]
    else:
        ext_kwargs['use_bow_loss'] = False
        
    # Latent size
    latent_size = model_shapes.get('hidden2logv.bias')[0]
    print(f'Latent size: {latent_size}')
    
    # Gumbel
    gumbel_vocab_size, gumbel_embedding_size = model_shapes.get('hidden2gumbel.weight', [None, None])
    is_gumbel = gumbel_vocab_size is not None
    print(f'Gumbel: {is_gumbel}')
    if is_gumbel:
        ext_kwargs['is_gumbel'] = is_gumbel
        ext_kwargs['gumbel_tau'] = tau_dict[name]
    print(ext_kwargs)
        
    model = SentenceVAE(
        vocab_size=len(tgt_w2i),
        sos_idx=SOS_INDEX,
        eos_idx=EOS_INDEX,
        pad_idx=PAD_INDEX,
        unk_idx=UNK_INDEX,
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,        
        **ext_kwargs,
    )
    print(model)
    
    if not os.path.exists(path):
        raise FileNotFoundError(path)

    model.load_state_dict(torch.load(path))
    print("Model loaded from %s"%(path))

    if torch.cuda.is_available():
        model = model.cuda()
        
    model.eval()
    return model

In [26]:
model = load_model('vae_gumbel_latent16_tau1_epoch20')

BOW Loss: False
Latent size: 16
Gumbel: True
{'use_bow_loss': False, 'is_gumbel': True, 'gumbel_tau': 1}
SentenceVAE(
  (embedding): Embedding(12106, 300)
  (decoder_embedding): Embedding(12106, 300)
  (embedding_dropout): Dropout(p=0.5, inplace=False)
  (encoder_rnn): GRU(300, 256, batch_first=True)
  (decoder_rnn): GRU(300, 256, batch_first=True)
  (hidden2gumbel): Linear(in_features=256, out_features=12106, bias=True)
  (hidden2mean): Linear(in_features=300, out_features=16, bias=True)
  (hidden2logv): Linear(in_features=300, out_features=16, bias=True)
  (latent2hidden): Linear(in_features=16, out_features=256, bias=True)
  (outputs2vocab): Linear(in_features=256, out_features=12106, bias=True)
)
Model loaded from ./bin/2019-Dec-05-10:19:22/E19.pytorch


## sample 指定 inference

### load test data

In [27]:
from ptb import PTB
test_tgt_ptb = PTB(
    data_dir='./data/eccos/tgt/',
    split='test',
    create_data=False,
)

In [28]:
# 実際のデータ確認用
def ids2text(id_list, ptb, sep=''):
    return sep.join([ptb.i2w[f'{i}'] for i in id_list])

def ids2ptext(*args, **kwags):
    text = ids2text(*args, **kwags)
    return text.replace('<eos>', '').replace('<pad>', '').replace('<sos>', '')

In [29]:
def words2ids(words, ptb):
    assert type(words) == list
    return [ptb.w2i.get(word, UNK_INDEX) for word in words]

def words2sample(words, ptb):
    id_list = [SOS_INDEX] + words2ids(words, ptb)
    sample = {'input': id_list, 'length': len(id_list)}
    return sample

In [30]:
def sample_data(index):
    sample = test_tgt_ptb.data[f'{index}']
    sample_input = to_tensor(sample['input']).view(1,-1).to(dtype=torch.int64)
    sample_length = to_tensor([sample['length']]).to(dtype=torch.int64)
    return sample_input, sample_length

In [31]:
# データをピックアップ, 確認
# sample = words2sample('発想 日焼け止め スキンケア'.split(), test_tgt_ptb)
sample_input, sample_length = sample_data(242)
print(f'▼ Input length:{sample_length}\n{ids2ptext(sample_input[0], test_tgt_ptb)}')

▼ Input length:tensor([36], device='cuda:0')
すっぴんが可愛い子の共通<unk>として、スタイルも綺麗。メイクに頼らずに素から美人になるためにもダイエットを始めてみませんか?


### sampling

In [32]:
def encode(model, sample_input, sample_length):
    # gumbel softmax
    with torch.no_grad():
        hidden = model.encode(sample_input, sample_length)
        gumbel_softmax = model.gumbel_softmax(hidden)
        hidden = torch.matmul(gumbel_softmax, model.embedding.weight)
        mean, logv, z = model.hidden2latent(hidden)
    return {
        'hidden': hidden,
        'gumbel_softmax': gumbel_softmax,
        'mean': mean,
        'logv': logv,
        'z': z,
    }

In [40]:
# _softmax = np.array(softmax.tolist())[0]
is_valid = _softmax > 1/len(test_tgt_ptb.i2w)
valid_softmax = _softmax[is_valid]
sort_index = valid_softmax.argsort()

In [43]:
(_softmax > 0).sum()

2

array([ 901, 2328, 4030, ..., 8066, 8065,    0])

In [51]:
_softmax[is_valid]

array([1.])

In [52]:
np.where(is_valid)

(array([901]),)

In [133]:
1/len(test_tgt_ptb.i2w)

8.260366760284156e-05

In [96]:
is_valid = _softmax > 1/len(test_tgt_ptb.i2w)
sort_valid_index = _softmax.argsort()[::-1]

In [99]:
sort_valid_index[:is_valid.sum()]

array([ 3314, 11606])

In [100]:
def softmax_index(softmax, threshold):
    global _softmax, is_valid
    _softmax = np.array(softmax.tolist())[0]
    is_valid = _softmax > threshold
    sort_valid_index = _softmax.argsort()[::-1][:is_valid.sum()]
#     valid_softmax = _softmax[is_valid]
#     gs = [{'p': p, 'index': i} for p, i in zip(_softmax[is_valid], np.where(is_valid))]
    return list(_softmax[sort_valid_index]), list(sort_valid_index)

In [101]:
def encode_print_samples(model, sample_input, sample_length, n=30):
    # n回サンプリング
    args.num_samples = n
    encoded = encode(model, sample_input, sample_length)
    mean, std = encoded['mean'].squeeze(), torch.exp(0.5 * encoded['logv']).squeeze()
    z_dist = torch.distributions.normal.Normal(mean, std)
    z_list = z_dist.sample((n, ))
    
    global gumbel_index_list
    gumbel_p_list, gumbel_index_list = softmax_index(encoded['gumbel_softmax'], 1/len(test_tgt_ptb.i2w))
    samples, _ = model.inference(z=z_list)
    print('■ 入力')
    print(ids2ptext(sample_input.squeeze(), test_tgt_ptb))
    print('■ 抽出単語')
    print(ids2ptext(gumbel_index_list, test_tgt_ptb, sep=' '))
    print(f'■ {args.num_samples}件 サンプリング')
    print(*[s.replace('<eos>', '').replace(' ', '') for s in idx2word(samples, i2w=tgt_i2w, pad_idx=PAD_INDEX)], sep='\n')

In [102]:
import random
sample_index_list = random.sample(range(0, len(test_tgt_ptb)), 1000)

In [103]:
def encode_test_data(model, index_list):
    with torch.no_grad():
        encoded_list = []
        for i, data_i in enumerate(index_list):
            print(f'\r{i}/{len(index_list)}', end='')
            sample_input, sample_length = sample_data(data_i)
            encoded = encode(model, sample_input, sample_length)

            decoded_ids, _ = model.inference(z=encoded['mean'])

            input_text = ids2ptext(sample_input.squeeze(), test_tgt_ptb)

            # 単語抽出
            gumbel_p_list, gumbel_index_list = softmax_index(encoded['gumbel_softmax'], 1/len(test_tgt_ptb))
            gumbel_kws = [test_tgt_ptb.i2w[f'{i}'] for i in gumbel_index_list]
            decoded_text = ids2ptext(decoded_ids.squeeze(), test_tgt_ptb)

            encoded_list.append({
                'input_text': input_text,
                'gumbel_kws': gumbel_kws,
                'gumbel_probs': gumbel_p_list,
                'decoded_text': decoded_text,
                # **encoded,
            })
        return encoded_list

In [126]:
%%time
def load_and_encode(name):
    print(name)
    model = load_model(name)
    encoded_list = encode_test_data(model, sample_index_list)
    return encoded_list
    
model_test_encoded = {name: load_and_encode(name) for name in model_dict.keys() if 'latent16' in name}

vae_gumbel_latent16_tau0.1_epoch20
BOW Loss: False
Latent size: 16
Gumbel: True
{'use_bow_loss': False, 'is_gumbel': True, 'gumbel_tau': 0.1}
SentenceVAE(
  (embedding): Embedding(12106, 300)
  (decoder_embedding): Embedding(12106, 300)
  (embedding_dropout): Dropout(p=0.5, inplace=False)
  (encoder_rnn): GRU(300, 256, batch_first=True)
  (decoder_rnn): GRU(300, 256, batch_first=True)
  (hidden2gumbel): Linear(in_features=256, out_features=12106, bias=True)
  (hidden2mean): Linear(in_features=300, out_features=16, bias=True)
  (hidden2logv): Linear(in_features=300, out_features=16, bias=True)
  (latent2hidden): Linear(in_features=16, out_features=256, bias=True)
  (outputs2vocab): Linear(in_features=256, out_features=12106, bias=True)
)
Model loaded from ./bin/2019-Dec-05-10:01:18/E19.pytorch
999/1000vae_gumbel_latent16_tau0.5_epoch20
BOW Loss: False
Latent size: 16
Gumbel: True
{'use_bow_loss': False, 'is_gumbel': True, 'gumbel_tau': 0.5}
SentenceVAE(
  (embedding): Embedding(12106, 3

In [105]:
# model_test_encoded['vae_gumbel_latent16_tau0.1_epoch20'][3]

In [256]:
m = model_test_encoded['vae_gumbel_latent16_tau0.1_epoch20']

In [127]:
df_data = []
for name, e in model_test_encoded.items():
    df = pd.DataFrame(e)
    df['name'] = name
    df_data.append(df)

In [128]:
df_all_data = pd.concat(df_data)
df_all_data['kw_num'] = df_all_data.gumbel_kws.apply(lambda x: len(x))

In [129]:
df = df_all_data[['name', 'input_text', 'gumbel_kws', 'gumbel_probs', 'kw_num', 'decoded_text']]

In [141]:
df.sample(1)

Unnamed: 0,name,input_text,gumbel_kws,gumbel_probs,kw_num,decoded_text
653,vae_gumbel_latent16_tau30_epoch20,冬こそダメージケア!<person>髪になれるヘアケアアイテム,"[メガホン, 乾かす, エキス, パック, ドキッ, 勝負服, 変えれ, ラボ, 靴擦れ, 愛らしい, 締まっ, あなた色, アイケア, あれ, ヶ所, 温, 言い, 落とせる, バングス, 早期, チェック, 近道, 品切れ, 圧倒的, すっ, ので, 白肌, 柔らかい肌, 入れる, 左右, ヶ月, しょう, 年度, モテ仕草, きれい, 実施, 確かめ, カシミヤ, 創業, 既に, 意外, 雨, 今晩, 破産法, れる, クリスマスプレゼント, アディクション, 未知, サポート, セミロング, ≪, 陶器, 年間, 境界線, ω, アイメイク, 宝庫, アボカド, 診断, 色素, 噂, ツヤ感, 一部, しまっ, サービス, 探し, 努力, 結果, つらーい, スイーツレシピ, 追加, ほど, うんざり, なかっ, 果物, jk, クオリティ, 暑かっ, 一生, 体, お家, ピッタリ, 眠い, 知っ, だけ, まゆ毛, 竹下通り, 経験, 黒, ガール, 上げ, 慣れ, 会, がっかり, ニット, 飯, 合っ, リキッドルージュ, なんか, コラボ, ...]","[0.0032996749505400658, 0.0030826895963400602, 0.0026851564180105925, 0.0026479666121304035, 0.0009239482460543513, 0.000915618147701025, 0.0008479719399474561, 0.0007803154294379056, 0.0007802715408615768, 0.0007713402155786753, 0.0007645838195458055, 0.0007389404345303774, 0.0007341925520449877, 0.0007017346215434372, 0.0006770046311430633, 0.0006366467569023371, 0.0006162656936794519, 0.0006115199066698551, 0.0005706542287953198, 0.0005610229563899338, 0.0005530543276108801, 0.00052332889...",3064,<person>の新作コスメがかわいすぎる!


In [142]:
df[df.input_text=='冬こそダメージケア!<person>髪になれるヘアケアアイテム']

Unnamed: 0,name,input_text,gumbel_kws,gumbel_probs,kw_num,decoded_text
653,vae_gumbel_latent16_tau0.1_epoch20,冬こそダメージケア!<person>髪になれるヘアケアアイテム,[デイクリーム],[1.0],1,【<num>年最新版】<num>月号の雑誌付録が豪華すぎる!<num>月号雑誌付録がどれも豪華すぎる!
653,vae_gumbel_latent16_tau0.5_epoch20,冬こそダメージケア!<person>髪になれるヘアケアアイテム,"[ベストコスメランキング, させる, 体重計]","[0.8605139851570129, 0.09401451051235199, 0.045468688011169434]",3,【<num>年最新版】人気のbbクリームまとめ
653,vae_gumbel_latent16_tau1_epoch20,冬こそダメージケア!<person>髪になれるヘアケアアイテム,"[大胆, リプライ, 金額, 見落とし, 可愛く, 豆, 変えよ, 長所, 気, ふさわしい, 背, もらい, リムーバー, 妖精, 休暇, 咲き誇る, 沿線, による, 養蜂, 高さ, エビ, 結婚, レベル, に学ぶ, モチーフコスメ, 定額制, 性, 韓国式, 給料, 比較, お姫様, 太っ, 大き, 徹底的, 光と影, ソックス, きゅうり, スポットライト, 難しいー, 深い, ころん, 浸透, ボディショップ, むら, 神奈川, ポリッシャー, いえ, 手抜き, 再入荷, フラット, 度, ハイクオリティ]","[0.4670945107936859, 0.3388666808605194, 0.03131188824772835, 0.02235434763133526, 0.02213246189057827, 0.019365014508366585, 0.018594296649098396, 0.011130429804325104, 0.01047907117754221, 0.010127858258783817, 0.00926455482840538, 0.006829794961959124, 0.0068010143004357815, 0.004563991446048021, 0.0020591109059751034, 0.001619383692741394, 0.001512541202828288, 0.0012544573983177543, 0.0011133374646306038, 0.0009107079240493476, 0.0007614127243869007, 0.0007026929524727166, 0.00067313225...",52,【<num>年版】人気のリップクリーム<num>選!
653,vae_gumbel_latent16_tau5_epoch20,冬こそダメージケア!<person>髪になれるヘアケアアイテム,"[夕方, 目尻, ドンキ, 本館, 勢揃い, t, 至極, もちっと, 間に合う, 恒例, アロマ, 控えめ, ぞ, 不動, 忍ばせ, 小売り, 歩む, ジブリ, 勢い, メルヴェイユーズラデュレ, カサカサ, スポーツ, 光る, い, マジ, ‍, パーフェクトワン, おめかし, 困る, 告白, サービス, 新ブランド, という, レブロン, とろり, 密閉, ドモホルンリンクル, 紫外線, ラガーフェルド, サポート, 平行, jj, 赤ちゃん, 使い, 脱い, 利き手, 順, カタログ, 純度, gw, 南国, ジワジワ, 実力, わたし, 泊まり, サークル, 下半身, もの, みずみずしい, 立体的, コラボ, コロっと, エーザイ, ホール, <person>, ポーチインコスメ, 昼ごはん, 広がり, 色白, 定期購入, 巡っ, ストリート, もち肌, 新感覚, デジタル, なんとか, _, 盛り上がり, シトラス, ビストロ, まくり, 摂り, モノクロ, 仲良く, 内定, サテン, つけ, 楽ちん, たまらない, 年末年始, くずれ, エイジング, なー, も...","[0.005914709530770779, 0.005350309889763594, 0.003935209475457668, 0.003222839208319783, 0.003156389342620969, 0.0028806617483496666, 0.002791693666949868, 0.00248090666718781, 0.0024229073897004128, 0.002398343291133642, 0.0023278705775737762, 0.002292066579684615, 0.002185804070904851, 0.002153356559574604, 0.002149736974388361, 0.0020031158346682787, 0.002002231776714325, 0.0019839939195662737, 0.0019766660407185555, 0.0019562651868909597, 0.001936300890520215, 0.0018561322940513492, 0.00...",2399,お風呂上がりにおすすめ!<unk>で美肌を叶える方法
653,vae_gumbel_latent16_tau30_epoch20,冬こそダメージケア!<person>髪になれるヘアケアアイテム,"[メガホン, 乾かす, エキス, パック, ドキッ, 勝負服, 変えれ, ラボ, 靴擦れ, 愛らしい, 締まっ, あなた色, アイケア, あれ, ヶ所, 温, 言い, 落とせる, バングス, 早期, チェック, 近道, 品切れ, 圧倒的, すっ, ので, 白肌, 柔らかい肌, 入れる, 左右, ヶ月, しょう, 年度, モテ仕草, きれい, 実施, 確かめ, カシミヤ, 創業, 既に, 意外, 雨, 今晩, 破産法, れる, クリスマスプレゼント, アディクション, 未知, サポート, セミロング, ≪, 陶器, 年間, 境界線, ω, アイメイク, 宝庫, アボカド, 診断, 色素, 噂, ツヤ感, 一部, しまっ, サービス, 探し, 努力, 結果, つらーい, スイーツレシピ, 追加, ほど, うんざり, なかっ, 果物, jk, クオリティ, 暑かっ, 一生, 体, お家, ピッタリ, 眠い, 知っ, だけ, まゆ毛, 竹下通り, 経験, 黒, ガール, 上げ, 慣れ, 会, がっかり, ニット, 飯, 合っ, リキッドルージュ, なんか, コラボ, ...]","[0.0032996749505400658, 0.0030826895963400602, 0.0026851564180105925, 0.0026479666121304035, 0.0009239482460543513, 0.000915618147701025, 0.0008479719399474561, 0.0007803154294379056, 0.0007802715408615768, 0.0007713402155786753, 0.0007645838195458055, 0.0007389404345303774, 0.0007341925520449877, 0.0007017346215434372, 0.0006770046311430633, 0.0006366467569023371, 0.0006162656936794519, 0.0006115199066698551, 0.0005706542287953198, 0.0005610229563899338, 0.0005530543276108801, 0.00052332889...",3064,<person>の新作コスメがかわいすぎる!


In [273]:
df[df.input_text=='☆艶肌・ツル肌目指す洗顔料☆']

Unnamed: 0,name,input_text,gumbel_kws,gumbel_probs,decoded_text


In [274]:
df.name.unique()

array(['vae_gumbel_latent16_tau0.1_epoch20',
       'vae_gumbel_latent200_tau0.1_epoch20',
       'vae_gumbel_latent16_tau1_epoch20',
       'vae_gumbel_latent16_tau5_epoch20'], dtype=object)

In [252]:
# m_df = df[df.input_text=='ノーファンデ女子急増中!<person>が愛用するコスメはプロの現場で重宝されている実力派!「同世代に自信をもっておすすめしたいスキンケアです!']
# m_df

In [253]:
# import pytablewriter
# writer = pytablewriter.MarkdownTableWriter()
# writer.from_dataframe(m_df)
# writer.write_table()

In [180]:
# encoded_list[0]

In [181]:
# encoded_list[0]

In [182]:
import pandas as pd

In [191]:
df = pd.DataFrame(encoded_list)
df['kw_str'] = df.gumbel_kws.apply(lambda x: ' '.join(x))
df = df[['input_text', 'gumbel_kws', 'kw_str', 'gumbel_probs', 'decoded_text']]

In [255]:
# df[['input_text', 'gumbel_kws', 'gumbel_probs', 'decoded_text']].sample(10)

In [81]:
# ! pip install japanize_matplotlib

In [116]:
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.figure(figsize=(10, 5))
kw_count = df.groupby('kw_size').count()
# plt.title('横軸：キーワード数, 縦軸：レコード数（テストデータ）')
# plt.xticks(kw_count.index)
# plt.bar(kw_count.index, kw_count)

<Figure size 720x360 with 0 Axes>

In [118]:
kw_count[['count']]

Unnamed: 0_level_0,count
kw_size,Unnamed: 1_level_1
1,6325
2,345
3,14
4,1


In [108]:
kw_count.shape

(52, 4)

In [115]:
df['count'] = 1
kw_count = df.groupby('kw_str').count()
kw_count.sort_values('count', ascending=False)[['count']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,count
kw_str,Unnamed: 1_level_1
絵本,925
鮮やか,695
頑張る,608
びや,607
日本酒,599
再販,560
考える力,523
覚悟,518
スピルリナ,481
ハイヒール,365


In [91]:
df.groupby('kw_size').count().input_text.index

Int64Index([1, 2, 3, 4], dtype='int64', name='kw_size')

In [121]:
# df[df.kw_str == 'スピルリナ']

In [32]:
gs = [e[''] for e in encoded_list]

In [35]:
gs_w = [[test_tgt_ptb.i2w[f'{i}'] for i in ids] for ids in gs]

In [128]:
gs_wl = []
for e in encoded_list:
    gs_wl += e['gumbel_kws']

In [130]:
# gs_wl

In [132]:
len(gs_wl)

9603

In [133]:
len(set(gs_wl))

1

In [51]:
set(gs_wl)

{'はじめよ',
 'びや',
 'スピルリナ',
 'ハイヒール',
 'パーカー',
 '再販',
 '培養',
 '平成最後',
 '日本酒',
 '染み',
 '絵本',
 '考える力',
 '覚悟',
 '頑張る',
 '鮮やか'}

In [None]:
sample_input, sample_length = sample_data(37)
encode_print_samples(model, sample_input, sample_length, 10)

## 潜在空間のプロット
圧縮して分布を確認してみる

In [122]:
import umap.umap_ as umap

In [None]:
# 描画用
# https://github.com/lmcinnes/umap/blob/master/notebooks/UMAP%20usage%20and%20parameters.ipynb
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(12,8)})
def plot_scatter(u, title=''):
    fig = plt.figure()
    n_components = u.shape[1]
    plot_kwargs = {'alpha': 0.5, 's':5}
    if n_components == 1:
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], range(len(u)), **plot_kwargs)
    if n_components == 2:
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], u[:,1], **plot_kwargs)
    if n_components == 3:
        ax = fig.add_subplot(111, projection='3d', **plot_kwargs)
        ax.scatter(u[:,0], u[:,1], u[:,2])
    plt.title(title, fontsize=18)

### load test data

In [None]:
from ptb import PTB
src_test_ptb = PTB(
    data_dir='./data/eccos/src/',
    split='test',
    create_data=False,
)

In [None]:
tgt_test_ptb = PTB(
    data_dir='./data/eccos/tgt/',
    split='test',
    create_data=False,
)

In [None]:
# エンコードする対象を指定する
test_ptb = tgt_test_ptb

In [None]:
def sample_to_tensor(sample):
    sample_input = to_tensor(sample['input']).view(1,-1).to(dtype=torch.int64)
    sample_length = to_tensor([sample['length']]).to(dtype=torch.int64)
    return sample_input, sample_length

In [None]:
%%time
# テストデータを潜在変数に変換
with torch.no_grad():
    # encoded_samples = [model.encode(*sample_to_tensor(sample)) for i, sample in test_ptb.data.items()]
    if model.is_conditional:
        print('Encode Condition...')
        encoded_samples = [model.encode_condition(*sample_to_tensor(sample)) for i, sample in test_ptb.data.items()]
        encoded_mean_list = [cond_mean.tolist() for cond_hidden, cond_mean, cond_logv, cond_z in encoded_samples]
    else:
        print('Encode...')
        encoded_samples = [model.encode(*sample_to_tensor(sample)) for i, sample in test_ptb.data.items()]
        encoded_mean_list = [mean.tolist() for mean, logv, z in encoded_samples]

In [None]:
encoded_mean_arr = np.array(encoded_mean_list).squeeze()

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
mean_u = umap.UMAP().fit_transform(encoded_mean_arr)

In [None]:
plot_scatter(mean_u)

### TensorBoard Embedding Projectorへ

In [None]:
import pandas as pd
limit = 5000
df = pd.DataFrame(test_ptb.data).T
test_label_list = [ids2ptext(target, test_ptb) for target in df.target.tolist()]
from torch.utils.tensorboard import SummaryWriter
log_dir = '/root/user/work/logs'
model_name = f'gumbel_vae_latent16_epoch100_n={limit}'
writer = SummaryWriter(f'{log_dir}/{model_name}')
writer.add_embedding(torch.FloatTensor(encoded_mean_arr[:limit]), metadata=test_label_list[:limit])

# Embedding検証

In [None]:
model

In [None]:
model.embedding.weight

In [None]:
model.embedding.weight

In [None]:
def make_onehot(indexs):
    h = torch.zeros(model.embedding.weight.shape[0]).cuda()
    h[indexs] = 1
    return h

In [None]:
indexs = [0, 1]
onehot = make_onehot(indexs)

In [None]:
onehot

In [None]:
a1 = torch.matmul(h, model.embedding.weight)
a1[:5]

In [None]:
a2 = model.embedding(torch.tensor(indexs).cuda()).sum(dim=0)
a1[:5]

In [None]:
(a1 != a2).sum().item()

In [None]:
a1.shape