# English - Japanese Embeddings (3 versions)
`w266 Final Project: Crosslingual Word Embeddings`

Instead of traning on randomly substituted words, here we'll choose the translation that is closest to the context embedding vector.

# Notebook Setup

In [1]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

__Base Paths__

In [2]:
# Maya's paths
#BASE = '/home/mmillervedam/Data'
#PROJ = '/home/mmillervedam/ProjectRepo'

# Mona;s paths
BASE = '/home/miwamoto/Data'
PROJ = '/home/miwamoto/W266-Fall-2017-Final-Project'

GTT_BASE = PROJ + '/BaselineModels/data/ground_truth_translations/'

# directory to save pickled embeddings
SAVE_TO = BASE + '/embeddings'

__Globals__ - _the parameters below fully determine all 3 models in this NB_

In [3]:
# Data
LANG = ('en','ja')
FULL_TEXT = "/home/miwamoto/en_ja_shuf.txt"
VOCAB_INDEX = BASE + '/vocab/en_ja_small.pkl'
PANLEX = BASE + '/panlex/en_ja_dict.pkl'
GTT_PATH = GTT_BASE + "%s-%s-clean.csv" % (LANG[0], LANG[1])

# Model
EMBEDDING_SIZE = 200

# Training
nBATCHES = 50000 # <<< 1 epoch with our 1 million sentence corpus
BATCH_SIZE = 48
WINDOW_SIZE = 4
MAX_EPOCHS = 5 # fail safe
ALPHA = 0.5 # authors use a much smaller learning rate but train longer

# Load Data

In [4]:
from parsing import Corpus, BilingualVocabulary, batch_generator, get_common_words

In [5]:
# load corpus
raw_data = Corpus(FULL_TEXT)

In [6]:
# MI 
PANLEX = '/home/miwamoto/Data/panlex/en_ja_dict.pkl'

In [7]:
# load panlex dictionary
with open(PANLEX,'rb') as f:
    translations = pickle.load(f)

In [9]:
# load vocabulary
vocab = BilingualVocabulary([], languages = LANG)
with open(VOCAB_INDEX,'rb') as f:
    vocab.load_from_index(pickle.load(f))

In [10]:
# confirmations
print('... loaded %s panlex translations'%(len(translations)))
print('... loaded %s word %s vocabulary'%(vocab.size,vocab.language))

... loaded 634705 panlex translations
... loaded 20003 word ('en', 'ja') vocabulary


In [22]:
# Validation Words (for training printout)
TEST_WORDS = vocab.to_ids(['en_the','en_last', 'ja_月', 'ja_日本'])
print('... test word ids:', TEST_WORDS)
for i in range(9990,10020):
    print(vocab.index[i])
#print(vocab.wordset)

... test word ids: [3, 228, 10004, 10012]
en_exemption
en_bohemian
en_walnut
en_ljubljana
en_timor
en_venom
en_scriptures
en_tariff
en_penetration
en_pedal
en_transmissions
en_fluent
en_sexes
ja_年
ja_月
ja_日
ja_的
ja_3
ja_第
ja_人
ja_者
ja_後
ja_日本
ja_行う
ja_中
ja_一
ja_現在
ja_時
ja_化
ja_大学


In [23]:
# Ground Truth Translations
GTT_DF = pd.read_csv(GTT_PATH, names = [LANG[0], LANG[1]], sep=' ', header=None)
print('... loaded %s ground truth translations.'%(len(GTT_DF)))
print(GTT_PATH)
print(LANG[0], LANG[1])

... loaded 35354 ground truth translations.
/home/miwamoto/W266-Fall-2017-Final-Project/BaselineModels/data/ground_truth_translations/en-ja-clean.csv
en ja


In [24]:
# Evaluation Words (for reporting recall)
eval_words = [w for w in get_common_words(vocab) if w.startswith(LANG[1])]
EVAL_IDS = vocab.to_ids(eval_words)
print('... loaded %s evaluation words.' % (len(EVAL_IDS)))
#print(repr(eval_words[:5]).decode('unicode_escape'))

... loaded 4425 evaluation words.


# Method 1: Random Translations

### Initialize the model

In [25]:
from models import BiW2V_random

# create model
model_1 = BiW2V_random(bilingual_dict = translations,
                       vocab = vocab, H = EMBEDDING_SIZE)

# intialize TF graphs
model_1.BuildCoreGraph()
model_1.BuildTrainingGraph()
model_1.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [26]:
# fresh data generator
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [27]:
# train
start = time.time()
model_1.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate = ALPHA)
tot = (time.time() - start)
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.0034091506958
   [en_the] closest:  en_odyssey, en_ram, en_saturday, ja_早朝, en_beaver, ja_岳, en_feast, ja_朝日新聞社,
   [en_last] closest:  ja_派遣, ja_行ける, ja_様, ja_渡, ja_湯, en_duncan, ja_市区, en_implemented,
   [ja_月] closest:  ja_ホームラン, ja_標識, en_qualities, en_wartime, en_brilliant, en_real-time, ja_後述, ja_同調,
   [ja_日本] closest:  ja_セクタ, en_wheat, en_designed, en_disaster, ja_掴む, en_hello, ja_ふさわしい, en_gym,
... STEP 5000 : Average Loss : 4.80178324621
... STEP 10000 : Average Loss : 4.42765667364
   [en_the] closest:  en_a, en_and, en_to, en_feast, ja_朝日新聞社, ja_早朝, ja_つば, en_window,
   [en_last] closest:  ja_派遣, ja_行ける, ja_様, ja_湯, ja_渡, en_implemented, en_unknown, en_duncan,


### Save the Embeddings.

In [28]:
# context 
filename = SAVE_TO + '/en_ja_rand_500K_V_dec19.pkl'
with open(filename, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model_1.context_embeddings, f, pickle.HIGHEST_PROTOCOL)

# word
filename = SAVE_TO + '/en_ja_rand_500K_U_dec19.pkl'
with open(filename, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model_1.word_embeddings, f, pickle.HIGHEST_PROTOCOL)

### Evaluation

In [29]:
model_1.evaluate(source_lang = LANG[0], 
                 target_lang = LANG[1], 
                 gtt = GTT_DF, 
                 sample = EVAL_IDS,
                 top_k = 5,
                 verbose = True)

... Model Initialized
sim shape (4425, 20003)
vocab size 20003
word ja_歩兵
half of vocab 10003
word ja_経済
half of vocab 10004
word ja_カバー
half of vocab 10005
word ja_劣勢
half of vocab 10006
word ja_選択肢
half of vocab 10007
word ja_支部
half of vocab 10008
word ja_確認
half of vocab 10009
word ja_割る
half of vocab 10010
word ja_知的
half of vocab 10011
word ja_境界
half of vocab 10012
word ja_オレンジ
half of vocab 10013
word ja_書体
half of vocab 10014
word ja_急性
half of vocab 10015
word ja_トルコ
half of vocab 10016
word ja_事項
half of vocab 10017
word ja_ポンプ
half of vocab 10018
word ja_コンサルティング
half of vocab 10019
word ja_鉱物
half of vocab 10020
word ja_組み立て
half of vocab 10021
word ja_両親
half of vocab 10022
word ja_原理
half of vocab 10023
word ja_筋肉
half of vocab 10024
word ja_有料
half of vocab 10025
word ja_チューブ
half of vocab 10026
word ja_フォト
half of vocab 10027
word ja_入る
half of vocab 10028
word ja_条項
half of vocab 10029
word ja_浪人
half of vocab 10030
word ja_フォー
half of vocab 10031
word ja_奈良
half of v

word ja_愛人
half of vocab 10261
word ja_南極
half of vocab 10262
word ja_独身
half of vocab 10263
word ja_投稿
half of vocab 10264
word ja_怪物
half of vocab 10265
word ja_ゲット
half of vocab 10266
word ja_北西
half of vocab 10267
word ja_打者
half of vocab 10268
word ja_指数
half of vocab 10269
word ja_地下鉄
half of vocab 10270
word ja_保育
half of vocab 10271
word ja_吸収
half of vocab 10272
word ja_エリザベス
half of vocab 10273
word ja_翻訳
half of vocab 10274
word ja_ルイ
half of vocab 10275
word ja_ルカ
half of vocab 10276
word ja_独占
half of vocab 10277
word ja_水平
half of vocab 10278
word ja_タウン
half of vocab 10279
word ja_時折
half of vocab 10280
word ja_速い
half of vocab 10281
word ja_フォーク
half of vocab 10282
word ja_貨物
half of vocab 10283
word ja_考古学
half of vocab 10284
word ja_不便
half of vocab 10285
word ja_センス
half of vocab 10286
word ja_市場
half of vocab 10287
word ja_会員
half of vocab 10288
word ja_結晶
half of vocab 10289
word ja_却下
half of vocab 10290
word ja_タイムズ
half of vocab 10291
word ja_夕方
half of vocab 10

word ja_譲渡
half of vocab 10525
word ja_石橋
half of vocab 10526
word ja_整数
half of vocab 10527
word ja_レオ
half of vocab 10528
word ja_登山
half of vocab 10529
word ja_バンド
half of vocab 10530
word ja_池袋
half of vocab 10531
word ja_質量
half of vocab 10532
word ja_振る
half of vocab 10533
word ja_同志
half of vocab 10534
word ja_現場
half of vocab 10535
word ja_ステンレス
half of vocab 10536
word ja_改定
half of vocab 10537
word ja_十分
half of vocab 10538
word ja_定員
half of vocab 10539
word ja_ケーブル
half of vocab 10540
word ja_仲間
half of vocab 10541
word ja_公国
half of vocab 10542
word ja_経費
half of vocab 10543
word ja_切り離す
half of vocab 10544
word ja_速度
half of vocab 10545
word ja_天国
half of vocab 10546
word ja_叔父
half of vocab 10547
word ja_テスト
half of vocab 10548
word ja_灰色
half of vocab 10549
word ja_思考
half of vocab 10550
word ja_中心
half of vocab 10551
word ja_サイド
half of vocab 10552
word ja_サイト
half of vocab 10553
word ja_野望
half of vocab 10554
word ja_御所
half of vocab 10555
word ja_学科
half of vocab 105

word ja_アイヌ
half of vocab 10787
word ja_根拠
half of vocab 10788
word ja_努力
half of vocab 10789
word ja_路線
half of vocab 10790
word ja_担保
half of vocab 10791
word ja_報告
half of vocab 10792
word ja_再開
half of vocab 10793
word ja_意見
half of vocab 10794
word ja_ウッド
half of vocab 10795
word ja_コロンビア
half of vocab 10796
word ja_もっと
half of vocab 10797
word ja_派手
half of vocab 10798
word ja_提出
half of vocab 10799
word ja_成功
half of vocab 10800
word ja_充実
half of vocab 10801
word ja_うむ
half of vocab 10802
word ja_船団
half of vocab 10803
word ja_想い
half of vocab 10804
word ja_分離
half of vocab 10805
word ja_シリーズ
half of vocab 10806
word ja_不具合
half of vocab 10807
word ja_封鎖
half of vocab 10808
word ja_国名
half of vocab 10809
word ja_意識
half of vocab 10810
word ja_総長
half of vocab 10811
word ja_クルーズ
half of vocab 10812
word ja_事例
half of vocab 10813
word ja_遺物
half of vocab 10814
word ja_カウンタ
half of vocab 10815
word ja_予定
half of vocab 10816
word ja_蒸気
half of vocab 10817
word ja_ロビン
half of vocab 

word ja_衝突
half of vocab 11049
word ja_バーン
half of vocab 11050
word ja_マジック
half of vocab 11051
word ja_増加
half of vocab 11052
word ja_リニューアル
half of vocab 11053
word ja_数学
half of vocab 11054
word ja_アルゴリズム
half of vocab 11055
word ja_数字
half of vocab 11056
word ja_天井
half of vocab 11057
word ja_改良
half of vocab 11058
word ja_プロフィール
half of vocab 11059
word ja_口径
half of vocab 11060
word ja_手数料
half of vocab 11061
word ja_工場
half of vocab 11062
word ja_シャツ
half of vocab 11063
word ja_布告
half of vocab 11064
word ja_驚異
half of vocab 11065
word ja_悪魔
half of vocab 11066
word ja_引退
half of vocab 11067
word ja_防止
half of vocab 11068
word ja_外伝
half of vocab 11069
word ja_メカ
half of vocab 11070
word ja_テキサス
half of vocab 11071
word ja_母方
half of vocab 11072
word ja_快適
half of vocab 11073
word ja_ステップ
half of vocab 11074
word ja_旅客
half of vocab 11075
word ja_自伝
half of vocab 11076
word ja_守護
half of vocab 11077
word ja_比率
half of vocab 11078
word ja_台地
half of vocab 11079
word ja_逮捕
half of

word ja_人物
half of vocab 11317
word ja_上方
half of vocab 11318
word ja_グランド
half of vocab 11319
word ja_ディスプレイ
half of vocab 11320
word ja_会う
half of vocab 11321
word ja_方法
half of vocab 11322
word ja_難波
half of vocab 11323
word ja_回廊
half of vocab 11324
word ja_ヘッド
half of vocab 11325
word ja_ドット
half of vocab 11326
word ja_血管
half of vocab 11327
word ja_バレー
half of vocab 11328
word ja_勉強
half of vocab 11329
word ja_選任
half of vocab 11330
word ja_運営
half of vocab 11331
word ja_デバイス
half of vocab 11332
word ja_本堂
half of vocab 11333
word ja_人気
half of vocab 11334
word ja_ダグラス
half of vocab 11335
word ja_切り替え
half of vocab 11336
word ja_ビタミン
half of vocab 11337
word ja_進化
half of vocab 11338
word ja_待遇
half of vocab 11339
word ja_エラー
half of vocab 11340
word ja_命題
half of vocab 11341
word ja_機密
half of vocab 11342
word ja_編纂
half of vocab 11343
word ja_レコーディング
half of vocab 11344
word ja_医師
half of vocab 11345
word ja_マドリード
half of vocab 11346
word ja_スター
half of vocab 11347
word ja_娯楽
h

word ja_認定
half of vocab 11582
word ja_生い立ち
half of vocab 11583
word ja_ブック
half of vocab 11584
word ja_画像
half of vocab 11585
word ja_スマート
half of vocab 11586
word ja_売上
half of vocab 11587
word ja_司祭
half of vocab 11588
word ja_語源
half of vocab 11589
word ja_ファーム
half of vocab 11590
word ja_人柄
half of vocab 11591
word ja_通訳
half of vocab 11592
word ja_ベン
half of vocab 11593
word ja_親戚
half of vocab 11594
word ja_配下
half of vocab 11595
word ja_ベラ
half of vocab 11596
word ja_ベル
half of vocab 11597
word ja_学校
half of vocab 11598
word ja_幕府
half of vocab 11599
word ja_数値
half of vocab 11600
word ja_決済
half of vocab 11601
word ja_屋内
half of vocab 11602
word ja_土器
half of vocab 11603
word ja_ケンブリッジ
half of vocab 11604
word ja_ママ
half of vocab 11605
word ja_因果
half of vocab 11606
word ja_小銃
half of vocab 11607
word ja_マリ
half of vocab 11608
word ja_マン
half of vocab 11609
word ja_水曜
half of vocab 11610
word ja_攻撃
half of vocab 11611
word ja_ヒップ
half of vocab 11612
word ja_ヒット
half of vocab 1

word ja_レンジ
half of vocab 11842
word ja_半分
half of vocab 11843
word ja_ゴム
half of vocab 11844
word ja_削除
half of vocab 11845
word ja_つながり
half of vocab 11846
word ja_プラハ
half of vocab 11847
word ja_ミス
half of vocab 11848
word ja_拡大
half of vocab 11849
word ja_脚本
half of vocab 11850
word ja_豊橋
half of vocab 11851
word ja_二つ
half of vocab 11852
word ja_アトランタ
half of vocab 11853
word ja_騒音
half of vocab 11854
word ja_防衛
half of vocab 11855
word ja_プラン
half of vocab 11856
word ja_フライ
half of vocab 11857
word ja_碑文
half of vocab 11858
word ja_打撃
half of vocab 11859
word ja_アウグスト
half of vocab 11860
word ja_命名
half of vocab 11861
word ja_真実
half of vocab 11862
word ja_受賞
half of vocab 11863
word ja_地元
half of vocab 11864
word ja_ボート
half of vocab 11865
word ja_ベスト
half of vocab 11866
word ja_告白
half of vocab 11867
word ja_教職員
half of vocab 11868
word ja_告発
half of vocab 11869
word ja_天才
half of vocab 11870
word ja_発明
half of vocab 11871
word ja_パルス
half of vocab 11872
word ja_コミック
half of vo

word ja_混血
half of vocab 12108
word ja_乗り換え
half of vocab 12109
word ja_発足
half of vocab 12110
word ja_定理
half of vocab 12111
word ja_バス
half of vocab 12112
word ja_バイ
half of vocab 12113
word ja_鋳造
half of vocab 12114
word ja_金銭
half of vocab 12115
word ja_化学
half of vocab 12116
word ja_最強
half of vocab 12117
word ja_ジェームズ
half of vocab 12118
word ja_バカ
half of vocab 12119
word ja_進む
half of vocab 12120
word ja_砂漠
half of vocab 12121
word ja_事業
half of vocab 12122
word ja_区画
half of vocab 12123
word ja_新幹線
half of vocab 12124
word ja_凍結
half of vocab 12125
word ja_モデル
half of vocab 12126
word ja_トラブル
half of vocab 12127
word ja_旗艦
half of vocab 12128
word ja_議長
half of vocab 12129
word ja_公爵
half of vocab 12130
word ja_主演
half of vocab 12131
word ja_最終
half of vocab 12132
word ja_合金
half of vocab 12133
word ja_編集
half of vocab 12134
word ja_超人
half of vocab 12135
word ja_与える
half of vocab 12136
word ja_座席
half of vocab 12137
word ja_変形
half of vocab 12138
word ja_実験
half of vocab 1213

word ja_洪水
half of vocab 12370
word ja_アプローチ
half of vocab 12371
word ja_マウント
half of vocab 12372
word ja_配置
half of vocab 12373
word ja_反逆
half of vocab 12374
word ja_花園
half of vocab 12375
word ja_ソフト
half of vocab 12376
word ja_デモ
half of vocab 12377
word ja_ついに
half of vocab 12378
word ja_武道
half of vocab 12379
word ja_学園
half of vocab 12380
word ja_伝導
half of vocab 12381
word ja_十字軍
half of vocab 12382
word ja_固定
half of vocab 12383
word ja_透明
half of vocab 12384
word ja_上回る
half of vocab 12385
word ja_ソビエト
half of vocab 12386
word ja_本質
half of vocab 12387
word ja_物流
half of vocab 12388
word ja_感度
half of vocab 12389
word ja_鉱業
half of vocab 12390
word ja_橋梁
half of vocab 12391
word ja_準拠
half of vocab 12392
word ja_聴覚
half of vocab 12393
word ja_食事
half of vocab 12394
word ja_ユニバーサル
half of vocab 12395
word ja_プリンス
half of vocab 12396
word ja_場所
half of vocab 12397
word ja_プール
half of vocab 12398
word ja_到着
half of vocab 12399
word ja_クリスマス
half of vocab 12400
word ja_会社
half of

word ja_伸びる
half of vocab 12634
word ja_ハリケーン
half of vocab 12635
word ja_マンガ
half of vocab 12636
word ja_違い
half of vocab 12637
word ja_上昇
half of vocab 12638
word ja_生徒
half of vocab 12639
word ja_ショック
half of vocab 12640
word ja_大気
half of vocab 12641
word ja_レポート
half of vocab 12642
word ja_玩具
half of vocab 12643
word ja_移住
half of vocab 12644
word ja_複雑
half of vocab 12645
word ja_マーケティング
half of vocab 12646
word ja_批判
half of vocab 12647
word ja_基準
half of vocab 12648
word ja_俳句
half of vocab 12649
word ja_言葉
half of vocab 12650
word ja_科学
half of vocab 12651
word ja_論争
half of vocab 12652
word ja_台湾
half of vocab 12653
word ja_受け入れる
half of vocab 12654
word ja_発達
half of vocab 12655
word ja_合併
half of vocab 12656
word ja_食べる
half of vocab 12657
word ja_コンテンツ
half of vocab 12658
word ja_リンカーン
half of vocab 12659
word ja_処罰
half of vocab 12660
word ja_反響
half of vocab 12661
word ja_テクニック
half of vocab 12662
word ja_発光
half of vocab 12663
word ja_共通
half of vocab 12664
word ja_請求
h

word ja_指揮
half of vocab 12894
word ja_推奨
half of vocab 12895
word ja_被る
half of vocab 12896
word ja_邸宅
half of vocab 12897
word ja_困難
half of vocab 12898
word ja_ランク
half of vocab 12899
word ja_動詞
half of vocab 12900
word ja_辞書
half of vocab 12901
word ja_マレーシア
half of vocab 12902
word ja_マップ
half of vocab 12903
word ja_戦車
half of vocab 12904
word ja_写本
half of vocab 12905
word ja_高度
half of vocab 12906
word ja_日本人
half of vocab 12907
word ja_原点
half of vocab 12908
word ja_捕食
half of vocab 12909
word ja_カウント
half of vocab 12910
word ja_黄色
half of vocab 12911
word ja_予防
half of vocab 12912
word ja_フル
half of vocab 12913
word ja_熱帯
half of vocab 12914
word ja_最上
half of vocab 12915
word ja_便利
half of vocab 12916
word ja_標本
half of vocab 12917
word ja_覚醒
half of vocab 12918
word ja_中国語
half of vocab 12919
word ja_プロセッサ
half of vocab 12920
word ja_呼称
half of vocab 12921
word ja_愛する
half of vocab 12922
word ja_制約
half of vocab 12923
word ja_落語
half of vocab 12924
word ja_東部
half of vocab 1

word ja_セルフ
half of vocab 13155
word ja_大成
half of vocab 13156
word ja_大西洋
half of vocab 13157
word ja_近隣
half of vocab 13158
word ja_解読
half of vocab 13159
word ja_堤防
half of vocab 13160
word ja_クルー
half of vocab 13161
word ja_刑務所
half of vocab 13162
word ja_ウー
half of vocab 13163
word ja_税金
half of vocab 13164
word ja_虐殺
half of vocab 13165
word ja_飲む
half of vocab 13166
word ja_ラヴ
half of vocab 13167
word ja_事前
half of vocab 13168
word ja_戦場
half of vocab 13169
word ja_狭い
half of vocab 13170
word ja_頭部
half of vocab 13171
word ja_裁定
half of vocab 13172
word ja_ターゲット
half of vocab 13173
word ja_教会
half of vocab 13174
word ja_練習
half of vocab 13175
word ja_権利
half of vocab 13176
word ja_ボード
half of vocab 13177
word ja_関節
half of vocab 13178
word ja_ケベック
half of vocab 13179
word ja_ジェット
half of vocab 13180
word ja_菩薩
half of vocab 13181
word ja_影響
half of vocab 13182
word ja_レーシング
half of vocab 13183
word ja_チェック
half of vocab 13184
word ja_行列
half of vocab 13185
word ja_宰相
half of voc

word ja_発見
half of vocab 13422
word ja_鋭い
half of vocab 13423
word ja_一度
half of vocab 13424
word ja_世俗
half of vocab 13425
word ja_皮肉
half of vocab 13426
word ja_力学
half of vocab 13427
word ja_未亡人
half of vocab 13428
word ja_ユリウス
half of vocab 13429
word ja_モスクワ
half of vocab 13430
word ja_カルロス
half of vocab 13431
word ja_感動
half of vocab 13432
word ja_起動
half of vocab 13433
word ja_取り組み
half of vocab 13434
word ja_利点
half of vocab 13435
word ja_始める
half of vocab 13436
word ja_免許
half of vocab 13437
word ja_神経
half of vocab 13438
word ja_指標
half of vocab 13439
word ja_リボン
half of vocab 13440
word ja_プロダクション
half of vocab 13441
word ja_ドライブ
half of vocab 13442
word ja_ドライバ
half of vocab 13443
word ja_省略
half of vocab 13444
word ja_歓迎
half of vocab 13445
word ja_ロイヤル
half of vocab 13446
word ja_ムスリム
half of vocab 13447
word ja_栽培
half of vocab 13448
word ja_マスト
half of vocab 13449
word ja_コース
half of vocab 13450
word ja_クロス
half of vocab 13451
word ja_レーダ
half of vocab 13452
word ja_周辺


word ja_リー
half of vocab 13683
word ja_メイン
half of vocab 13684
word ja_追加
half of vocab 13685
word ja_フロリダ
half of vocab 13686
word ja_メイド
half of vocab 13687
word ja_神殿
half of vocab 13688
word ja_リマ
half of vocab 13689
word ja_トマス
half of vocab 13690
word ja_ファミコン
half of vocab 13691
word ja_原告
half of vocab 13692
word ja_ゲル
half of vocab 13693
word ja_キス
half of vocab 13694
word ja_空気
half of vocab 13695
word ja_産業
half of vocab 13696
word ja_感謝
half of vocab 13697
word ja_手続
half of vocab 13698
word ja_職人
half of vocab 13699
word ja_剥奪
half of vocab 13700
word ja_広い
half of vocab 13701
word ja_賃金
half of vocab 13702
word ja_ドック
half of vocab 13703
word ja_受け入れ
half of vocab 13704
word ja_キリスト教
half of vocab 13705
word ja_理論
half of vocab 13706
word ja_配達
half of vocab 13707
word ja_狩り
half of vocab 13708
word ja_シカゴ
half of vocab 13709
word ja_入札
half of vocab 13710
word ja_ウェブサイト
half of vocab 13711
word ja_併合
half of vocab 13712
word ja_バレエ
half of vocab 13713
word ja_営利
half of 

word ja_市町村
half of vocab 13953
word ja_ボストン
half of vocab 13954
word ja_活動
half of vocab 13955
word ja_メン
half of vocab 13956
word ja_美人
half of vocab 13957
word ja_楽天
half of vocab 13958
word ja_テナント
half of vocab 13959
word ja_解任
half of vocab 13960
word ja_メモ
half of vocab 13961
word ja_ショッピング
half of vocab 13962
word ja_統計
half of vocab 13963
word ja_パーソナリティ
half of vocab 13964
word ja_財政
half of vocab 13965
word ja_有罪
half of vocab 13966
word ja_ドラフト
half of vocab 13967
word ja_ジョンソン
half of vocab 13968
word ja_両面
half of vocab 13969
word ja_ダン
half of vocab 13970
word ja_マイ
half of vocab 13971
word ja_レンタル
half of vocab 13972
word ja_ペット
half of vocab 13973
word ja_ハッスル
half of vocab 13974
word ja_マス
half of vocab 13975
word ja_ノーフォーク
half of vocab 13976
word ja_アマチュア
half of vocab 13977
word ja_骨折
half of vocab 13978
word ja_兵士
half of vocab 13979
word ja_寄稿
half of vocab 13980
word ja_概略
half of vocab 13981
word ja_バンコク
half of vocab 13982
word ja_別れ
half of vocab 13983
word j

word ja_ミシガン
half of vocab 14217
word ja_人口
half of vocab 14218
word ja_瀬戸
half of vocab 14219
word ja_不快
half of vocab 14220
word ja_プラットフォーム
half of vocab 14221
word ja_パンツ
half of vocab 14222
word ja_パンチ
half of vocab 14223
word ja_未遂
half of vocab 14224
word ja_赤外線
half of vocab 14225
word ja_異端
half of vocab 14226
word ja_ブルガリア
half of vocab 14227
word ja_クラウン
half of vocab 14228
word ja_復活
half of vocab 14229
word ja_ミー
half of vocab 14230
word ja_不在
half of vocab 14231
word ja_松竹
half of vocab 14232
word ja_ミル
half of vocab 14233
word ja_遺伝
half of vocab 14234
word ja_オープニング
half of vocab 14235
word ja_動機
half of vocab 14236
word ja_コソボ
half of vocab 14237
word ja_嫌い
half of vocab 14238
word ja_プラザ
half of vocab 14239
word ja_プラグ
half of vocab 14240
word ja_ラーメン
half of vocab 14241
word ja_ミニ
half of vocab 14242
word ja_プラス
half of vocab 14243
word ja_陸軍
half of vocab 14244
word ja_体質
half of vocab 14245
word ja_ハワイ
half of vocab 14246
word ja_精度
half of vocab 14247
word ja_アルメニ

{'ja_\xe6\xad\xa9\xe5\x85\xb5': 0}

In [30]:
evaluate(self, source_lang, target_lang, gtt, sample, verbose=True)

NameError: name 'evaluate' is not defined

# Method 2: Most Common Target Translation

### Initialize the model

In [31]:
from models import BiW2V_mle

# create model
model_2 = BiW2V_mle(bilingual_dict = translations,
                       vocab = vocab, H = EMBEDDING_SIZE)

# intialize TF graphs
model_2.BuildCoreGraph()
model_2.BuildTrainingGraph()
model_2.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [32]:
# fresh data generator
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [33]:
# train
start = time.time()
model_2.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate = ALPHA)
tot = (time.time() - start)
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.00321937179565
   [en_the] closest:  ja_re, en_16, ja_立教, ja_居, en_buddhist, ja_野生, en_granddaughter, ja_余地,
   [en_last] closest:  en_labrador, en_hal, ja_良, en_imam, en_cerebral, en_aging, en_bite, en_detailed,
   [ja_月] closest:  ja_遭遇, en_russell, ja_社会党, en_getting, en_maurice, ja_向上, ja_npo, ja_清朝,
   [ja_日本] closest:  en_melting, ja_院, en_pupil, en_line, ja_尹, ja_出典, en_parody, en_striker,
... STEP 5000 : Average Loss : 5.10440200653
... STEP 10000 : Average Loss : 4.62019368992
   [en_the] closest:  en_a, en_16, ja_余地, ja_野生, en_particularly, ja_兼ねる, en_life, ja_居,
   [en_last] closest:  en_labrador, en_hal, ja_良, en_imam, en_bite, en_aging, en_cerebral, en_forget,


### Save the Embeddings.

In [34]:
# context 
filename = SAVE_TO + '/en_ja_mle_50K_V_dec19.pkl'
with open(filename, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model_2.context_embeddings, f, pickle.HIGHEST_PROTOCOL)

# word
filename = SAVE_TO + '/en_ja_mle_50K_U_dec19.pkl'
with open(filename, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model_2.word_embeddings, f, pickle.HIGHEST_PROTOCOL)

# Method 3: Closest Translation

### Initialize the model

In [35]:
from models import BiW2V_nn

# create model
model_3 = BiW2V_nn(bilingual_dict = translations,
                   vocab = vocab, H = EMBEDDING_SIZE)

# intialize TF graphs
model_3.BuildCoreGraph()
model_3.BuildTrainingGraph()
model_3.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [36]:
# fresh data generator
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [37]:
# train
nBATCHES = 5000 # Takes too long w/ nn so we'll only do 5K
start = time.time()
model_3.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate = ALPHA)
tot = (time.time() - start)
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.0283244400024
   [en_the] closest:  ja_飽和, ja_徹底的, ja_改良, ja_押し出す, en_comprising, ja_扱う, en_owner, ja_聖堂,
   [en_last] closest:  ja_外務省, ja_地帯, en_garbage, ja_民事, en_dc, en_cal, en_evidence, ja_クイズ,
   [ja_月] closest:  en_enjoy, en_box, ja_ピース, en_canal, ja_テキスト, en_olive, ja_上手い, en_instruction,
   [ja_日本] closest:  ja_恐れ, en_invalid, en_nobility, ja_クライマックス, en_evaluate, ja_派手, en_march, ja_全体,
... STEP 500 : Average Loss : 6.28931427908
... STEP 1000 : Average Loss : 5.72066562939
   [en_the] closest:  ja_改良, ja_徹底的, ja_気圧, ja_飽和, ja_聖堂, ja_扱う, ja_革命, en_comprising,
   [en_last] closest:  ja_外務省, ja_地帯, en_garbage, ja_民事, en_dc, en_cal, en_evidence, ja_クイズ,
   [ja_月] clo

### Save the Embeddings.

In [40]:
# context 
filename = SAVE_TO + '/en_ja_nn_5K_V_dec19.pkl'
with open(filename, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model_3.context_embeddings, f, pickle.HIGHEST_PROTOCOL)

# word
filename = SAVE_TO + '/en_ja_nn_5K_U_dec19.pkl'
with open(filename, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model_3.word_embeddings, f, pickle.HIGHEST_PROTOCOL)