# English - Japanese Embeddings (3 versions)
`w266 Final Project: Crosslingual Word Embeddings`

Instead of training on randomly substituted words, here we'll choose the translation that is closest to the context embedding vector.

# Notebook Setup

In [41]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


__Base Paths__

In [42]:
# Active locations: Mona's machine
PROJ = '/home/miwamoto/W266-Fall-2017-Final-Project'
BASE = '/home/miwamoto/Data'

# Maya's equivalents (uncomment to use instead of the two lines above)
#BASE = '/home/mmillervedam/Data'
#PROJ = '/home/mmillervedam/ProjectRepo'

# pickled embeddings are written to / read from this directory
SAVE_TO = BASE + '/embeddings'

# directory prefix for the ground truth translation files
GTT_BASE = PROJ + '/BaselineModels/data/ground_truth_translations/'

__Globals__ - _the parameters below fully determine all 3 models in this NB_

In [56]:
# Data: language pair and input file locations
LANG = ('en','ja')
FULL_TEXT = "/home/miwamoto/en_ja_shuf.txt"
PANLEX = BASE + '/panlex/en_ja_dict.pkl'
VOCAB_INDEX = BASE + '/vocab/en_ja_small.pkl'
# LANG is a 2-tuple, so it fills both %s slots directly
GTT_PATH = GTT_BASE + "%s-%s-clean.csv" % LANG

# Model: embedding dimensionality
EMBEDDING_SIZE = 200

# Training hyperparameters
ALPHA = 0.5 # authors use a much smaller learning rate but train longer
BATCH_SIZE = 48
WINDOW_SIZE = 4
nBATCHES = 50000 # <<< 1 epoch with our 1 million sentence corpus
MAX_EPOCHS = 5 # fail safe

# Load Data

In [57]:
from parsing import Corpus, BilingualVocabulary, batch_generator, get_common_words

In [58]:
# build the Corpus object from the file at FULL_TEXT
raw_data = Corpus(FULL_TEXT)

In [59]:
# unpickle the panlex dictionary stored at PANLEX
with open(PANLEX, 'rb') as panlex_file:
    translations = pickle.load(panlex_file)

In [60]:
# start from an empty bilingual vocabulary, then fill it from the pickled index
vocab = BilingualVocabulary([], languages = LANG)
with open(VOCAB_INDEX, 'rb') as index_file:
    saved_index = pickle.load(index_file)
    vocab.load_from_index(saved_index)

In [61]:
# report how much was loaded
print('... loaded %s panlex translations' % len(translations))
print('... loaded %s word %s vocabulary' % (vocab.size, vocab.language))

... loaded 634705 panlex translations
... loaded 20003 word ('en', 'ja') vocabulary


In [22]:
# Validation words: their ids are handed to train() for the periodic
# "closest" printout
TEST_WORDS = vocab.to_ids(['en_the','en_last', 'ja_月', 'ja_日本'])
print('... test word ids:', TEST_WORDS)

# list the vocabulary index entries 9990 through 10019
for position in range(9990, 10020):
    print(vocab.index[position])
#print(vocab.wordset)

... test word ids: [3, 228, 10004, 10012]
en_exemption
en_bohemian
en_walnut
en_ljubljana
en_timor
en_venom
en_scriptures
en_tariff
en_penetration
en_pedal
en_transmissions
en_fluent
en_sexes
ja_年
ja_月
ja_日
ja_的
ja_3
ja_第
ja_人
ja_者
ja_後
ja_日本
ja_行う
ja_中
ja_一
ja_現在
ja_時
ja_化
ja_大学


In [62]:
# Ground truth translations: a space-separated file with no header row.
# The columns are labelled (LANG[1], LANG[0]), i.e. the reverse of LANG.
GTT_DF = pd.read_csv(GTT_PATH,
                     sep=' ',
                     header=None,
                     names=list(LANG[::-1]))
print('... loaded %s ground truth translations.' % len(GTT_DF))
print(GTT_PATH)
print(*LANG)

... loaded 35354 ground truth translations.
/home/miwamoto/W266-Fall-2017-Final-Project/BaselineModels/data/ground_truth_translations/en-ja-clean.csv
en ja


In [63]:
# Evaluation words (for reporting recall): keep only the common words
# whose string starts with the LANG[1] code
eval_words = list(filter(lambda word: word.startswith(LANG[1]),
                         get_common_words(vocab)))
EVAL_IDS = vocab.to_ids(eval_words)
print('... loaded %s evaluation words.' % len(EVAL_IDS))
#print(repr(eval_words[:5]).decode('unicode_escape'))

... loaded 4425 evaluation words.


# Method 1: Random Translations

### Initialize the model

In [25]:
from models import BiW2V_random

# instantiate the Method 1 model
model_1 = BiW2V_random(H = EMBEDDING_SIZE,
                       vocab = vocab,
                       bilingual_dict = translations)

# initialize the TF graphs in order: core, training, validation
for build_graph in (model_1.BuildCoreGraph,
                    model_1.BuildTrainingGraph,
                    model_1.BuildValidationGraph):
    build_graph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [26]:
# re-create the batch generator for the Method 1 training run
DATA_GENERATOR = batch_generator(
    raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS
)

In [27]:
# run training for nBATCHES batches and report the wall-clock time taken
start = time.time()
model_1.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate=ALPHA)
tot = time.time() - start
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.0034091506958
   [en_the] closest:  en_odyssey, en_ram, en_saturday, ja_早朝, en_beaver, ja_岳, en_feast, ja_朝日新聞社,
   [en_last] closest:  ja_派遣, ja_行ける, ja_様, ja_渡, ja_湯, en_duncan, ja_市区, en_implemented,
   [ja_月] closest:  ja_ホームラン, ja_標識, en_qualities, en_wartime, en_brilliant, en_real-time, ja_後述, ja_同調,
   [ja_日本] closest:  ja_セクタ, en_wheat, en_designed, en_disaster, ja_掴む, en_hello, ja_ふさわしい, en_gym,
... STEP 5000 : Average Loss : 4.80178324621
... STEP 10000 : Average Loss : 4.42765667364
   [en_the] closest:  en_a, en_and, en_to, en_feast, ja_朝日新聞社, ja_早朝, ja_つば, en_window,
   [en_last] closest:  ja_派遣, ja_行ける, ja_様, ja_湯, ja_渡, en_implemented, en_unknown, en_duncan,


### Save the Embeddings.

In [28]:
# Pickle both embedding attributes of model_1 (context first, then word),
# each to its own file, using the highest pickle protocol available.
# NOTE(review): the '500K' tag differs from the '50K' tag on the Method 2
# files although both runs use nBATCHES -- confirm the intended label.
for filename, embedding in (
        (SAVE_TO + '/en_ja_rand_500K_V_dec19.pkl', model_1.context_embeddings),
        (SAVE_TO + '/en_ja_rand_500K_U_dec19.pkl', model_1.word_embeddings)):
    with open(filename, 'wb') as f:
        pickle.dump(embedding, f, pickle.HIGHEST_PROTOCOL)

### Evaluation

In [64]:
# read the pickled Method 1 context ("V") embeddings back from disk
with open(SAVE_TO + '/en_ja_rand_500K_V_dec19.pkl', 'rb') as pkl_file:
    C_embedding1 = pickle.load(pkl_file)

In [65]:
# sanity checks: print each label next to the value it describes
for label, value in (('... C shape:', C_embedding1.shape),
                     ('... eval IDs should be > 10003:', EVAL_IDS[:5]),
                     ('... number to eval:', len(EVAL_IDS)),
                     ('... ground truth source language:', GTT_DF.columns[0])):
    print(label, value)

... C shape: (20003, 200)
... eval IDs should be > 10003: [11818, 10216, 12063, 19195, 16293]
... number to eval: 4425
... ground truth source language: ja


__Bilingual Induction Task__

In [66]:
from models import evaluateBLI

In [67]:
# run the bilingual induction evaluation on the Method 1 embeddings,
# keeping the top 10 neighbours per evaluation word
src_nbrs, tgt_nbrs = evaluateBLI(
    C_embedding1, vocab, GTT_DF, EVAL_IDS,
    verbose=True, top_k=10,
)

... Evaluating 4425 'ja' Ground Truth Translations
... TF graph created for BiW2V model.
... TF graph created for BiW2V validation.
... finding neighbors...
... Done. Total successful translation rate: 0 (0 / 4425)


In [68]:
# visual check: for every validation word that is also an evaluation word,
# print the word followed by its src_nbrs and tgt_nbrs entries (as words)
for wrd_id in TEST_WORDS:
    try:
        idx = EVAL_IDS.index(wrd_id)
    except ValueError:
        # list.index raises ValueError when wrd_id is not an evaluation id;
        # catching only that keeps real errors (and Ctrl-C) visible
        continue
    synon = vocab.to_words(src_nbrs[idx])
    trans = vocab.to_words(tgt_nbrs[idx])
    print(vocab.to_words([wrd_id])[0],":")
    print(">>>", synon)
    print(">>>", trans)

ja_日本 :
>>> ['ja_\xe6\x97\xa5\xe6\x9c\xac', 'ja_\xe8\x80\x85', 'ja_\xe3\x82\xbb\xe3\x82\xaf\xe3\x82\xbf', 'ja_\xe8\xa1\x8c\xe3\x81\x86', 'ja_\xe6\x8e\xb4\xe3\x82\x80', 'ja_\xe4\xb8\x80', 'ja_\xe3\x81\xb5\xe3\x81\x95\xe3\x82\x8f\xe3\x81\x97\xe3\x81\x84', 'ja_\xe7\x9a\x84', 'ja_\xe8\xbb\x8d', 'ja_\xe7\x9f\xa5\xe4\xba\x8b']
>>> ['en_wheat', 'en_gym', 'en_designed', 'en_hancock', 'en_integrity', 'en_disaster', 'en_plaza', 'en_bud', 'en_beneath', 'en_bo']


# Method 2: Most Common Target Translation

### Initialize the model

In [31]:
from models import BiW2V_mle

# instantiate the Method 2 model
model_2 = BiW2V_mle(H = EMBEDDING_SIZE,
                    vocab = vocab,
                    bilingual_dict = translations)

# initialize the TF graphs in order: core, training, validation
for build_graph in (model_2.BuildCoreGraph,
                    model_2.BuildTrainingGraph,
                    model_2.BuildValidationGraph):
    build_graph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [32]:
# re-create the batch generator for the Method 2 training run
DATA_GENERATOR = batch_generator(
    raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS
)

In [33]:
# run training for nBATCHES batches and report the wall-clock time taken
start = time.time()
model_2.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate=ALPHA)
tot = time.time() - start
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.00321937179565
   [en_the] closest:  ja_re, en_16, ja_立教, ja_居, en_buddhist, ja_野生, en_granddaughter, ja_余地,
   [en_last] closest:  en_labrador, en_hal, ja_良, en_imam, en_cerebral, en_aging, en_bite, en_detailed,
   [ja_月] closest:  ja_遭遇, en_russell, ja_社会党, en_getting, en_maurice, ja_向上, ja_npo, ja_清朝,
   [ja_日本] closest:  en_melting, ja_院, en_pupil, en_line, ja_尹, ja_出典, en_parody, en_striker,
... STEP 5000 : Average Loss : 5.10440200653
... STEP 10000 : Average Loss : 4.62019368992
   [en_the] closest:  en_a, en_16, ja_余地, ja_野生, en_particularly, ja_兼ねる, en_life, ja_居,
   [en_last] closest:  en_labrador, en_hal, ja_良, en_imam, en_bite, en_aging, en_cerebral, en_forget,


### Save the Embeddings.

In [34]:
# Pickle both embedding attributes of model_2 (context first, then word),
# each to its own file, using the highest pickle protocol available.
for filename, embedding in (
        (SAVE_TO + '/en_ja_mle_50K_V_dec19.pkl', model_2.context_embeddings),
        (SAVE_TO + '/en_ja_mle_50K_U_dec19.pkl', model_2.word_embeddings)):
    with open(filename, 'wb') as f:
        pickle.dump(embedding, f, pickle.HIGHEST_PROTOCOL)

### Evaluation

In [71]:
# read the pickled Method 2 context ("V") embeddings back from disk
with open(SAVE_TO + '/en_ja_mle_50K_V_dec19.pkl', 'rb') as pkl_file:
    C_embedding2 = pickle.load(pkl_file)

In [72]:
# sanity checks: print each label next to the value it describes
for label, value in (('... C shape:', C_embedding2.shape),
                     ('... eval IDs should be > 10003:', EVAL_IDS[:5]),
                     ('... number to eval:', len(EVAL_IDS)),
                     ('... ground truth source language:', GTT_DF.columns[0])):
    print(label, value)

... C shape: (20003, 200)
... eval IDs should be > 10003: [11818, 10216, 12063, 19195, 16293]
... number to eval: 4425
... ground truth source language: ja


#### Bilingual Induction Task

In [73]:
from models import evaluateBLI

In [74]:
# run the bilingual induction evaluation on the Method 2 embeddings,
# keeping the top 10 neighbours per evaluation word
src_nbrs, tgt_nbrs = evaluateBLI(
    C_embedding2, vocab, GTT_DF, EVAL_IDS,
    verbose=True, top_k=10,
)

... Evaluating 4425 'ja' Ground Truth Translations
... TF graph created for BiW2V model.
... TF graph created for BiW2V validation.
... finding neighbors...
... Done. Total successful translation rate: 0 (0 / 4425)


In [75]:
# visual check
for wrd_id in TEST_WORDS:
    try:
        idx = EVAL_IDS.index(wrd_id)
    except:
        continue
    synon = vocab.to_words(src_nbrs[idx])
    trans = vocab.to_words(tgt_nbrs[idx])
    print(vocab.to_words([wrd_id])[0],":")
    print(">>>", synon)
    print(">>>", trans)

ja_日本 :
>>> ['ja_\xe6\x97\xa5\xe6\x9c\xac', 'ja_\xe5\xb9\xb4', 'ja_\xe7\x9a\x84', 'ja_\xe9\x99\xa2', 'ja_\xe5\xb0\xb9', 'ja_\xe6\x94\xbe\xe9\x80\x81', 'ja_3', 'ja_\xe6\xb0\xb4\xe6\xb3\xb3', 'ja_\xe6\xb3\x95\xe6\x94\xbf\xe5\xa4\xa7\xe5\xad\xa6', 'ja_\xe7\xa0\x94\xe7\xa9\xb6']
>>> ['en_pupil', 'en_parody', 'en_line', 'en_facebook', 'en_melting', 'en_striker', 'en_decrease', 'en_shots', 'en_everyday', 'en_brook']


# Method 3: Closest Translation

### Initialize the model

In [35]:
from models import BiW2V_nn

# instantiate the Method 3 model
model_3 = BiW2V_nn(H = EMBEDDING_SIZE,
                   vocab = vocab,
                   bilingual_dict = translations)

# initialize the TF graphs in order: core, training, validation
for build_graph in (model_3.BuildCoreGraph,
                    model_3.BuildTrainingGraph,
                    model_3.BuildValidationGraph):
    build_graph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [36]:
# re-create the batch generator for the Method 3 training run
DATA_GENERATOR = batch_generator(
    raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS
)

In [37]:
# train
# The nn model takes too long for a full nBATCHES run, so it gets its own,
# smaller batch budget. A dedicated name is used instead of reassigning the
# shared nBATCHES global, so re-running the Method 1 / Method 2 training
# cells after this one still uses the value from the Globals cell.
NN_BATCHES = 5000 # Takes too long w/ nn so we'll only do 5K
start = time.time()
model_3.train(NN_BATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate = ALPHA)
tot = (time.time() - start)
print('... {} batches trained in {} seconds'.format(NN_BATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.0283244400024
   [en_the] closest:  ja_飽和, ja_徹底的, ja_改良, ja_押し出す, en_comprising, ja_扱う, en_owner, ja_聖堂,
   [en_last] closest:  ja_外務省, ja_地帯, en_garbage, ja_民事, en_dc, en_cal, en_evidence, ja_クイズ,
   [ja_月] closest:  en_enjoy, en_box, ja_ピース, en_canal, ja_テキスト, en_olive, ja_上手い, en_instruction,
   [ja_日本] closest:  ja_恐れ, en_invalid, en_nobility, ja_クライマックス, en_evaluate, ja_派手, en_march, ja_全体,
... STEP 500 : Average Loss : 6.28931427908
... STEP 1000 : Average Loss : 5.72066562939
   [en_the] closest:  ja_改良, ja_徹底的, ja_気圧, ja_飽和, ja_聖堂, ja_扱う, ja_革命, en_comprising,
   [en_last] closest:  ja_外務省, ja_地帯, en_garbage, ja_民事, en_dc, en_cal, en_evidence, ja_クイズ,
   [ja_月] clo

### Save the Embeddings.

In [40]:
# Pickle both embedding attributes of model_3 (context first, then word),
# each to its own file, using the highest pickle protocol available.
for filename, embedding in (
        (SAVE_TO + '/en_ja_nn_5K_V_dec19.pkl', model_3.context_embeddings),
        (SAVE_TO + '/en_ja_nn_5K_U_dec19.pkl', model_3.word_embeddings)):
    with open(filename, 'wb') as f:
        pickle.dump(embedding, f, pickle.HIGHEST_PROTOCOL)

### Evaluation

In [76]:
# read the pickled Method 3 context ("V") embeddings back from disk
with open(SAVE_TO + '/en_ja_nn_5K_V_dec19.pkl', 'rb') as pkl_file:
    C_embedding3 = pickle.load(pkl_file)

In [77]:
# sanity checks: print each label next to the value it describes
for label, value in (('... C shape:', C_embedding3.shape),
                     ('... eval IDs should be > 10003:', EVAL_IDS[:5]),
                     ('... number to eval:', len(EVAL_IDS)),
                     ('... ground truth source language:', GTT_DF.columns[0])):
    print(label, value)

... C shape: (20003, 200)
... eval IDs should be > 10003: [11818, 10216, 12063, 19195, 16293]
... number to eval: 4425
... ground truth source language: ja


__Bilingual Induction Task__

In [78]:
from models import evaluateBLI

In [80]:
# run the bilingual induction evaluation on the Method 3 embeddings,
# keeping the top 10 neighbours per evaluation word
src_nbrs, tgt_nbrs = evaluateBLI(
    C_embedding3, vocab, GTT_DF, EVAL_IDS,
    verbose=True, top_k=10,
)

... Evaluating 4425 'ja' Ground Truth Translations
... TF graph created for BiW2V model.
... TF graph created for BiW2V validation.
... finding neighbors...
... Done. Total successful translation rate: 0 (0 / 4425)


In [81]:
# visual check
for wrd_id in TEST_WORDS:
    try:
        idx = EVAL_IDS.index(wrd_id)
    except:
        continue
    synon = vocab.to_words(src_nbrs[idx])
    trans = vocab.to_words(tgt_nbrs[idx])
    print(vocab.to_words([wrd_id])[0],":")
    print(">>>", synon)
    print(">>>", trans)

ja_日本 :
>>> ['ja_\xe6\x97\xa5\xe6\x9c\xac', 'ja_\xe6\x81\x90\xe3\x82\x8c', 'ja_\xe3\x82\xaf\xe3\x83\xa9\xe3\x82\xa4\xe3\x83\x9e\xe3\x83\x83\xe3\x82\xaf\xe3\x82\xb9', 'ja_\xe6\xb4\xbe\xe6\x89\x8b', 'ja_\xe5\x85\xa8\xe4\xbd\x93', 'ja_\xe6\x9c\x89\xe6\x95\xb0', 'ja_\xe3\x81\xbe\xe3\x81\xa3\xe3\x81\x9f\xe3\x81\x8f', 'ja_\xe5\xba\x97\xe8\x88\x97', 'ja_\xe6\x9d\xb1\xe6\x98\xa0', 'ja_\xe5\x85\xb1\xe9\xb3\xb4']
>>> ['en_invalid', 'en_nobility', 'en_evaluate', 'en_march', 'en_iceland', 'en_yearly', 'en_minimum', 'en_sea', 'en_cooler', 'en_later']
