# English - Spanish Embeddings (3 versions)
`w266 Final Project: Crosslingual Word Embeddings`

Instead of training on randomly substituted words, here we'll choose the translation that is closest to the context embedding vector.

# Notebook Setup

In [1]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

__Base Paths__

In [2]:
# Machine-specific absolute roots; edit these when running on another host.
BASE = '/home/mmillervedam/Data'  # root for the vocab index, panlex dictionary and saved embeddings
PROJ = '/home/mmillervedam/ProjectRepo'  # project checkout that holds the ground-truth translation CSVs
GTT_BASE = PROJ + '/BaselineModels/data/ground_truth_translations/'  # trailing slash: the file name is appended directly
# alternate (commented-out) project location:
#PROJ = '/Users/mona/OneDrive/repos/final_proj/W266-Fall-2017-Final-Project'

# directory to save pickled embeddings
SAVE_TO = BASE + '/embeddings'

__Globals__ - _the parameters below fully determine all 3 models in this NB_

In [3]:
# Data
LANG = ('en','es')  # (first, second) language codes; LANG[1] is used as the evaluation source language
FULL_TEXT = "/home/miwamoto/en_es_shuf.txt"  # corpus file handed to Corpus(); absolute path on another user's home
VOCAB_INDEX = BASE + '/vocab/en_es_small.pkl'  # pickled index loaded into BilingualVocabulary
PANLEX = BASE + '/panlex/en_es_dict.pkl'  # pickled translation dictionary passed to the models as bilingual_dict
GTT_PATH = GTT_BASE + "%s-%s-clean.csv" % (LANG[1], LANG[0])  # resolves to ".../es-en-clean.csv"

# Model
EMBEDDING_SIZE = 200  # passed to each model as H

# Training
# NOTE: nBATCHES is rebound to 5000 in the Method 3 training cell.
nBATCHES = 100000 # <<< 1 epoch with our 1 million sentence corpus
BATCH_SIZE = 48
WINDOW_SIZE = 4
MAX_EPOCHS = 5 # fail safe (passed to batch_generator; presumably caps passes over the corpus -- TODO confirm)
ALPHA = 0.5 # authors use a much smaller learning rate but train longer

# Load Data

In [4]:
from parsing import Corpus, BilingualVocabulary, batch_generator, get_common_words

In [6]:
# load corpus
# (Corpus is imported from the project's parsing module; raw_data is what
# every batch_generator call below reads from)
raw_data = Corpus(FULL_TEXT)

In [7]:
# Read the pickled panlex translation dictionary into memory.
# NOTE: unpickling can execute arbitrary code -- only point PANLEX at a trusted file.
with open(PANLEX, 'rb') as panlex_file:
    translations = pickle.load(panlex_file)

In [8]:
# Start from an empty bilingual vocabulary, then populate it from the
# pickled index so word ids match the ones the index was built with.
vocab = BilingualVocabulary([], languages=LANG)
with open(VOCAB_INDEX, 'rb') as index_file:
    vocab.load_from_index(pickle.load(index_file))

In [9]:
# Report what was loaded: dictionary entry count, vocabulary size and languages.
print('... loaded %s panlex translations' % len(translations))
print('... loaded %s word %s vocabulary' % (vocab.size, vocab.language))

... loaded 702982 panlex translations
... loaded 20003 word ('en', 'es') vocabulary


In [10]:
# Validation words: two English and two Spanish tokens whose nearest
# neighbors are printed during training.
TEST_WORDS = vocab.to_ids([
    'en_the',
    'en_last',
    'es_si',
    'es_primero',
])
print('... test word ids:', TEST_WORDS)

... test word ids: [3, 235, 10097, 10409]


In [11]:
# Ground-truth translations: a headerless, space-separated file with two
# columns, named after LANG[1] and then LANG[0].
GTT_DF = pd.read_csv(
    GTT_PATH,
    sep=' ',
    header=None,
    names=[LANG[1], LANG[0]],
)
print('... loaded %s ground truth translations.' % len(GTT_DF))

... loaded 225168 ground truth translations.


In [12]:
# Evaluation words (for reporting recall): keep only the common words whose
# token starts with the LANG[1] code, then map them to vocabulary ids.
eval_words = [
    w
    for w in get_common_words(vocab)
    if w.startswith(LANG[1])
]
EVAL_IDS = vocab.to_ids(eval_words)
print('... loaded %s evaluation words.' % len(EVAL_IDS))

... loaded 9276 evaluation words.


# Method 1: Random Translations

### Initialize the model

In [22]:
from models import BiW2V_random

# Method 1 model, built from the shared dictionary, vocabulary and embedding size.
model_1 = BiW2V_random(
    bilingual_dict=translations,
    vocab=vocab,
    H=EMBEDDING_SIZE,
)

# initialize TF graphs: core, then training, then validation
# (each call prints a confirmation line)
model_1.BuildCoreGraph()
model_1.BuildTrainingGraph()
model_1.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [23]:
# fresh data generator
# (rebuilt before each model's training so that every model gets its own
# generator rather than one already consumed by an earlier run)
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [24]:
# Train Method 1 for nBATCHES batches and report the wall-clock duration.
start = time.time()
model_1.train(
    nBATCHES,
    DATA_GENERATOR,
    TEST_WORDS,
    learning_rate=ALPHA,
)
tot = time.time() - start
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.00130660667419
   [en_the] closest:  en_pull, es_censo, es_matorrales, es_grave, es_apolo, es_liberalismo, en_resignation, en_scheme,
   [en_last] closest:  en_dye, en_solving, es_quebec, en_doctor, en_preceding, en_grazing, en_halt, es_protagonistas,
   [es_si] closest:  en_elements, en_fewer, en_bowling, es_cisneros, en_attain, en_singular, en_carrying, es_empresas,
   [es_primero] closest:  en_macedonian, en_dating, en_afl, en_highest, en_freud, en_dense, en_pits, es_atlántico,
... STEP 10000 : Average Loss : 4.3614641548
... STEP 20000 : Average Loss : 4.03525338483
   [en_the] closest:  en_a, es_grave, en_pull, es_liberalismo, es_censo, en_notorious, es_probabilidad, e

### Save the Embeddings.

In [25]:
# Persist Method 1's embeddings with the highest pickle protocol available.

# context embeddings -> "_V_" file
filename = SAVE_TO + '/en_es_rand_100K_V_dec19.pkl'
with open(filename, 'wb') as out_file:
    pickle.dump(model_1.context_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

# word embeddings -> "_U_" file
filename = SAVE_TO + '/en_es_rand_100K_U_dec19.pkl'
with open(filename, 'wb') as out_file:
    pickle.dump(model_1.word_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

### Evaluation

In [13]:
# Reload Method 1's context embeddings from disk, so evaluation can run
# without retraining.
with open(SAVE_TO + '/en_es_rand_100K_V_dec19.pkl', 'rb') as saved_file:
    C_embedding1 = pickle.load(saved_file)

In [15]:
# sanity checks before evaluating Method 1
# expected shape is (vocabulary size, EMBEDDING_SIZE)
print('... C shape:', C_embedding1.shape)
# evaluation words are LANG[1] tokens; presumably their ids occupy the upper
# part of the id range (above 10003) -- TODO confirm against the vocabulary layout
print('... eval IDs should be > 10003:', EVAL_IDS[:5])
print('... number to eval:', len(EVAL_IDS))
# the first GTT_DF column is named after LANG[1]
print('... ground truth source language:', GTT_DF.columns[0])

... C shape: (20003, 200)
... eval IDs should be > 10003: [12873, 17315, 15943, 12143, 12575]
... number to eval: 9276
... ground truth source language: es


__Bilingual Induction Task__

In [16]:
from models import evaluateBLI

In [18]:
# Run the bilingual induction evaluation on Method 1's context embeddings.
# The two results are indexed in parallel with EVAL_IDS (see the visual check).
src_nbrs, tgt_nbrs = evaluateBLI(
    C_embedding1,
    vocab,
    GTT_DF,
    EVAL_IDS,
    top_k=10,
    verbose=True,
)

... Evaluating 9276 'es' Ground Truth Translations
... TF graph created for BiW2V model.
... TF graph created for BiW2V validation.
... finding neighbors...
... Done. Total successful translation rate: 0 (23 / 9276)


In [23]:
# visual check: for every test word that is also an evaluation word, print the
# word followed by its src_nbrs row and its tgt_nbrs row as words
for wrd_id in TEST_WORDS:
    try:
        idx = EVAL_IDS.index(wrd_id)
    except ValueError:
        # Not an evaluation word (EVAL_IDS only holds LANG[1] words), so it
        # has no row in src_nbrs / tgt_nbrs. Only ValueError is caught so
        # that real errors and KeyboardInterrupt still surface.
        continue
    synon = vocab.to_words(src_nbrs[idx])
    trans = vocab.to_words(tgt_nbrs[idx])
    print(vocab.to_words([wrd_id])[0],":")
    print(">>>", synon)
    print(">>>", trans)

es_primero :
>>> ['es_primero', 'es_plantel', 'es_atl\xc3\xa1ntico', 'es_japoneses', 'es_autoridades', 'es_fall', 'es_ampliaci\xc3\xb3n', 'es_directora', 'es_mateo', 'es_andina']
>>> ['en_afl', 'en_dating', 'en_macedonian', 'en_highest', 'en_dense', 'en_freud', 'en_blockade', 'en_pits', 'en_article', 'en_hydrogen']


# Method 2: Most Common Target Translation

### Initialize the model

In [27]:
from models import BiW2V_mle

# Method 2 model, built from the same dictionary, vocabulary and embedding
# size as Method 1.
model_2 = BiW2V_mle(
    bilingual_dict=translations,
    vocab=vocab,
    H=EMBEDDING_SIZE,
)

# initialize TF graphs: core, then training, then validation
# (each call prints a confirmation line)
model_2.BuildCoreGraph()
model_2.BuildTrainingGraph()
model_2.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [28]:
# fresh data generator
# (rebinds DATA_GENERATOR so Method 2 does not reuse the generator that
# Method 1's training already consumed from)
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [29]:
# Train Method 2 for nBATCHES batches and report the wall-clock duration.
start = time.time()
model_2.train(
    nBATCHES,
    DATA_GENERATOR,
    TEST_WORDS,
    learning_rate=ALPHA,
)
tot = time.time() - start
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.00128500576019
   [en_the] closest:  en_cultivation, en_origin, en_trafficking, en_duo, en_disaster, es_aspecto, es_zoo, es_discutir,
   [en_last] closest:  en_austrian, en_denote, en_fight, es_berkeley, en_abortion, en_excessive, es_futuros, en_save,
   [es_si] closest:  es_diputado, en_psychiatry, es_regimiento, en_drained, en_arizona, es_antioquia, es_abandonada, es_mosela,
   [es_primero] closest:  es_best, en_marketplace, en_roots, en_detailing, es_cineasta, es_parlamento, en_terrestrial, en_continental,
... STEP 10000 : Average Loss : 4.37539166083
... STEP 20000 : Average Loss : 4.00909108711
   [en_the] closest:  en_a, en_origin, en_cultivation, en_his, en_and, en_s

### Save the Embeddings.

In [30]:
# Persist Method 2's embeddings with the highest pickle protocol available.

# context embeddings -> "_V_" file
filename = SAVE_TO + '/en_es_mle_100K_V_dec19.pkl'
with open(filename, 'wb') as out_file:
    pickle.dump(model_2.context_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

# word embeddings -> "_U_" file
filename = SAVE_TO + '/en_es_mle_100K_U_dec19.pkl'
with open(filename, 'wb') as out_file:
    pickle.dump(model_2.word_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

### Evaluation

In [24]:
# Reload Method 2's context embeddings from disk, so evaluation can run
# without retraining.
with open(SAVE_TO + '/en_es_mle_100K_V_dec19.pkl', 'rb') as saved_file:
    C_embedding2 = pickle.load(saved_file)

In [26]:
# sanity checks before evaluating Method 2
# expected shape is (vocabulary size, EMBEDDING_SIZE)
print('... C shape:', C_embedding2.shape)
# evaluation words are LANG[1] tokens; presumably their ids occupy the upper
# part of the id range (above 10003) -- TODO confirm against the vocabulary layout
print('... eval IDs should be > 10003:', EVAL_IDS[:5])
print('... number to eval:', len(EVAL_IDS))
# the first GTT_DF column is named after LANG[1]
print('... ground truth source language:', GTT_DF.columns[0])

... C shape: (20003, 200)
... eval IDs should be > 10003: [12873, 17315, 15943, 12143, 12575]
... number to eval: 9276
... ground truth source language: es


__Bilingual Induction Task__

In [29]:
from models import evaluateBLI

In [None]:
# Run the bilingual induction evaluation on Method 2's context embeddings.
# NOTE: this rebinds src_nbrs / tgt_nbrs, replacing Method 1's results.
src_nbrs, tgt_nbrs = evaluateBLI(
    C_embedding2,
    vocab,
    GTT_DF,
    EVAL_IDS,
    top_k=10,
    verbose=True,
)

... Evaluating 9276 'es' Ground Truth Translations
... TF graph created for BiW2V model.
... TF graph created for BiW2V validation.
... finding neighbors...


In [None]:
# visual check: for every test word that is also an evaluation word, print the
# word followed by its src_nbrs row and its tgt_nbrs row as words
for wrd_id in TEST_WORDS:
    try:
        idx = EVAL_IDS.index(wrd_id)
    except ValueError:
        # Not an evaluation word (EVAL_IDS only holds LANG[1] words), so it
        # has no row in src_nbrs / tgt_nbrs. Only ValueError is caught so
        # that real errors and KeyboardInterrupt still surface.
        continue
    synon = vocab.to_words(src_nbrs[idx])
    trans = vocab.to_words(tgt_nbrs[idx])
    print(vocab.to_words([wrd_id])[0],":")
    print(">>>", synon)
    print(">>>", trans)

# Method 3: Closest Translation

### Initialize the model

In [31]:
from models import BiW2V_nn

# Method 3 model, built from the same dictionary, vocabulary and embedding
# size as the other two methods.
model_3 = BiW2V_nn(
    bilingual_dict=translations,
    vocab=vocab,
    H=EMBEDDING_SIZE,
)

# initialize TF graphs: core, then training, then validation
# (each call prints a confirmation line)
model_3.BuildCoreGraph()
model_3.BuildTrainingGraph()
model_3.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [32]:
# fresh data generator
# (rebinds DATA_GENERATOR so Method 3 does not reuse the generator that an
# earlier model's training already consumed from)
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [33]:
# Train Method 3 and report the wall-clock duration.
# Takes too long w/ nn so we'll only do 5K.
# NOTE: this rebinds the notebook-wide nBATCHES, so any training cell run
# after this one uses 5000 unless the Globals cell is re-run first.
nBATCHES = 5000
start = time.time()
model_3.train(
    nBATCHES,
    DATA_GENERATOR,
    TEST_WORDS,
    learning_rate=ALPHA,
)
tot = time.time() - start
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.0236346282959
   [en_the] closest:  es_break, es_satisfacer, en_intervention, es_reverendo, en_aboriginal, en_gorge, es_controvertida, en_cancel,
   [en_last] closest:  en_furnace, en_angelo, es_emplazamiento, en_respond, es_más, es_ninguno, en_progressive, en_optimal,
   [es_si] closest:  es_canta, es_jurisdicción, es_ego, en_until, es_atlántica, es_colaboración, en_guidelines, en_noticed,
   [es_primero] closest:  en_space, en_scheduled, es_león, en_deficiency, es_primavera, en_catalogue, en_declaration, en_ponds,
... STEP 500 : Average Loss : 6.23121501386
... STEP 1000 : Average Loss : 5.38436549139
   [en_the] closest:  es_break, en_intervention, en_cancel, es_reverend

### Save the Embeddings.

In [34]:
# Persist Method 3's (model_3) embeddings with the highest pickle protocol
# available.
# NOTE: this cell used to pickle model_2's arrays under these file names, so
# any en_es_nn_5K_*_dec19.pkl written before the fix must be regenerated.

# context embeddings -> "_V_" file
filename = SAVE_TO + '/en_es_nn_5K_V_dec19.pkl'
with open(filename, 'wb') as f:
    pickle.dump(model_3.context_embeddings, f, pickle.HIGHEST_PROTOCOL)

# word embeddings -> "_U_" file
filename = SAVE_TO + '/en_es_nn_5K_U_dec19.pkl'
with open(filename, 'wb') as f:
    pickle.dump(model_3.word_embeddings, f, pickle.HIGHEST_PROTOCOL)

### Evaluation

In [None]:
# Reload Method 3's context embeddings from disk, so evaluation can run
# without retraining.
with open(SAVE_TO + '/en_es_nn_5K_V_dec19.pkl', 'rb') as saved_file:
    C_embedding3 = pickle.load(saved_file)

In [None]:
# sanity checks before evaluating Method 3
# expected shape is (vocabulary size, EMBEDDING_SIZE)
print('... C shape:', C_embedding3.shape)
# evaluation words are LANG[1] tokens; presumably their ids occupy the upper
# part of the id range (above 10003) -- TODO confirm against the vocabulary layout
print('... eval IDs should be > 10003:', EVAL_IDS[:5])
print('... number to eval:', len(EVAL_IDS))
# the first GTT_DF column is named after LANG[1]
print('... ground truth source language:', GTT_DF.columns[0])

__Bilingual Induction Task__

In [None]:
from models import evaluateBLI

In [None]:
# Run the bilingual induction evaluation on Method 3's context embeddings.
# The ground-truth frame is the notebook-level GTT_DF (no lowercase `gtt_df`
# is defined anywhere in this notebook).
# NOTE: this rebinds src_nbrs / tgt_nbrs, replacing the previous method's results.
src_nbrs, tgt_nbrs = evaluateBLI(C_embedding3, vocab, GTT_DF,
                                 EVAL_IDS, top_k = 10, verbose=True)

In [None]:
# visual check: for every test word that is also an evaluation word, print the
# word followed by its src_nbrs row and its tgt_nbrs row as words
for wrd_id in TEST_WORDS:
    try:
        idx = EVAL_IDS.index(wrd_id)
    except ValueError:
        # Not an evaluation word (EVAL_IDS only holds LANG[1] words), so it
        # has no row in src_nbrs / tgt_nbrs. Only ValueError is caught so
        # that real errors and KeyboardInterrupt still surface.
        continue
    synon = vocab.to_words(src_nbrs[idx])
    trans = vocab.to_words(tgt_nbrs[idx])
    print(vocab.to_words([wrd_id])[0],":")
    print(">>>", synon)
    print(">>>", trans)