# MLE Substitution
`w266 Final Project: Crosslingual Word Embeddings`

Instead of training on randomly substituted words, here we'll choose the highest-ranked translation.

# Notebook Setup

In [1]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

Set base paths depending on your machine.

In [6]:
# Machine-specific roots; everything derived below hangs off BASE.
BASE = '/home/mmillervedam/Data'
PROJ = '/home/mmillervedam/ProjectRepo'
#PROJ = '/Users/mona/OneDrive/repos/final_proj/W266-Fall-2017-Final-Project'

# raw data (one text file per language pair)
FULL_EN_ES, FULL_EN_IT = ("/home/miwamoto/en_es_shuf.txt",
                          "/home/miwamoto/en_it_shuf.txt")

# vocabularies (pickled indexes)
VOCAB_EN_ES = '%s/vocab/en_es_index.pkl' % BASE
VOCAB_EN_IT = '%s/vocab/en_it_index.pkl' % BASE

# panlex dicts (pickled)
PANLEX_EN_ES = '%s/panlex/en_es_dict.pkl' % BASE
PANLEX_EN_IT = '%s/panlex/en_it_dict.pkl' % BASE

# directory to save pickled embeddings
SAVE_TO = '%s/embeddings' % BASE

# English - Spanish
### Load Data, Dict & Vocab

In [7]:
from parsing import Corpus, BilingualVocabulary, batch_generator

In [8]:
# load corpus: build a Corpus from the raw English-Spanish data path (FULL_EN_ES)
en_es_data = Corpus(FULL_EN_ES)

In [9]:
# load panlex dictionary: unpickle the en/es translation lookup from PANLEX_EN_ES
# NOTE: pickle.load can execute arbitrary code; only point this at files you trust.
with open(PANLEX_EN_ES,'rb') as f:
    en_es_translations = pickle.load(f)

In [10]:
# load vocabulary: create a BilingualVocabulary for ('en','es') from an empty
# list, then hand the unpickled index at VOCAB_EN_ES to load_from_index
# NOTE: pickle.load can execute arbitrary code; only point this at files you trust.
en_es_vocab = BilingualVocabulary([], languages = ('en','es'))
with open(VOCAB_EN_ES,'rb') as f:
    en_es_vocab.load_from_index(pickle.load(f))

In [11]:
# confirmations: report how many dictionary entries and vocabulary words were loaded
print('... loaded %s panlex translations'%(len(en_es_translations)))
# The recorded output shows `language` rendering as ('en', 'es'); the space
# before "vocabulary" keeps it from running into the next word.
print('... loaded %s word %s vocabulary'%(en_es_vocab.size,en_es_vocab.language))

... loaded 702982 panlex translations
... loaded 200003 word ('en', 'es')vocabulary


### Validation Words

In [29]:
# look up the vocabulary ids of the four validation words (two en_, two es_)
en_es_vocab.to_ids(['en_the','en_last', 'es_mundo', 'es_real'])

[3, 270, 100160, 100249]

In [27]:
# show up to five dictionary entries for each validation word
validation_words = ['en_the','en_last', 'es_mundo', 'es_real']
for w in validation_words:
    first_entries = list(en_es_translations[w][:5])
    print(w, first_entries)

en_the ['es_el', 'es_el_la_los_las', 'es_entonces', 'es_la', 'es_las']
en_last ['es_aguantar', 'es_anterior', 'es_atr\xc3\xa1s', 'es_conclusi\xc3\xb3n', 'es_concluyentemente']
es_mundo ['en_age', 'en_cosmos', 'en_creation', 'en_earth_globe', 'en_earth']
es_real ['en_actual', 'en_certain', 'en_clear', 'en_concise', 'en_de_facto']


### Batch Generator Sanity Check

In [17]:
# settings for the batch-generator sanity check; all three are handed to
# batch_generator in the next cell (and are re-assigned before training)
BATCH_SIZE = 48
WINDOW_SIZE = 4
MAX_EPOCHS = 30 # fail safe

In [18]:
# STEP 1: recreate the token & batch generators together so they're in sync
# (rerun this cell before STEP 2 if either generator has already been consumed)
token_gen = en_es_data.gen_tokens()
batch_gen = batch_generator(en_es_data, en_es_vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [28]:
# STEP 2: have a look
# First pull BATCH_SIZE raw tokens straight from the corpus ...
print("ORIGINAL DATA:")
raw_tokens = []
for _ in range(BATCH_SIZE):
    raw_tokens.append(next(token_gen))
print(raw_tokens)

# ... then show the first five rows of the first batch only, as ids and as words.
for context, label in batch_gen:
    head_context = context[:5]
    print("CONTEXT IDS:\n", head_context)
    print("CONTEXT:\n", [en_es_vocab.to_words(c) for c in head_context])
    head_label = label[:5]
    print("LABEL IDS:\n", head_label)
    print("LABELS:\n", en_es_vocab.to_words(head_label))
    break

### Initialize the model

In [31]:
from models import BiW2V_random

EMBEDDING_SIZE = 200

# create the model imported as BiW2V_random, using the en/es panlex dictionary
# and vocabulary loaded above
model = BiW2V_random(
    bilingual_dict=en_es_translations,
    vocab=en_es_vocab,
    H=EMBEDDING_SIZE,
)

# initialize TF graphs: core, then training, then validation
model.BuildCoreGraph()
model.BuildTrainingGraph()
model.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [36]:
# line / word / byte counts of the raw en-es corpus, via the shell's `wc`
!wc {FULL_EN_ES}

   8573299  222459019 1902018560 /home/miwamoto/en_es_shuf.txt


In [41]:
# Word count reported by `wc` for the en-es corpus, divided by the batch size (48).
# Presumably a rough estimate of batches per epoch -- compare the
# "less than 1 epoch" note on nBATCHES.
# `//` keeps the result an integer on both Python 2 and Python 3.
approx_batches_per_epoch = 222459019 // 48
approx_batches_per_epoch

4634562

In [43]:
# training parameters
nBATCHES = 600000 # less than 1 epoch
BATCH_SIZE, WINDOW_SIZE = 48, 4
MAX_EPOCHS = 5 # fail safe

# fresh batch stream for this training run
DATA_GENERATOR = batch_generator(
    en_es_data,
    en_es_vocab,
    BATCH_SIZE,
    WINDOW_SIZE,
    MAX_EPOCHS,
)

# ids of the words whose closest neighbours are printed while training
validation_words = ['en_the','en_last', 'es_mundo', 'es_real']
TEST_WORDS = en_es_vocab.to_ids(validation_words)

In [44]:
# train: run model.train for nBATCHES batches and time the whole call
start = time.time()
model.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate = 0.5)
tot = (time.time() - start)  # elapsed wall-clock seconds
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(200003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(200003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(200003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.000104637964567
   [en_the] closest:  es_eliminar, en_47north, en_austerity, en_then-wife, es_valido, en_binghamton, en_1986-87, es_canfranc,
   [en_last] closest:  en_nihilism, en_piplup, es_esconderse, es_accidentadas, en_readout, en_gmbh, en_redevelop, es_barú,
   [es_mundo] closest:  es_poets, en_criterium, en_graduate, en_kim, es_reset, es_juquila, en_-29, en_9x,
   [es_real] closest:  es_negociante, en_shades, es_fundaban, en_sensational, en_misaki, es_preclasificado, en_•, es_coles,
... STEP 60000 : Average Loss : 2.20431351974
... STEP 120000 : Average Loss : 1.85183114125
   [en_the] closest:  en_a, en_., en_'s, en_an, en_his, en_to, en_that, en_for,
   [en_last

### View & Save Trained Embeddings

In [45]:
# take a look at the context embeddings (the matrix saved below under the `_V_` filename)
model.context_embeddings

array([[ -2.84318463e-04,   1.55181682e-04,   7.64811266e-05, ...,
         -5.75772647e-05,  -6.62243519e-06,   1.13493570e-05],
       [ -7.16788854e-05,   2.40120396e-04,  -1.49121872e-04, ...,
         -3.24801571e-04,  -3.24414665e-04,  -1.84669960e-04],
       [  1.31410881e-04,  -1.30790548e-04,   1.72700151e-04, ...,
          6.08587834e-05,   6.42145678e-05,   1.10936184e-04],
       ..., 
       [  1.36078175e-04,   1.21385499e-04,   2.41268077e-04, ...,
          2.29004600e-05,  -5.72097269e-06,   1.64323093e-04],
       [ -9.66184962e-05,  -1.97520858e-04,   2.46643118e-04, ...,
         -2.46647396e-04,  -3.95404604e-05,  -2.58605985e-04],
       [ -5.71376295e-05,  -1.30071737e-06,  -2.53239996e-04, ...,
          1.78198912e-04,  -8.43432063e-05,  -2.27683529e-04]], dtype=float32)

In [46]:
# take a look at the word embeddings (the matrix saved below under the `_U_` filename)
model.word_embeddings

array([[ -2.08320143e-03,   1.13701622e-03,   5.60377259e-04, ...,
         -4.21868666e-04,  -4.85225864e-05,   8.31567449e-05],
       [ -5.25191252e-04,   1.75936229e-03,  -1.09261612e-03, ...,
         -2.37982115e-03,  -2.37698643e-03,  -1.35307701e-03],
       [  9.62847553e-04,  -9.58302408e-04,   1.26537413e-03, ...,
          4.45912330e-04,   4.70500207e-04,   8.12829472e-04],
       ..., 
       [  9.97044845e-04,   8.89391638e-04,   1.76777132e-03, ...,
          1.67791688e-04,  -4.19175703e-05,   1.20399543e-03],
       [ -7.07923784e-04,  -1.44723547e-03,   1.80715427e-03, ...,
         -1.80718559e-03,  -2.89712974e-04,  -1.89480628e-03],
       [ -4.18647425e-04,  -9.53035669e-06,  -1.85548968e-03, ...,
          1.30566361e-03,  -6.17982703e-04,  -1.66823738e-03]], dtype=float32)

In [47]:
# saving final embeddings in case we want to do more stuff later
# (file suffix, model attribute) pairs: V = context embeddings, U = word embeddings
to_save = (
    ('/en_es_rand_600K_cw4_V_dec18.pkl', 'context_embeddings'),
    ('/en_es_rand_600K_cw4_U_dec18.pkl', 'word_embeddings'),
)
for suffix, attr in to_save:
    filename = SAVE_TO + suffix
    with open(filename, 'wb') as f:
        # Pickle the embedding matrix using the highest protocol available.
        pickle.dump(getattr(model, attr), f, pickle.HIGHEST_PROTOCOL)

In [48]:
# confirm reload: read the saved context embeddings back from disk
filename = SAVE_TO + '/en_es_rand_600K_cw4_V_dec18.pkl'
with open(filename, 'rb') as f:
    # NOTE: pickle.load can execute arbitrary code; fine here because this
    # notebook wrote the file itself.
    C_embedding = pickle.load(f)

# Fail loudly if the round trip changed anything, instead of comparing the
# printed arrays by eye.
np.testing.assert_array_equal(C_embedding, model.context_embeddings)

C_embedding

array([[ -2.84318463e-04,   1.55181682e-04,   7.64811266e-05, ...,
         -5.75772647e-05,  -6.62243519e-06,   1.13493570e-05],
       [ -7.16788854e-05,   2.40120396e-04,  -1.49121872e-04, ...,
         -3.24801571e-04,  -3.24414665e-04,  -1.84669960e-04],
       [  1.31410881e-04,  -1.30790548e-04,   1.72700151e-04, ...,
          6.08587834e-05,   6.42145678e-05,   1.10936184e-04],
       ..., 
       [  1.36078175e-04,   1.21385499e-04,   2.41268077e-04, ...,
          2.29004600e-05,  -5.72097269e-06,   1.64323093e-04],
       [ -9.66184962e-05,  -1.97520858e-04,   2.46643118e-04, ...,
         -2.46647396e-04,  -3.95404604e-05,  -2.58605985e-04],
       [ -5.71376295e-05,  -1.30071737e-06,  -2.53239996e-04, ...,
          1.78198912e-04,  -8.43432063e-05,  -2.27683529e-04]], dtype=float32)

### Visualization ???

In [106]:
# hand-picked words to plot, as a list of prefixed tokens
# NOTE(review): many entries are it_ (Italian) words although this notebook
# loads the en/es vocabulary -- confirm this cell belongs here
wrds = (
    "en_the en_a en_this en_'s en_an en_their en_its en_these en_his "
    "en_first en_on en_in en_for en_to en_with en_are en_. en_all "
    "it_nuovo it_di it_un it_, <s> it_i it_con it_è it_più it_parola "
    "en_censorship en_minima it_profesional it_michail en_الأول "
    "it_ritengono en_prevention it_mckenna en_third"
).split()

In [107]:
# add the dictionary entries of a few seed words to the plot list
# NOTE(review): `bi_dict` is not defined in any cell visible in this notebook,
# and "it_parole" differs from the "it_parola" listed in `wrds` -- confirm both
# NOTE: this grows `wrds` in place, so re-running the cell appends the entries again
for w in ["en_the", "en_first", "it_nuovo", "it_parole"]:
    wrds.extend(bi_dict[w])

In [108]:
# convert the words to vocabulary ids and de-duplicate them with set()
# NOTE(review): `en_it_vocab` is not created in any cell visible in this
# notebook (only `en_es_vocab` is) -- confirm where it comes from
wordset = set(en_it_vocab.to_ids(wrds))

In [110]:
# plot the selected word ids in 2D
# NOTE(review): `model2` is not created in any cell visible in this notebook,
# and the recorded output of this cell is
# "NameError: global name 'TSNE' is not defined" -- the cell does not run as-is
model2.plot_embeddings_in_2D(wordset)

NameError: global name 'TSNE' is not defined

# MLE Starts HERE

### Initialize the model

In [61]:
from models import BiW2V_mle

EMBEDDING_SIZE = 200

# create the model imported as BiW2V_mle, using the same en/es dictionary and
# vocabulary as the earlier model
model1 = BiW2V_mle(
    bilingual_dict=en_es_translations,
    vocab=en_es_vocab,
    H=EMBEDDING_SIZE,
)

# initialize TF graphs: core, then training, then validation
model1.BuildCoreGraph()
model1.BuildTrainingGraph()
model1.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [62]:
# training parameters (same values as the earlier training run)
nBATCHES = 600000 # less than 1 epoch
BATCH_SIZE, WINDOW_SIZE = 48, 4
MAX_EPOCHS = 5 # fail safe

# new batch stream so this model gets its own generator
DATA_GENERATOR = batch_generator(
    en_es_data,
    en_es_vocab,
    BATCH_SIZE,
    WINDOW_SIZE,
    MAX_EPOCHS,
)

# ids of the words whose closest neighbours are printed while training
validation_words = ['en_the','en_last', 'es_mundo', 'es_real']
TEST_WORDS = en_es_vocab.to_ids(validation_words)

In [63]:
# train: run model1.train for nBATCHES batches and time the whole call
start = time.time()
model1.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate = 0.5)
tot = (time.time() - start)  # elapsed wall-clock seconds
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(200003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(200003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(200003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.000104258473714
   [en_the] closest:  es_faunas, en_tetrarchy, es_redován, es_mangano, en_mapsto, en_guastalla, es_grabarían, en_238,
   [en_last] closest:  en_lightspeed, en_weisman, es_acertadas, en_6-8, en_10-7, es_eredivisie, es_sarcoidosis, es_taichi,
   [es_mundo] closest:  en_scholarship, es_grp, en_friendlies, en_appalachia, es_celtíbera, en_dichotomy, en_unto, en_fian,
   [es_real] closest:  en_gyeongsang, es_galen, en_indentured, en_personage, en_santé, en_manual, es_ctm, en_caguas,
... STEP 60000 : Average Loss : 2.20471700367
... STEP 120000 : Average Loss : 1.85234213303
   [en_the] closest:  en_a, en_., en_'s, en_an, en_his, en_its, en_their, en_to,
   [en_

### View & Save Trained Embeddings

In [64]:
# take a look at model1's context embeddings (the matrix saved below under the `_V_` filename)
model1.context_embeddings

array([[ -1.05455947e-04,   2.86336963e-05,  -1.84920864e-04, ...,
         -1.58480965e-04,  -1.00291945e-04,  -1.74433619e-04],
       [  2.75573053e-04,  -1.77735696e-04,   3.85022897e-04, ...,
          1.47817482e-04,   3.12980497e-04,  -2.16852786e-04],
       [ -1.72919987e-04,  -1.40841832e-04,   1.59651914e-04, ...,
         -2.03922173e-04,  -1.85943486e-06,   2.17361230e-04],
       ..., 
       [  6.11688884e-05,   2.09798236e-04,  -3.35832774e-05, ...,
         -4.93436892e-05,   5.14835165e-06,  -1.18142401e-04],
       [ -4.56703710e-05,   2.44815776e-04,  -2.59716689e-05, ...,
         -1.92585194e-05,  -3.75785567e-05,  -1.30966378e-04],
       [  4.98521388e-07,   9.05969064e-05,  -3.35373807e-05, ...,
         -2.10497194e-04,   2.58300832e-04,   1.47561132e-05]], dtype=float32)

In [65]:
# take a look at model1's word embeddings (the matrix saved below under the `_U_` filename)
model1.word_embeddings

array([[ -7.72689295e-04,   2.09802776e-04,  -1.35493896e-03, ...,
         -1.16121036e-03,  -7.34851987e-04,  -1.27809762e-03],
       [  2.01915926e-03,  -1.30229234e-03,   2.82111228e-03, ...,
          1.08307775e-03,   2.29324843e-03,  -1.58890826e-03],
       [ -1.26700697e-03,  -1.03196618e-03,   1.16979005e-03, ...,
         -1.49416400e-03,  -1.36243179e-05,   1.59263366e-03],
       ..., 
       [  4.48192324e-04,   1.53721869e-03,  -2.46069016e-04, ...,
         -3.61547543e-04,   3.77226352e-05,  -8.65644601e-04],
       [ -3.34632699e-04,   1.79379666e-03,  -1.90297767e-04, ...,
         -1.41109645e-04,  -2.75342929e-04,  -9.59607540e-04],
       [  3.65273013e-06,   6.63815183e-04,  -2.45732692e-04, ...,
         -1.54234003e-03,   1.89260358e-03,   1.08119944e-04]], dtype=float32)

In [66]:
# saving final embeddings in case we want to do more stuff later
# (file suffix, model attribute) pairs: V = context embeddings, U = word embeddings
to_save = (
    ('/en_es_mle_600K_cw4_V_dec18.pkl', 'context_embeddings'),
    ('/en_es_mle_600K_cw4_U_dec18.pkl', 'word_embeddings'),
)
for suffix, attr in to_save:
    filename = SAVE_TO + suffix
    with open(filename, 'wb') as f:
        # Pickle the embedding matrix using the highest protocol available.
        pickle.dump(getattr(model1, attr), f, pickle.HIGHEST_PROTOCOL)

In [67]:
# confirm reload: read the saved MLE context embeddings back from disk
# (this re-binds C_embedding, replacing the array loaded earlier)
filename = SAVE_TO + '/en_es_mle_600K_cw4_V_dec18.pkl'
with open(filename, 'rb') as f:
    # NOTE: pickle.load can execute arbitrary code; fine here because this
    # notebook wrote the file itself.
    C_embedding = pickle.load(f)

# Fail loudly if the round trip changed anything, instead of comparing the
# printed arrays by eye.
np.testing.assert_array_equal(C_embedding, model1.context_embeddings)

C_embedding

array([[ -1.05455947e-04,   2.86336963e-05,  -1.84920864e-04, ...,
         -1.58480965e-04,  -1.00291945e-04,  -1.74433619e-04],
       [  2.75573053e-04,  -1.77735696e-04,   3.85022897e-04, ...,
          1.47817482e-04,   3.12980497e-04,  -2.16852786e-04],
       [ -1.72919987e-04,  -1.40841832e-04,   1.59651914e-04, ...,
         -2.03922173e-04,  -1.85943486e-06,   2.17361230e-04],
       ..., 
       [  6.11688884e-05,   2.09798236e-04,  -3.35832774e-05, ...,
         -4.93436892e-05,   5.14835165e-06,  -1.18142401e-04],
       [ -4.56703710e-05,   2.44815776e-04,  -2.59716689e-05, ...,
         -1.92585194e-05,  -3.75785567e-05,  -1.30966378e-04],
       [  4.98521388e-07,   9.05969064e-05,  -3.35373807e-05, ...,
         -2.10497194e-04,   2.58300832e-04,   1.47561132e-05]], dtype=float32)