# English - Dutch Embeddings (3 versions)
`w266 Final Project: Crosslingual Word Embeddings`

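This notebook trains three English-Dutch embedding variants that differ in how a word's translation is substituted during training: at random (Method 1), by the most common target translation (Method 2), or - instead of training on randomly substituted words - by choosing the translation that is closest to the context embedding vector (Method 3).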

# Notebook Setup

In [1]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

__Base Paths__

In [8]:
# Data roots (one per user account) and the project checkout.
MV_BASE = '/home/mmillervedam/Data'
MI_BASE = '/home/miwamoto/Data'
PROJ = '/home/mmillervedam/ProjectRepo'
# Alternate PROJ location, kept for reference:
#PROJ = '/Users/mona/OneDrive/repos/final_proj/W266-Fall-2017-Final-Project'

# Pickled embeddings are written under this directory.
SAVE_TO = MV_BASE + '/embeddings'

# Folder inside the repo that holds the ground-truth translation files.
GTT_BASE = PROJ + '/BaselineModels/data/ground_truth_translations/'

__Globals__ - _the parameters below determine all 3 models in this NB (note: the Method 3 training cell overrides `nBATCHES` to 5000)_

In [9]:
# --- Data: language pair and input file locations ---
LANG = ('en', 'nl')
FULL_TEXT = "/home/miwamoto/en_nl_shuf.txt"
VOCAB_INDEX = MV_BASE + '/vocab/en_nl_small.pkl'
PANLEX = MI_BASE + '/panlex/en_nl_dict.pkl'
# The ground-truth file name lists the second language first (LANG reversed).
GTT_PATH = GTT_BASE + "%s-%s-clean.csv" % LANG[::-1]

# --- Model ---
EMBEDDING_SIZE = 200

# --- Training ---
nBATCHES = 100000 # <<< 1 epoch with our 1 million sentence corpus
BATCH_SIZE = 48
WINDOW_SIZE = 4
MAX_EPOCHS = 5 # fail safe
ALPHA = 0.5 # authors use a much smaller learning rate but train longer

# Load Data

In [4]:
from parsing import Corpus, BilingualVocabulary, batch_generator, get_common_words

In [5]:
# load corpus: instantiate the Corpus class (imported from parsing) on the
# text file at FULL_TEXT
raw_data = Corpus(FULL_TEXT)

In [6]:
# Read the pickled panlex dictionary from disk.
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
with open(PANLEX, 'rb') as panlex_file:
    translations = pickle.load(panlex_file)

In [10]:
# Construct the bilingual vocabulary from an empty list, then populate it
# from the pickled index stored at VOCAB_INDEX.
vocab = BilingualVocabulary([], languages=LANG)
with open(VOCAB_INDEX, 'rb') as index_file:
    saved_index = pickle.load(index_file)
vocab.load_from_index(saved_index)

In [11]:
# Sanity check: report how many translations and vocabulary words were loaded.
print('... loaded %s panlex translations' % len(translations))
print('... loaded %s word %s vocabulary' % (vocab.size, vocab.language))

... loaded 437931 panlex translations
... loaded 20003 word ('en', 'nl') vocabulary


In [26]:
# Validation words (two English, two Dutch) used for the training printout.
validation_words = ['en_the', 'en_last', 'nl_voor', 'nl_aantal']
TEST_WORDS = vocab.to_ids(validation_words)
print('... test word ids:', TEST_WORDS)

... test word ids: [3, 226, 10010, 10065]


In [13]:
# Ground truth translations: space-separated file with no header row; the
# two columns are named after the languages (LANG[1] first, then LANG[0]).
gtt_columns = [LANG[1], LANG[0]]
GTT_DF = pd.read_csv(GTT_PATH, sep=' ', header=None, names=gtt_columns)
print('... loaded %s ground truth translations.' % len(GTT_DF))

... loaded 93854 ground truth translations.


In [14]:
# Evaluation words (for reporting recall): keep only the common words that
# start with the second language's code (LANG[1]).
# NOTE(review): the recorded run of this cell reported 0 evaluation words;
# confirm what get_common_words returns before relying on EVAL_IDS.
eval_words = []
for word in get_common_words(vocab):
    if word.startswith(LANG[1]):
        eval_words.append(word)
EVAL_IDS = vocab.to_ids(eval_words)
print('... loaded %s evaluation words.' % (len(EVAL_IDS)))

... loaded 0 evaluation words.


# Method 1: Random Translations

### Initialize the model

In [27]:
from models import BiW2V_random

# Method 1 model: built from the panlex dictionary and the shared vocabulary,
# with H set to EMBEDDING_SIZE.
model_1 = BiW2V_random(
    bilingual_dict=translations,
    vocab=vocab,
    H=EMBEDDING_SIZE,
)

# initialize the TF graphs: core first, then training, then validation
model_1.BuildCoreGraph()
model_1.BuildTrainingGraph()
model_1.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [28]:
# fresh data generator (a new one is created before each model's training run)
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [29]:
# Train Method 1 for nBATCHES batches and report the wall-clock time taken.
t_begin = time.time()
model_1.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate=ALPHA)
elapsed = time.time() - t_begin
print('... {} batches trained in {} seconds'.format(nBATCHES, elapsed))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.00147580852509
   [en_the] closest:  en_boycott, en_grades, en_uefa, en_minnesota, en_perhaps, en_punt, nl_ballet, en_banking,
   [en_last] closest:  en_sabbath, en_swords, en_orthodoxy, en_organisation, nl_schoon, en_lab, nl_stal, nl_doetinchem,
   [nl_voor] closest:  en_coup, nl_born, nl_grip, nl_statuten, en_rather, nl_zand, en_rebellion, en_branches,
   [nl_aantal] closest:  nl_oosterhout, en_mushroom, en_session, nl_ingesloten, nl_king, en_fire, nl_duiven, nl_opzicht,
... STEP 10000 : Average Loss : 4.26000624292
... STEP 20000 : Average Loss : 3.88555000093
   [en_the] closest:  en_a, en_his, en_minnesota, en_grades, en_boycott, en_uefa, nl_westwood, en_garland,
   [e

### Save the Embeddings.

In [30]:
# Context embeddings of the Method 1 model -> "_V" file.
context_path = SAVE_TO + '/en_nl_rand_100K_V_dec19.pkl'
with open(context_path, 'wb') as out_file:
    # Use the highest pickle protocol available.
    pickle.dump(model_1.context_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

# Word embeddings of the Method 1 model -> "_U" file.
word_path = SAVE_TO + '/en_nl_rand_100K_U_dec19.pkl'
with open(word_path, 'wb') as out_file:
    # Use the highest pickle protocol available.
    pickle.dump(model_1.word_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

# Method 2: Most Common Target Translation

### Initialize the model

In [31]:
from models import BiW2V_mle

# Method 2 model: built from the panlex dictionary and the shared vocabulary,
# with H set to EMBEDDING_SIZE.
model_2 = BiW2V_mle(
    bilingual_dict=translations,
    vocab=vocab,
    H=EMBEDDING_SIZE,
)

# initialize the TF graphs: core first, then training, then validation
model_2.BuildCoreGraph()
model_2.BuildTrainingGraph()
model_2.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [32]:
# fresh data generator (a new one is created before each model's training run)
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [33]:
# Train Method 2 for nBATCHES batches and report the wall-clock time taken.
t_begin = time.time()
model_2.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate=ALPHA)
elapsed = time.time() - t_begin
print('... {} batches trained in {} seconds'.format(nBATCHES, elapsed))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.00148056278229
   [en_the] closest:  en_heats, nl_continu, nl_vijftig, en_sioux, nl_restaureren, en_enduring, nl_burggraaf, nl_jaartelling,
   [en_last] closest:  nl_procureur, nl_jagers, nl_normaal, en_wade, nl_onderzoek, nl_leeuw, en_gum, en_skip,
   [nl_voor] closest:  en_ghana, nl_plafond, en_bilingual, nl_grotendeels, nl_stuart, en_appearances, en_joaquin, nl_luisteren,
   [nl_aantal] closest:  nl_roer, en_silesia, en_irrigation, nl_kwalificeren, nl_jommeke, en_medley, nl_afgelegen, nl_limiet,
... STEP 10000 : Average Loss : 4.04762265734
... STEP 20000 : Average Loss : 3.6590352628
   [en_the] closest:  en_a, en_heats, en_and, en_passive, nl_continu, nl_restaureren, e

### Save the Embeddings.

In [35]:
# Context embeddings of the Method 2 model -> "_V" file.
context_path = SAVE_TO + '/en_nl_mle_100K_V_dec19.pkl'
with open(context_path, 'wb') as out_file:
    # Use the highest pickle protocol available.
    pickle.dump(model_2.context_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

# Word embeddings of the Method 2 model -> "_U" file.
word_path = SAVE_TO + '/en_nl_mle_100K_U_dec19.pkl'
with open(word_path, 'wb') as out_file:
    # Use the highest pickle protocol available.
    pickle.dump(model_2.word_embeddings, out_file, pickle.HIGHEST_PROTOCOL)

# Method 3: Closest Translation

### Initialize the model

In [36]:
from models import BiW2V_nn

# Method 3 model: built from the panlex dictionary and the shared vocabulary,
# with H set to EMBEDDING_SIZE.
model_3 = BiW2V_nn(
    bilingual_dict=translations,
    vocab=vocab,
    H=EMBEDDING_SIZE,
)

# initialize the TF graphs: core first, then training, then validation
model_3.BuildCoreGraph()
model_3.BuildTrainingGraph()
model_3.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [37]:
# fresh data generator (a new one is created before each model's training run)
DATA_GENERATOR = batch_generator(raw_data, vocab, BATCH_SIZE, WINDOW_SIZE, MAX_EPOCHS)

In [38]:
# Train Method 3 and report the wall-clock time taken.
# NOTE: this rebinds the notebook-wide nBATCHES (100000 in the Globals cell)
# to 5000; re-running an earlier training cell afterwards will also use 5000.
nBATCHES = 5000 # Takes too long w/ nn so we'll only do 5K
t_begin = time.time()
model_3.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate=ALPHA)
elapsed = time.time() - t_begin
print('... {} batches trained in {} seconds'.format(nBATCHES, elapsed))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(20003, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(20003,) dtype=float32_ref>
... Starting Training
... STEP 0 : Average Loss : 0.0290604896545
   [en_the] closest:  en_hospitality, en_communicate, en_nichols, nl_vastleggen, en_transformation, nl_heilige, nl_schuin, nl_verloor,
   [en_last] closest:  nl_aveyron, nl_valentine, nl_koersen, nl_aruba, en_suburban, en_armenians, en_virgil, en_blank,
   [nl_voor] closest:  en_correspondent, nl_duisburg, nl_steenbergen, nl_fleming, nl_achterste, en_combine, nl_stephen, nl_juventus,
   [nl_aantal] closest:  nl_estisch, nl_plato, nl_wijd, en_ms., en_voyage, en_belong, en_obsession, en_genetic,
... STEP 500 : Average Loss : 5.9849984839
... STEP 1000 : Average Loss : 5.24351522017
   [en_the] closest:  en_hospitality, nl_verloor, nl_laser, en_acronym, en_nichol

### Save the Embeddings.

In [40]:
# Save the Method 3 (closest-translation) embeddings.
# These files must be written from model_3; dumping model_2 here would store
# the Method 2 matrices under the "nn" file names.

# context embeddings -> "_V" file
filename = SAVE_TO + '/en_nl_nn_5K_V_dec19.pkl'
with open(filename, 'wb') as f:
    # Use the highest pickle protocol available.
    pickle.dump(model_3.context_embeddings, f, pickle.HIGHEST_PROTOCOL)

# word embeddings -> "_U" file
filename = SAVE_TO + '/en_nl_nn_5K_U_dec19.pkl'
with open(filename, 'wb') as f:
    # Use the highest pickle protocol available.
    pickle.dump(model_3.word_embeddings, f, pickle.HIGHEST_PROTOCOL)