# Simple Substitution
`w266 Final Project: Crosslingual Word Embeddings`

The code in this notebook was used to develop an algorithm to generate crosslingual word embeddings by training on a monolingual corpus and substituting translations at runtime.

# Notebook Setup

In [61]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# filepaths
BASE = '/Users/mmillervedam/Documents/MIDS/w266' #'/home/mmillervedam/' 
PROJ = '/Users/mmillervedam/Documents/MIDS/w266/FinalProject'#'/home/mmillervedam/ProjectRepo'
FPATH_EN = BASE + '/Data/test/wiki_en_10K.txt' # first 10000 lines from wiki dump
FPATH_ES = BASE + '/Data/test/wiki_es_10K.txt' # first 10000 lines from wiki dump
#FULL_EN = BASE + '/Data/en/full.txt'
#FULL_ES = BASE + '/Data/es/full.txt'
EN_ES_DICT = PROJ +'/XlingualEmb/data/dicts/en.es.panlex.all.processed'
EN_IT_DICT  = PROJ +'/XlingualEmb/data/dicts/en.it.panlex.all.processed'
EN_IT_RAW = PROJ + '/XlingualEmb/data/mono/en_it.shuf.10k'
EN_IT_RAW = PROJ + '/XlingualEmb/data/mono/en_it.shuf.10k'

In [66]:
# directory to save pickled embeddings
SAVE_TO = PROJ + '/Notebooks/embeddings'

# Load & Preprocess Data
__`ORIGINAL AUTHORS SAY:`__ "Normally, the monolingual word embeddings are trained on billions of words. However, getting that much of monolingual data for a low-resource language is also challenging. That is why we only select the top 5 million sentences (around 100 million words) for each language." - _Section 5.1, Duong et. al._ 

In [3]:
from parsing import Corpus, Vocabulary, batch_generator

### Corpus

In [4]:
# load corpus
en_it_data = Corpus(EN_IT_RAW)

In [5]:
# Corpus Stats
!wc {EN_IT_RAW}

   20000  430928 3746786 /Users/mmillervedam/Documents/MIDS/w266/FinalProject/XlingualEmb/data/mono/en_it.shuf.10k


__`i.e.:`__ 20K sentences (10K in each language) with ~430K tokens
> So this must not be their full data For now, I'm just going to look at the top 20K words and see what happens. In reality we should probably modify the Vocab class so that it explicily collects the top words for each language separately and then concatenates the index.

### Dictionary

In [6]:
# loading english-italian dictionary
pld = pd.read_csv(EN_IT_DICT, sep='\t', names = ['en', 'it'], dtype=str)
en_set = set(pld.en.unique())
it_set = set(pld.it.unique())

In [7]:
# dictionary vocab lengths:
print('EN:', len(en_set))
print('IT:', len(it_set))

EN: 266450
IT: 258641


In [8]:
# Create dictionary for ease of runtime translation
# WARNING this takes a sec to run
bi_dict = pld.groupby(['en'])['it'].unique().to_dict()

In [9]:
# add other direction
# WARNING this takes another sec to run
bi_dict.update(pld.groupby(['it'])['en'].unique().to_dict())

In [10]:
# demo en to it
bi_dict['en_go'][:5]

array(['it_aggirare', 'it_andai', 'it_andara', 'it_andare',
       'it_andare_avanti'], dtype=object)

In [11]:
# demo it to en
bi_dict['it_ciao'][:5]

array(['en_adieu', 'en_bye-bye', 'en_bye', 'en_cheerio', 'en_ciao'], dtype=object)

### Vocabulary

In [12]:
# train multilingual Vocabulary
en_it_vocab = Vocabulary(en_it_data.gen_tokens(), size = 50000)

In [13]:
# length of corpus vocabulary
en_it_vocab.size

48579

In [14]:
# overlap with dictionary vocabulary
len([w for w in en_it_vocab.types if w in bi_dict])

24176

### CBOW Data Generator
__`CHECK PAPER for HYPERPARAMS!`__: I can't seem to find where they talk abou the context window size, embedding size and batch size they use -- it may actually be in the Vulic and Moens paper instead of the Duong one.

__`RLH Update`__: Duong et al. section 6, footnote 4: "Default learning rate of 0.025, negative sampling with 25 samples, subsampling rate of value 1e−4, embedding dimension d = 200, window size cs = 48 and run for 15 epochs"


In [15]:
BATCH_SIZE = 20
WINDOW_SIZE = 1
MAX_EPOCHS = 1 # fail safe

In [16]:
batched_data = batch_generator(en_it_data, 
                               en_it_vocab, 
                               BATCH_SIZE, 
                               WINDOW_SIZE, 
                               MAX_EPOCHS)

In [17]:
# sanity check
for context, label in batched_data:
    print("CONTEXT IDS:", context[:5])
    print("CONTEXT:", [en_it_vocab.to_words(c) for c in context[:5]])
    print("LABEL IDS:", label[:5])
    print("LABELS:", en_it_vocab.to_words(label[:5]))
    break

CONTEXT IDS: [[0, 1], [0, 1], [0, 34], [20, 15624], [34, 1584]]
CONTEXT: [['<s>', '</s>'], ['<s>', '</s>'], ['<s>', 'it_un'], ['it_in', 'it_remoto'], ['it_un', 'it_passato']]
LABEL IDS: [43790, 24849, 20, 34, 15624]
LABELS: ['it_[[877881]]', 'it_[[879362]]', 'it_in', 'it_un', 'it_remoto']


# Fun Validation Words

In [18]:
en_it_vocab.to_ids(['en_the','en_first', 'it_nuovo', 'it_parola'])

[3, 84, 669, 6646]

In [19]:
bi_dict['en_the']

array(['it_della', 'it_gli', 'it_i', 'it_il', 'it_la', 'it_l\xc3\xa0',
       'it_le', 'it_lo', 'it_ma'], dtype=object)

In [20]:
bi_dict['en_first']

array(['it_anteriore', 'it_anteriormente', 'it_antico', 'it_anzitutto',
       'it_anzi_tutto', 'it_avvio', 'it_dapprima', 'it_davanti',
       'it_di_fronte', 'it_il_primo', 'it_in', 'it_in_cima', 'it_inizio',
       'it_innanzitutto', 'it_innanzi_tutto', 'it_in_primis',
       'it_in_primo_luogo', 'it_la_prima', 'it_per_la_prima_volta',
       'it_per_primo', 'it_precedente', 'it_prima', 'it_prima_di_tutto',
       'it_primariamente', 'it_primario', 'it_prime', 'it_primieramente',
       'it_primiero', 'it_primo', 'it_principale', 'it_principio'], dtype=object)

In [21]:
bi_dict['it_nuovo']

array(['en_fresh', 'en_green', 'en_latter-day', 'en_new', 'en_novel',
       'en_raw', 'en_recent', 'en_renewed', 'en_unexampled', 'en_unused',
       'en_young'], dtype=object)

In [22]:
bi_dict['it_parola']

array(['en_drake', 'en_language', 'en_mot', 'en_parole', 'en_promise',
       'en_-shaped', 'en_speech', 'en_term', 'en_tongue', 'en_verb',
       'en_vocable', 'en_word_of_honor', 'en_word'], dtype=object)

# Base Model - no word sub yet!
__`CODE NOTES:`__ To get this running I had to hard code the context length (set to 2) inside `BuildCoreGraph()` where we generate `self.input_` in line 102. That should really be inferred from the `self.context_` itself but it doesn't seem to like the placeholder dimension (we don't have a span length until runtime). Does tensorflow not have a vectorized average? Something to fix (later). I also had to hard code the number of samples for softmax (I had originally put this as a `tf.placeholder_with_default` thinking we could pass it in to the training function (since its a training parameter) but TF kicked out an error message asking for an integer so for now I'll just give it what it wants. I need to think more about why TF doesn't want this changing from batch to batch. (or if there is another reason it wants an int).

### Fresh Data Generator

In [46]:
BATCH_SIZE = 20
WINDOW_SIZE = 1
MAX_EPOCHS = 15 # fail safe

batched_data = batch_generator(en_it_data, en_it_vocab, BATCH_SIZE, 
                               WINDOW_SIZE, MAX_EPOCHS)

### Initialize the model

In [47]:
from models import BiW2V

EMBEDDING_SIZE = 200

# create model
model = BiW2V(index = en_it_vocab.index, H = EMBEDDING_SIZE)

# intialize TF graphs
model.BuildCoreGraph()
model.BuildTrainingGraph()
model.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

__`IMPORTANT!`__ right now the model only works with a window of 1 because the feed dict can't handle context windows of different lengths. We'll either need to figure out how to have a variable length dimension or else add extra padding to the sentences to account for the window size.

In [48]:
# time
start = time.time()

# training parameters
TEST_WORDS = [3, 84, 669, 6646] # en_the, en_first, it_nuovo, it_parole
nBATCHES = 300000 # ~ 14 epochs
DATA_GENERATOR = batched_data

# training call
model.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate = 0.15)
tot = (time.time() - start)
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(48579, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(48579, 200) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(48579,) dtype=float32_ref>
... Starting Training
... STEP  0 : Average Loss : 0.00021203883489
   >>> [en_the] nbrs:  it_perde, en_manager, it_realizzarsi, en_overseeing, it_flags, en_troad, it_chiusi, it_reintegrata,
   >>> [en_first] nbrs:  en_anarcho-pacifism, it_suggerimento, en_boroughs, en_teesside, en_deprived, en_bulletin, en_predation, it_gobbo,
   >>> [it_nuovo] nbrs:  it_indifferentemente, it_porche, it_bondi, it_carlo, it_indennizzare, en_principlism, en_consult, it_nicaragua,
   >>> [it_parola] nbrs:  it_configurazione, it_l'eterogeneità, en_belleville, en_outsiders, en_respectively, it_mercatini, it_[[877979]], it_calcificazione,
... STEP  30000 : Average Loss : 3.95838090201
... STEP  60000 : Average Loss : 2.98953760585
   >>> [en_the]

__NOTES:__ This is just a context of 1 (ie. window = 3) and there's no bilingual signal. When I ran it w/ the default learning rate there was mad overfitting for `the`'s neighbors but `first` had some much better results (eg. `third` and `only`). It would be interesting to really tune the hyperparamters to see how good we could do (this is essentially monolingual word2vec with two languages)... as a point of comparison for the bilingual versions below.

In [49]:
# take a look at the embeddings
model.context_embeddings

array([[  1.02402642e-03,   4.16290626e-04,   8.46864350e-05, ...,
         -1.02722656e-03,  -7.92390842e-04,   8.39135144e-04],
       [  3.52726347e-04,  -4.58918861e-04,   2.22952978e-04, ...,
         -4.55987407e-04,   8.35586034e-05,   2.18325004e-04],
       [  4.07747721e-04,  -4.21624805e-04,  -3.38321872e-04, ...,
          9.61679689e-06,   3.33360600e-04,   2.10861690e-04],
       ..., 
       [ -4.02371108e-04,   3.36044555e-04,   2.32033111e-04, ...,
          2.27759840e-04,   1.72980988e-04,   3.51097609e-04],
       [ -2.04241878e-04,  -1.52394496e-04,   3.81836639e-04, ...,
          1.73038512e-04,  -1.08486944e-04,   1.55255693e-04],
       [  1.15991745e-04,  -3.61194252e-04,  -3.92610062e-04, ...,
          5.17776061e-04,   4.07395535e-04,   2.53971462e-04]], dtype=float32)

__`Hmmmm...`__ These don't look normalized to me. Something to return to?

# Model with Random Translation

### Fresh Data

In [57]:
BATCH_SIZE = 20
WINDOW_SIZE = 1
MAX_EPOCHS = 30 # fail safe

batched_data = batch_generator(en_it_data, en_it_vocab, BATCH_SIZE, 
                               WINDOW_SIZE, MAX_EPOCHS)

### Initialize

In [58]:
from models import BiW2V_random

EMBEDDING_SIZE = 128

# create model
model2 = BiW2V_random(('en', 'it'), bi_dict, en_it_vocab.to_ids,
                      index = en_it_vocab.index, 
                      H = EMBEDDING_SIZE)

# intialize TF graphs
model2.BuildCoreGraph()
model2.BuildTrainingGraph()
model2.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Train

In [59]:
# training parameters
TEST_WORDS = [3, 84, 669, 6646] # en_the, en_first, it_nuovo, it_parole
nBATCHES = 600000 # ~ 14 epochs
DATA_GENERATOR = batched_data

In [60]:
# training call
start = time.time()
model2.train(nBATCHES, DATA_GENERATOR, TEST_WORDS, learning_rate = 0.05)
tot = (time.time() - start)
print('... {} batches trained in {} seconds'.format(nBATCHES, tot))

... Model Initialized
	 <tf.Variable 'Embedding_Layer/ContextEmbeddings:0' shape=(48579, 128) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/WordEmbeddings:0' shape=(48579, 128) dtype=float32_ref>
	 <tf.Variable 'Hidden_Layer/b:0' shape=(48579,) dtype=float32_ref>
... Starting Training
... STEP  0 : Average Loss : 0.000111448200544
   [en_the] sim words:  en_psychical, en_slogan, it_balbo, it_race, en_1/4, it_regina, it_causando, it_[[879322]],
   [en_first] sim words:  it_interviú, it_fermarsi, it_tanfo, it_navate, en_delivers, en_sworn, it_avvocatura, it_fortificare,
   [it_nuovo] sim words:  en_dismissal, it_bevve, en_septa, en_implements, it_telegiornale, it_marcello, it_acciaio, it_roxx,
   [it_parola] sim words:  en_censorship, en_minima, it_profesional, it_ritengono, it_michail, en_الأول, it_mckenna, en_prevention,
... STEP  60000 : Average Loss : 4.30245931981
... STEP  120000 : Average Loss : 3.33011244143
   [en_the] sim words:  en_a, en_,, en_., en_in, en_and, en_'s, en_of,

__`NOTES:`__ Same words look reasonable in the English examples. I'd be interesting in training this longer to see if that helps but its probably worth fixing the context window issue first.

In [70]:
# saving final embeddings in case we want to do more stuff later
filename = SAVE_TO + '/en_it_rand_600K_cw1_V_dec15.pkl'
with open(filename, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model2.context_embeddings, f, pickle.HIGHEST_PROTOCOL)

filename = SAVE_TO + '/en_it_rand_600K_cw1_U_dec15.pkl'
with open(filename, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model2.word_embeddings, f, pickle.HIGHEST_PROTOCOL)

In [73]:
# confirm:
filename = SAVE_TO + '/en_it_rand_600K_cw1_U_dec15.pkl'
with open(filename, 'rb') as f:
    C_embedding = pickle.load(f)
    
C_embedding

array([[ -1.42820040e-02,  -1.08593737e-03,  -9.77827888e-03, ...,
         -7.43078999e-03,  -9.79084056e-04,  -7.75979646e-03],
       [  6.15398725e-03,   3.99457384e-03,  -7.21636403e-04, ...,
         -4.45156882e-04,   4.76947054e-03,   4.01509507e-03],
       [  1.42526999e-03,  -2.62570567e-03,   6.83171733e-04, ...,
         -2.14850740e-03,  -6.21526386e-04,   8.00127018e-05],
       ..., 
       [ -4.83615768e-05,  -7.35577720e-04,   2.69850646e-03, ...,
         -1.72964076e-03,   2.55509047e-03,   6.92204339e-04],
       [  3.46941641e-03,   5.76911087e-04,   7.60798051e-04, ...,
          3.49387084e-03,   3.47503019e-03,  -1.87479728e-03],
       [ -7.38707022e-04,  -1.68911624e-03,  -2.75655207e-03, ...,
         -2.23345775e-03,  -2.73358473e-03,   1.52106478e-03]], dtype=float32)

# Viz and Save

In [99]:
wrds = "en_the en_a en_this en_'s en_an en_their en_its en_these en_his \
       en_first en_on en_in en_for en_to en_with en_are en_. en_all \
       it_nuovo it_di it_un it_, <s> it_i it_con it_è it_più it_parola \
       en_censorship en_minima it_profesional it_michail en_الأول \
       it_ritengono en_prevention it_mckenna en_third".split()

In [102]:
for w in ["en_the", "en_first", "it_nuovo", "it_parole"]:
    wrds.append(bi_dict[w])

In [105]:
wordset = set(en_it_vocab.to_ids(list(wrds)))

TypeError: unhashable type: 'numpy.ndarray'

In [88]:
bi_dict['it_nuovo']

array(['en_fresh', 'en_green', 'en_latter-day', 'en_new', 'en_novel',
       'en_raw', 'en_recent', 'en_renewed', 'en_unexampled', 'en_unused',
       'en_young'], dtype=object)

In [None]:
model2.plot_embeddings_in_2D()