# Cosine Similarity Substitution
`w266 Final Project: Crosslingual Word Embeddings`

Instead of training on randomly substituted words, here we'll choose the translation that is closest (by cosine similarity) to the context embedding vector.

# Notebook Setup

In [1]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

Set base paths depending on your machine.

In [2]:
# Machine-specific roots: BASE holds the data artifacts, PROJ the project repo.
BASE = '/home/mmillervedam/Data'
PROJ = '/home/mmillervedam/ProjectRepo'
# Alternative PROJ location for a different machine:
#PROJ = '/Users/mona/OneDrive/repos/final_proj/W266-Fall-2017-Final-Project'

# Raw parallel-corpus text files (absolute paths, independent of BASE).
FULL_EN_ES = "/home/miwamoto/en_es_shuf.txt"
FULL_EN_IT = "/home/miwamoto/en_it_shuf.txt"

# Pickled vocabulary indices, stored under BASE/vocab.
VOCAB_EN_ES = '{}/vocab/en_es_index.pkl'.format(BASE)
VOCAB_EN_IT = '{}/vocab/en_it_index.pkl'.format(BASE)

# Pickled PanLex translation dictionaries, stored under BASE/panlex.
PANLEX_EN_ES = '{}/panlex/en_es_dict.pkl'.format(BASE)
PANLEX_EN_IT = '{}/panlex/en_it_dict.pkl'.format(BASE)

# Output directory for pickled embeddings.
SAVE_TO = '{}/embeddings'.format(BASE)

# English - Spanish
### Load Data, Dict & Vocab

In [3]:
from parsing import Corpus, BilingualVocabulary, batch_generator

In [4]:
# Build the English-Spanish Corpus object from the raw text file at FULL_EN_ES.
# NOTE(review): whether Corpus reads the file eagerly or lazily is decided in
# parsing.py and is not visible from this notebook.
en_es_data = Corpus(FULL_EN_ES)

In [5]:
# Read the pickled PanLex en-es translation dictionary from PANLEX_EN_ES.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(PANLEX_EN_ES, 'rb') as panlex_file:
    en_es_translations = pickle.load(panlex_file)

In [6]:
# Create an empty ('en', 'es') BilingualVocabulary, then load its contents
# from the pickled index stored at VOCAB_EN_ES.
en_es_vocab = BilingualVocabulary([], languages=('en', 'es'))
with open(VOCAB_EN_ES, 'rb') as vocab_file:
    en_es_vocab.load_from_index(pickle.load(vocab_file))

In [7]:
# Sanity check: report how many PanLex translations were loaded and the
# size and languages of the vocabulary.
print('... loaded %s panlex translations' % len(en_es_translations))
# A space separates the language tuple from the word "vocabulary"; the
# previous format string ran them together as "('en', 'es')vocabulary".
print('... loaded %s word %s vocabulary' % (en_es_vocab.size, en_es_vocab.language))

... loaded 702982 panlex translations
... loaded 200003 word ('en', 'es')vocabulary


### Initialize the model

In [25]:
from models import BiW2V_nn

# Value passed to the model as its H argument.
EMBEDDING_SIZE = 200

# Instantiate the BiW2V_nn model with the PanLex translation dictionary
# and the en/es vocabulary loaded above.
model = BiW2V_nn(
    bilingual_dict=en_es_translations,
    vocab=en_es_vocab,
    H=EMBEDDING_SIZE,
)

# Initialize the TF graphs: core first, then training, then validation.
model.BuildCoreGraph()
model.BuildTrainingGraph()
model.BuildValidationGraph()

... TF graph created for BiW2V model.
... TF graph created for BiW2V training.
... TF graph created for BiW2V validation.


### Training

In [26]:
# Settings for the training run.
nBATCHES = 1  # less than 1 epoch
BATCH_SIZE = 48
WINDOW_SIZE = 4
MAX_EPOCHS = 5  # fail safe

# Batch generator over the en-es corpus, configured with the settings above.
DATA_GENERATOR = batch_generator(
    en_es_data,
    en_es_vocab,
    BATCH_SIZE,
    WINDOW_SIZE,
    MAX_EPOCHS,
)

# Vocabulary ids of a few English and Spanish probe words.
TEST_WORDS = en_es_vocab.to_ids(
    ['en_the', 'en_last', 'es_mundo', 'es_real']
)