# Creating Smaller Vocabularies
`w266 Final Project: Crosslingual Word Embeddings`

Training on the full vocabularies was taking too long, so we cut each language down to 10K words, keeping only words that appear in the Panlex dictionary.

### Notebook Setup

In [1]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

In [41]:
# root data directory; most vocab index and Panlex paths below are built from it
BASE = '/home/miwamoto/Data'
# project repository root (not referenced by the other cells visible in this notebook)
PROJ = '/home/mmillervedam/ProjectRepo'
#PROJ = '/Users/mona/OneDrive/repos/final_proj/W266-Fall-2017-Final-Project'

# vocabularies
# NOTE(review): the en-it index is read from a different home directory than
# the other three (which use BASE) -- confirm this is intentional
VOCAB_EN_IT = '/home/mmillervedam/Data/vocab/en_it_index.pkl'
VOCAB_EN_NL = BASE + '/vocab/en_nl_index.pkl'
VOCAB_EN_ES = BASE + '/vocab/en_es_index.pkl'
VOCAB_EN_JA = BASE + '/vocab/en_ja_index.pkl'

# panlex dicts
PANLEX_EN_IT = BASE + '/panlex/en_it_dict.pkl'
PANLEX_EN_NL = BASE + '/panlex/en_nl_dict.pkl'
PANLEX_EN_ES = BASE + '/panlex/en_es_dict.pkl'
PANLEX_EN_JA = BASE + '/panlex/en_ja_dict.pkl'

# directory to save pickled vocab indices
# (every '<pair>_small.pkl' file written below goes here)
SAVE_TO = '/home/mmillervedam/Data/vocab'

### Helper Functions

In [3]:
def load(panlex_fpath, large_vocab_fpath):
    """
    Load a pickled Panlex translation dictionary and a pickled vocabulary.

    Args:
        panlex_fpath: path to the pickled Panlex translations.
        large_vocab_fpath: path to the pickled (large) vocabulary index.

    Returns:
        (translations, vocab) tuple holding the two unpickled objects.

    NOTE: pickle.load can execute arbitrary code while unpickling; only
    point this at files you trust.
    """
    # load panlex dictionary
    with open(panlex_fpath, 'rb') as f:
        translations = pickle.load(f)
    print("... loaded %s Panlex translations"%(len(translations)))

    # load larger vocabulary
    with open(large_vocab_fpath, 'rb') as f:
        vocab = pickle.load(f)
    # Floor division keeps the count an int on Python 3 as well as Python 2.
    # Assumes 3 leading special tokens plus equally sized per-language blocks
    # (the layout make_small_vocab relies on) -- TODO confirm for new indices.
    nVocab = (len(vocab) - 3) // 2
    print("... loaded  vocab with %s words per language"%(nVocab))

    return (translations, vocab)

In [7]:
def make_small_vocab(translations, vocab, N):
    """
    Reduce vocab size to N per language.
    And limit to words with translations.

    Args:
        translations: mapping whose keys are the words that have a
            translation; only the keys are used.
        vocab: index -> word lookup. Indices 0-2 are skipped (special
            tokens); the remaining entries are treated as two equally
            sized, contiguous blocks: first language, then second language.
        N: maximum number of words to keep from each language.

    Returns:
        dict mapping new index -> word: 0-2 are '<s>', '</s>', '<unk>',
        followed by the kept first-language words and then the kept
        second-language words, each in their original vocab order.
    """
    wordset = set(translations.keys())
    # floor division keeps the block size an int (usable as an index and in
    # range()) on Python 3 as well as Python 2
    nVocab = (len(vocab) - 3) // 2

    def _take(start):
        # Collect up to N translatable words from the nVocab-sized block that
        # begins at `start`, never reading past the end of that block.
        kept = []
        for idx in range(start, start + nVocab):
            if len(kept) >= N:
                break
            wrd = vocab[idx]
            if wrd in wordset:
                kept.append(wrd)
        return kept

    # first language: indices 3 .. nVocab + 2
    first = _take(3)
    print("... kept %s words from first language"%(len(first)))
    # second language: indices nVocab + 3 .. 2 * nVocab + 2; capped at N on
    # its own so a short first language cannot inflate the second
    second = _take(3 + nVocab)
    print("... kept %s words from second language"%(len(second)))

    return dict(enumerate(['<s>','</s>','<unk>'] + first + second))

# English - Italian

In [None]:
# load the English-Italian Panlex translations and the full en-it vocab index
panlex, vocab = load(PANLEX_EN_IT, VOCAB_EN_IT)

In [None]:
# shrink to a target of 10,000 words per language, restricted to words with a Panlex entry
en_it_small = make_small_vocab(panlex, vocab, 10000)

In [None]:
# take a look / test
# (20003 is expected when both languages reach 10,000 words: 2 * 10000 + 3 special tokens)
len(en_it_small)

In [None]:
# save to file: pickle the reduced index -> word dict as SAVE_TO/en_it_small.pkl
with open(SAVE_TO + '/en_it_small.pkl','wb') as f:
    pickle.dump(en_it_small, f, pickle.HIGHEST_PROTOCOL)

# English - Dutch

In [5]:
# load the English-Dutch Panlex translations and the full en-nl vocab index
panlex_nl, vocab_nl = load(PANLEX_EN_NL, VOCAB_EN_NL)

... loaded 437931 Panlex translations
... loaded  vocab with 100000 words per language


In [8]:
# shrink to a target of 10,000 words per language, restricted to words with a Panlex entry
en_nl_small = make_small_vocab(panlex_nl, vocab_nl, 10000)

... kept 10000 words from first language
... kept 10000 words from second language


In [9]:
# sanity check: expect 2 * 10000 words + 3 special tokens = 20003
len(en_nl_small)

20003

In [10]:
# save to file: pickle the reduced index -> word dict as SAVE_TO/en_nl_small.pkl
with open(SAVE_TO + '/en_nl_small.pkl','wb') as f:
    pickle.dump(en_nl_small, f, pickle.HIGHEST_PROTOCOL)

# English - Spanish

In [11]:
# load the English-Spanish Panlex translations and the full en-es vocab index
panlex_es, vocab_es = load(PANLEX_EN_ES, VOCAB_EN_ES)

... loaded 702982 Panlex translations
... loaded  vocab with 100000 words per language


In [12]:
# shrink to a target of 10,000 words per language, restricted to words with a Panlex entry
en_es_small = make_small_vocab(panlex_es, vocab_es, 10000)

... kept 10000 words from first language
... kept 10000 words from second language


In [29]:
# first check: expect 2 * 10000 words + 3 special tokens = 20003
len(en_es_small)

20003

In [35]:
# second check: spot-check that sample Spanish words are present in the reduced vocab
# (build the value set once instead of once per membership test)
es_small_words = set(en_es_small.values())
print('es_hoy' in es_small_words)
print('es_mundo' in es_small_words)

True
True


In [33]:
# third check: indices 0-2 are special tokens and 3-10002 hold the 10,000
# first-language words, so index 10004 should be a Spanish ('es_') word
en_es_small[10004]

'es_la'

In [20]:
# save to file: pickle the reduced index -> word dict as SAVE_TO/en_es_small.pkl
with open(SAVE_TO + '/en_es_small.pkl','wb') as f:
    pickle.dump(en_es_small, f, pickle.HIGHEST_PROTOCOL)

# English - Japanese

In [42]:
# load the English-Japanese Panlex translations and the full en-ja vocab index
panlex_ja, vocab_ja = load(PANLEX_EN_JA, VOCAB_EN_JA)

... loaded 634705 Panlex translations
... loaded  vocab with 100000 words per language


In [43]:
# shrink to a target of 10,000 words per language, restricted to words with a Panlex entry
en_ja_small = make_small_vocab(panlex_ja, vocab_ja, 10000)

... kept 10000 words from first language
... kept 10000 words from second language


In [44]:
# first check: expect 2 * 10000 words + 3 special tokens = 20003
len(en_ja_small)

20003

In [46]:
# second check: confirm a sample English word is present in the reduced vocab
'en_fun' in en_ja_small.values()

True

In [47]:
# third check: indices 0-2 are special tokens and 3-10002 hold the 10,000
# first-language words, so index 10004 should be a Japanese ('ja_') word
en_ja_small[10004]

'ja_\xe6\x9c\x88'

In [48]:
# save to file: pickle the reduced index -> word dict as SAVE_TO/en_ja_small.pkl
with open(SAVE_TO + '/en_ja_small.pkl','wb') as f:
    pickle.dump(en_ja_small, f, pickle.HIGHEST_PROTOCOL)