# Dictionaries and Vocabularies
`w266 Final Project: Crosslingual Word Embeddings`

Saving Panlex dictionaries & Bilingual Vocabularies so that we don't have to do this over and over.

# Notebook Setup

In [64]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
## Maya's paths
#BASE = '/home/mmillervedam' #'/Users/mmillervedam/Documents/MIDS/w266' #
#PROJ = '/home/mmillervedam/ProjectRepo' #'/Users/mmillervedam/Documents/MIDS/w266/FinalProject'#

## Roseanna's paths

## Mona's gc paths
BASE = '/home/miwamoto' #'/home/mmillervedam/Data'
PROJ = '/home/miwamoto/W266-Fall-2017-Final-Project'#'/home/mmillervedam/ProjectRepo'

## Mona's local paths
#BASE = '/Users/mona/OneDrive/repos/Data' #'/home/mmillervedam/Data'
#PROJ = '/Users/mona/OneDrive/repos/final_proj/W266-Fall-2017-Final-Project'#'/home/mmillervedam/ProjectRepo'

In [67]:
# Data
EN_ES_DICT = PROJ +'/XlingualEmb/data/dicts/en.es.panlex.all.processed'
EN_IT_DICT = PROJ +'/XlingualEmb/data/dicts/en.it.panlex.all.processed'
EN_NL_DICT = PROJ +'/XlingualEmb/data/dicts/en.nl.panlex.all.processed'
EN_JA_DICT = PROJ +'/XlingualEmb/data/dicts/en.ja.panlex.all.processed'
FULL_EN_ES = "/home/miwamoto/en_es_shuf.txt"
FULL_EN_IT = "/home/miwamoto/en_it_shuf.txt"
FULL_EN_NL = "/home/miwamoto/en_nl_shuf.txt"
FULL_EN_JA = "/home/miwamoto/en_ja_shuf.txt"

EN_IT = PROJ + '/XlingualEmb/data/mono/en_it.shuf.10k'


__Custom Modules__

In [68]:
from parsing import Corpus, Vocabulary, BilingualVocabulary, batch_generator

# English - Spanish

### Corpus

In [None]:
# load corpus
en_es_data = Corpus(FULL_EN_ES)

In [None]:
# corpus stats
!wc {FULL_EN_ES}

### Panlex Dictionary

In [None]:
# loading english-spanish dictionary
pld = pd.read_csv(EN_ES_DICT, sep='\t', names = ['en', 'es'], dtype=str)
en_set = set(pld.en.unique())
es_set = set(pld.es.unique())

In [None]:
# dictionary vocab lengths:
print('EN:', len(en_set))
print('ES:', len(es_set))

In [None]:
# Create dictionary for ease of runtime translation
# WARNING this takes a sec to run
bi_dict = pld.groupby(['en'])['es'].unique().to_dict()

In [None]:
# add other direction
# WARNING this takes another sec to run
bi_dict.update(pld.groupby(['es'])['en'].unique().to_dict())

Saving to file.

In [None]:
# pickle it - THIS TAKES A MIN
with open(BASE + '/Data/panlex/en_es_dict.pkl','wb') as f:
    pickle.dump(bi_dict, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# confirm it saved - THIS ALSO TAKES A MIN
with open(BASE + '/Data/panlex/en_es_dict.pkl','rb') as f:
    en_es_dict = pickle.load(f)

Take a look.

In [None]:
# demo en to es
en_es_dict['en_the'][:5]

In [None]:
# demo it to en
en_es_dict['es_palabra'][:5]

### Vocabulary

In [None]:
# reload corpus
en_es_data = Corpus(FULL_EN_ES)

In [None]:
# train multilingual Vocabulary
en_es_vocab = BilingualVocabulary(en_es_data.gen_tokens(), 
                                  languages = ('en','es'), 
                                  size = 100000)

In [None]:
# length of corpus vocabulary
en_es_vocab.size

In [None]:
for i in range (99980,100020):
    print(en_es_vocab.index[i])

In [None]:
# transition between words in the bilingual vocab
idx1 = 3
idx2 = (en_es_vocab.size - 3) / 2 + 3
print(idx1 - 1 , en_es_vocab.index[idx1 - 1])
print(idx1, en_es_vocab.index[idx1])
print(idx2 - 1 , en_es_vocab.index[idx2 - 1])
print(idx2, en_es_vocab.index[idx2])

In [None]:
# overlap with dictionary vocabulary
len([w for w in en_es_vocab.types if w in en_es_dict])

__Sample of orphaned words:__

In [28]:
def print_orphans(vocab, bi_dict):
    x = 1
    for w in vocab:
        if w not in bi_dict:
            print(w)
            x += 1
        if x > 20:
            break

In [None]:
print_orphans(en_es_vocab.types, bi_dict)

__Saving the trained vocabulary__

In [None]:
# save the index to file
with open(BASE + '/Data/vocab/en_es_index.pkl','wb') as f:
    pickle.dump(en_es_vocab.index, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# confirm that it can be reloaded as a dictionary
with open(BASE + '/Data/vocab/en_es_index.pkl','rb') as f:
    en_es_index = pickle.load(f)

In [None]:
# confirm we can re-create the Vocab object
en_es_vocab2 = BilingualVocabulary([],languages=('en','es'))
en_es_vocab2.load_from_index(en_es_index)

In [None]:
en_es_vocab2.to_ids(['en_the', 'en_vocabulary', 'en_works'])

---
# English - Italian

### Corpus

In [116]:
# load corpus
en_it_data = Corpus(FULL_EN_IT)

In [117]:
# corpus stats
!wc {FULL_EN_IT}

  10000000  261678882 2270315455 /home/miwamoto/en_it_shuf.txt


### Panlex Dictionary

In [None]:
# loading english-spanish dictionary
pld = pd.read_csv(EN_IT_DICT, sep='\t', names = ['en', 'it'], dtype=str)
en_set = set(pld.en.unique())
it_set = set(pld.it.unique())

In [None]:
# dictionary vocab lengths:
print('EN:', len(en_set))
print('IT:', len(it_set))

In [None]:
# Create dictionary for ease of runtime translation
# WARNING this takes a sec to run
bi_dict = pld.groupby(['en'])['it'].unique().to_dict()

In [None]:
# add other direction
# WARNING this takes another sec to run
bi_dict.update(pld.groupby(['it'])['en'].unique().to_dict())

__Save en-es Panlex Dict to file__

In [None]:
# pickle it - THIS TAKES A MIN
with open(BASE + '/Data/panlex/en_it_dict.pkl','wb') as f:
    pickle.dump(bi_dict, f, pickle.HIGHEST_PROTOCOL)

In [118]:
# confirm it saved - THIS ALSO TAKES A MIN
with open(BASE + '/Data/panlex/en_it_dict.pkl','rb') as f:
    en_it_dict = pickle.load(f)

__Demo Words__

In [None]:
# demo en to it
en_it_dict['en_the'][:5]

In [None]:
# demo it to en
en_it_dict['it_ciao'][:5]

### Vocabulary

In [None]:
en_it_data = Corpus(FULL_EN_IT)

In [None]:
# train multilingual Vocabulary
en_it_vocab = BilingualVocabulary(en_it_data.gen_tokens(), 
                                  languages = ('en','it'), 
                                  size = 100000)

In [None]:
# length of corpus vocabulary
en_it_vocab.size

In [None]:
# transition between words in the bilingual vocab
idx1 = 3
idx2 = (en_it_vocab.size - 3) / 2 + 3
print(idx1 - 1 , en_it_vocab.index[idx1 - 1])
print(idx1, en_it_vocab.index[idx1])
print(idx2 - 1 , en_it_vocab.index[idx2 - 1])
print(idx2, en_it_vocab.index[idx2])

In [None]:
# overlap with dictionary vocabulary
len([w for w in en_it_vocab.types if w in en_it_dict])

In [None]:
# take a look at orphaned words
print_orphans(en_it_vocab.types, en_it_dict)

__Saving the trained vocabulary__

In [None]:
from parsing import BilingualVocabulary

In [None]:
# save the index to file
with open(BASE + '/Data/vocab/en_it_index.pkl','wb') as f:
    pickle.dump(en_it_vocab.index, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# confirm that it can be reloaded as a dictionary
with open(BASE + '/Data/vocab/en_it_index.pkl','rb') as f:
    en_it_index = pickle.load(f)

In [None]:
# confirm we can re-create the Vocab object
en_it_vocab2 = BilingualVocabulary([],languages=('en','it'))
en_it_vocab2.load_from_index(en_it_index)

In [None]:
en_it_vocab2.size

In [None]:
en_it_vocab2.to_ids(['en_the', 'en_vocabulary', 'en_works'])

---
# English - Italian 10K

### Corpus

In [69]:
# load corpus
en_it_data = Corpus(EN_IT)

In [70]:
# corpus stats
!wc {EN_IT}

  20000  430887 3746786 /home/miwamoto/W266-Fall-2017-Final-Project/XlingualEmb/data/mono/en_it.shuf.10k


### Panlex Dictionary

In [83]:
# loading english-spanish dictionary
pld = pd.read_csv(EN_IT_DICT, sep='\t', names = ['en', 'it'], dtype=str)
en_set = set(pld.en.unique())
it_set = set(pld.it.unique())

In [84]:
# dictionary vocab lengths:
print('EN:', len(en_set))
print('IT:', len(it_set))

EN: 266450
IT: 258641


In [85]:
# Create dictionary for ease of runtime translation
# WARNING this takes a sec to run
bi_dict = pld.groupby(['en'])['it'].unique().to_dict()

In [86]:
# add other direction
# WARNING this takes another sec to run
bi_dict.update(pld.groupby(['it'])['en'].unique().to_dict())

__Save en-es Panlex Dict to file__

In [87]:
# pickle it - THIS TAKES A MIN
with open(BASE + '/Data/panlex/en_it_dict.pkl','wb') as f:
    pickle.dump(bi_dict, f, pickle.HIGHEST_PROTOCOL)

In [88]:
# confirm it saved - THIS ALSO TAKES A MIN
with open(BASE + '/Data/panlex/en_it_dict.pkl','rb') as f:
    en_it_dict = pickle.load(f)

__Demo Words__

In [89]:
# demo en to it
en_it_dict['en_the'][:5]

array(['it_della', 'it_gli', 'it_i', 'it_il', 'it_la'], dtype=object)

In [90]:
# demo it to en
en_it_dict['it_ciao'][:5]

array(['en_adieu', 'en_bye-bye', 'en_bye', 'en_cheerio', 'en_ciao'], dtype=object)

### Vocabulary

In [91]:
en_it_data = Corpus(EN_IT)

In [92]:
# train multilingual Vocabulary
en_it_vocab = BilingualVocabulary(en_it_data.gen_tokens(), 
                                  languages = ('en','it'), 
                                  size = 100000)

In [93]:
# length of corpus vocabulary
en_it_vocab.size

48579

In [94]:
# transition between words in the bilingual vocab
idx1 = 3
idx2 = (en_it_vocab.size - 3) / 2 + 3
print(idx1 - 1 , en_it_vocab.index[idx1 - 1])
print(idx1, en_it_vocab.index[idx1])
print(idx2 - 1 , en_it_vocab.index[idx2 - 1])
print(idx2, en_it_vocab.index[idx2])

2 <unk>
3 en_the
24290 it_edizioni
24291 it_quarto


In [107]:
for i in range (23220,23240):
    print(en_it_vocab.index[i])

en_atari
en_10646
en_cylindrical
en_canyon
en_ionia
en_commodified
it_,
it_di
it_.
it_e
it_il
it_la
it_
it_in
it_del
it_a
it_che
it_della
it_è
it_un


In [108]:
# overlap with dictionary vocabulary
len([w for w in en_it_vocab.types if w in en_it_dict])

24176

In [109]:
# take a look at orphaned words
print_orphans(en_it_vocab.types, en_it_dict)

it_spunti
it_integrali
en_manuscripts
it_giostrando
en_syro-hittite
it_l'arco
it_janue
it_[[878172]]
it_[[879051]]
it_")
it_raffiguranti
it_promettenti
it_margaria
en_western-hemisphere
it_gallizio
en_migrating
en_conraua
en_aretē
en_tukey
it_ripubblicate


__Saving the trained vocabulary__

In [110]:
from parsing import BilingualVocabulary

In [111]:
# save the index to file
with open(BASE + '/Data/vocab/en_it_10K_index.pkl','wb') as f:
    pickle.dump(en_it_vocab.index, f, pickle.HIGHEST_PROTOCOL)

In [112]:
# confirm that it can be reloaded as a dictionary
with open(BASE + '/Data/vocab/en_it_10K_index.pkl','rb') as f:
    en_it_index = pickle.load(f)

In [113]:
# confirm we can re-create the Vocab object
en_it_vocab2 = BilingualVocabulary([],languages=('en','it'))
en_it_vocab2.load_from_index(en_it_index)

In [114]:
en_it_vocab2.size

48579

In [115]:
en_it_vocab2.to_ids(['en_the', 'en_vocabulary', 'en_works'])

[3, 3979, 248]

---
# English - Japanese

### Corpus

In [11]:
# load corpus
en_ja_data = Corpus(FULL_EN_JA)

In [12]:
# corpus stats
!wc {FULL_EN_JA}

  10000000  162715052 1447070927 /home/miwamoto/en_ja_shuf.txt


### Panlex Dictionary

In [14]:
# loading english-spanish dictionary
pld = pd.read_csv(EN_JA_DICT, sep='\t', names = ['en', 'ja'], dtype=str)
en_set = set(pld.en.unique())
ja_set = set(pld.ja.unique())

In [15]:
# dictionary vocab lengths:
print('EN:', len(en_set))
print('JA:', len(ja_set))

EN: 308933
JA: 325772


In [16]:
# Create dictionary for ease of runtime translation
# WARNING this takes a sec to run
bi_dict = pld.groupby(['en'])['ja'].unique().to_dict()

In [17]:
# add other direction
# WARNING this takes another sec to run
bi_dict.update(pld.groupby(['ja'])['en'].unique().to_dict())

__Save en-es Panlex Dict to file__

In [18]:
# pickle it - THIS TAKES A MIN
with open(BASE + '/Data/panlex/en_ja_dict.pkl','wb') as f:
    pickle.dump(bi_dict, f, pickle.HIGHEST_PROTOCOL)

In [19]:
# confirm it saved - THIS ALSO TAKES A MIN
with open(BASE + '/Data/panlex/en_ja_dict.pkl','rb') as f:
    en_ja_dict = pickle.load(f)

__Demo Words__

In [20]:
# demo en to it
en_ja_dict['en_the'][:5]

array(['ja_\xe3\x82\xb6', 'ja_\xe5\x85\xb6\xe3\x81\xae'], dtype=object)

In [21]:
# demo it to en
en_ja_dict['ja_sore'][:5]

array(['en_it', 'en_that'], dtype=object)

### Vocabulary

In [22]:
en_ja_data = Corpus(FULL_EN_JA)

In [23]:
# train multilingual Vocabulary
en_ja_vocab = BilingualVocabulary(en_ja_data.gen_tokens(), 
                                  languages = ('en','ja'), 
                                  size = 100000)

In [24]:
# length of corpus vocabulary
en_ja_vocab.size

200003

In [25]:
# transition between words in the bilingual vocab
idx1 = 3
idx2 = (en_ja_vocab.size - 3) / 2 + 3
print(idx1 - 1 , en_ja_vocab.index[idx1 - 1])
print(idx1, en_ja_vocab.index[idx1])
print(idx2 - 1 , en_ja_vocab.index[idx2 - 1])
print(idx2, en_ja_vocab.index[idx2])

2 <unk>
3 en_the
100002 en_espejo
100003 ja_年


In [26]:
# overlap with dictionary vocabulary
len([w for w in en_ja_vocab.types if w in en_ja_dict])

87035

In [29]:
# take a look at orphaned words
print_orphans(en_ja_vocab.types, en_ja_dict)

ja_vanilla
en_deactivated
en_hensley
ja_タンチノープル
ja_ロズベルグ
en_sp1
ja_末松
en_intuitionistic
ja_平泉
ja_トトロ
en_dilys
en_haters
en_migrating
en_30-27
ja_静内川
en_parris
ja_山科
ja_eating
ja_オルバッツァーノ
en_time-travel


__Saving the trained vocabulary__

In [30]:
# save the index to file
with open(BASE + '/Data/vocab/en_ja_index.pkl','wb') as f:
    pickle.dump(en_ja_vocab.index, f, pickle.HIGHEST_PROTOCOL)

In [31]:
# confirm that it can be reloaded as a dictionary
with open(BASE + '/Data/vocab/en_ja_index.pkl','rb') as f:
    en_ja_index = pickle.load(f)

In [33]:
# confirm we can re-create the Vocab object
en_ja_vocab2 = BilingualVocabulary([],languages=('en','ja'))
en_ja_vocab2.load_from_index(en_ja_index)

In [34]:
en_ja_vocab2.size

200003

In [35]:
en_ja_vocab2.to_ids(['en_the', 'en_vocabulary', 'en_works'])

[3, 10672, 353]

---
# English - Dutch

### Corpus

In [37]:
# load corpus
en_nl_data = Corpus(FULL_EN_NL)

In [38]:
# corpus stats
!wc {FULL_EN_NL}

   9997288  218679772 1885752438 /home/miwamoto/en_nl_shuf.txt


### Panlex Dictionary

In [39]:
# loading english-spanish dictionary
pld = pd.read_csv(EN_NL_DICT, sep='\t', names = ['en', 'nl'], dtype=str)
en_set = set(pld.en.unique())
nl_set = set(pld.nl.unique())

In [40]:
# dictionary vocab lengths:
print('EN:', len(en_set))
print('NL:', len(nl_set))

EN: 220439
NL: 217492


In [41]:
# Create dictionary for ease of runtime translation
# WARNING this takes a sec to run
bi_dict = pld.groupby(['en'])['nl'].unique().to_dict()

In [42]:
# add other direction
# WARNING this takes another sec to run
bi_dict.update(pld.groupby(['nl'])['en'].unique().to_dict())

__Save en-es Panlex Dict to file__

In [43]:
# pickle it - THIS TAKES A MIN
with open(BASE + '/Data/panlex/en_nl_dict.pkl','wb') as f:
    pickle.dump(bi_dict, f, pickle.HIGHEST_PROTOCOL)

In [47]:
# confirm it saved - THIS ALSO TAKES A MIN
with open(BASE + '/Data/panlex/en_nl_dict.pkl','rb') as f:
    en_nl_dict = pickle.load(f)

__Demo Words__

In [48]:
# demo en to it
en_nl_dict['en_the'][:5]

array(['nl_aan_de', 'nl_aan_het', 'nl_de', 'nl_des_te', 'nl_die'], dtype=object)

In [49]:
# demo it to en
en_nl_dict['nl_hij'][:5]

array(['en_he', 'en_him', 'en_him_y', 'en_i', 'en_it'], dtype=object)

### Vocabulary

In [50]:
en_nl_data = Corpus(FULL_EN_NL)

In [51]:
# train multilingual Vocabulary
en_nl_vocab = BilingualVocabulary(en_nl_data.gen_tokens(), 
                                  languages = ('en','nl'), 
                                  size = 100000)

In [52]:
# length of corpus vocabulary
en_nl_vocab.size

200003

In [53]:
# transition between words in the bilingual vocab
idx1 = 3
idx2 = (en_nl_vocab.size - 3) / 2 + 3
print(idx1 - 1 , en_nl_vocab.index[idx1 - 1])
print(idx1, en_nl_vocab.index[idx1])
print(idx2 - 1 , en_nl_vocab.index[idx2 - 1])
print(idx2, en_nl_vocab.index[idx2])

2 <unk>
3 en_the
100002 en_espejo
100003 nl_de


In [56]:
# overlap with dictionary vocabulary
len([w for w in en_nl_vocab.types if w in en_nl_dict])

70475

In [57]:
# take a look at orphaned words
print_orphans(en_nl_vocab.types, en_nl_dict)

nl_||-||-||-||
nl_klippen
en_deactivated
nl_conescharellina
nl_sericeus
nl_meededen
nl_falcata
nl_bierman
en_sp1
en_intuitionistic
en_androids
nl_silke
en_dilys
en_haters
en_migrating
en_30-27
en_downplay
en_gravelly
nl_onderduikadres
nl_outsider


__Saving the trained vocabulary__

In [58]:
from parsing import BilingualVocabulary

In [59]:
# save the index to file
with open(BASE + '/Data/vocab/en_nl_index.pkl','wb') as f:
    pickle.dump(en_nl_vocab.index, f, pickle.HIGHEST_PROTOCOL)

In [60]:
# confirm that it can be reloaded as a dictionary
with open(BASE + '/Data/vocab/en_nl_index.pkl','rb') as f:
    en_nl_index = pickle.load(f)

In [61]:
# confirm we can re-create the Vocab object
en_nl_vocab2 = BilingualVocabulary([],languages=('en','nl'))
en_nl_vocab2.load_from_index(en_nl_index)

In [62]:
en_nl_vocab2.size

200003

In [63]:
en_nl_vocab2.to_ids(['en_the', 'en_vocabulary', 'en_works'])

[3, 10672, 353]

## Get Common Words

In [147]:
# Test returning a set of common words in a vocabulary and
# corresponding Ground Truth files

from parsing import get_common_words

# Test with
with open(BASE + '/Data/vocab/en_it_index.pkl','rb') as f:
    en_it_index = pickle.load(f)

en_it_vocab2 = BilingualVocabulary([],languages=('en','it'))
en_it_vocab2.load_from_index(en_it_index)

common_words = get_common_words(en_it_vocab2)

In [148]:
print(len(truth))

27107


