# Dictionaries and Vocabularies
`w266 Final Project: Crosslingual Word Embeddings`

This notebook builds the Panlex translation dictionaries and the bilingual vocabularies once and saves them to disk, so that they don't have to be rebuilt over and over.

# Notebook Setup

In [3]:
# general imports
from __future__ import print_function
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# tell matplotlib not to open a new window
%matplotlib inline

# autoreload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
## Maya's paths
# BASE: root under which the pickles are saved (BASE + '/Data/panlex/...', BASE + '/Data/vocab/...').
# PROJ: project checkout holding the Panlex dictionaries (PROJ + '/XlingualEmb/data/dicts/...').
# The trailing comment on each line is an alternative path for the same setting.
BASE = '/home/mmillervedam' # alternative: '/Users/mmillervedam/Documents/MIDS/w266'
PROJ = '/home/mmillervedam/ProjectRepo' # alternative: '/Users/mmillervedam/Documents/MIDS/w266/FinalProject'

## Roseanna's paths


## Mona's local paths
#BASE = '/Users/mona/OneDrive/repos/Data' #'/home/mmillervedam/Data'
#PROJ = '/Users/mona/OneDrive/repos/final_proj/W266-Fall-2017-Final-Project'#'/home/mmillervedam/ProjectRepo'

In [11]:
# Data
# Panlex dictionaries (read with pd.read_csv as tab-separated word pairs), located under PROJ.
EN_ES_DICT = PROJ +'/XlingualEmb/data/dicts/en.es.panlex.all.processed'
EN_IT_DICT  = PROJ +'/XlingualEmb/data/dicts/en.it.panlex.all.processed'
# Full bilingual corpora (absolute paths; passed to Corpus(...) and `wc` below).
FULL_EN_ES = "/home/miwamoto/en_es_shuf.txt"
FULL_EN_IT = "/home/miwamoto/en_it_shuf.txt"

__Custom Modules__

In [46]:
from parsing import Corpus, Vocabulary, BilingualVocabulary, batch_generator

# English - Spanish

### Corpus

In [13]:
# load corpus (Corpus is imported from the project's `parsing` module)
en_es_data = Corpus(FULL_EN_ES)

In [15]:
# corpus stats: `wc` prints the line, word and byte counts of the corpus file
!wc {FULL_EN_ES}

  2676865  69458899 594018304 /home/miwamoto/en_es_shuf.txt


In [17]:
# how many batches is 15 epochs? (w/ batch size 48)
# 69458899 is the word count reported by `wc` above; `//` and `*` evaluate
# left to right, so this is (words // 48) * 15.
69458899 // 48 * 15

21705900

### Panlex Dictionary

In [23]:
# Read the tab-separated English-Spanish Panlex dictionary; both columns are kept as strings.
pld = pd.read_csv(EN_ES_DICT, sep='\t', names=['en', 'es'], dtype=str)
# Distinct entries on each side of the dictionary.
en_set = set(pld['en'].unique())
es_set = set(pld['es'].unique())

In [24]:
# dictionary vocab lengths, one line per language
for label, words in (('EN:', en_set), ('ES:', es_set)):
    print(label, len(words))

EN: 356410
ES: 346572


In [25]:
# Map each English entry to the array of its Spanish translations, so that
# translating a word at runtime is a single dict lookup.
# WARNING: this is slow on the full dictionary.
bi_dict = (
    pld.groupby(['en'])['es']
       .unique()
       .to_dict()
)

In [26]:
# Add the reverse direction (Spanish entry -> array of English translations)
# to the same dict.
# WARNING: this is slow on the full dictionary as well.
bi_dict.update(
    pld.groupby(['es'])['en']
       .unique()
       .to_dict()
)

Saving to file.

In [31]:
# Write the combined en<->es dictionary to disk - THIS TAKES A MIN
with open(BASE + '/Data/panlex/en_es_dict.pkl', 'wb') as pkl_file:
    pickle.dump(bi_dict, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [32]:
# Round-trip check: read the pickle back into a new name - THIS ALSO TAKES A MIN
with open(BASE + '/Data/panlex/en_es_dict.pkl', 'rb') as pkl_file:
    en_es_dict = pickle.load(pkl_file)

Take a look.

In [34]:
# demo en to es: first five Spanish translations stored for 'en_the'
en_es_dict['en_the'][:5]

array(['es_el', 'es_el_la_los_las', 'es_entonces', 'es_la', 'es_las'], dtype=object)

In [36]:
# demo es to en: first five English translations stored for 'es_palabra'
en_es_dict['es_palabra'][:5]

array(['en_conversation', 'en_discussion', 'en_drake', 'en_give-and-take',
       'en_language'], dtype=object)

### Vocabulary

In [48]:
# reload corpus: fresh Corpus object whose token stream feeds the vocabulary below
en_es_data = Corpus(FULL_EN_ES)

In [49]:
# Build the joint English/Spanish vocabulary from the corpus token stream.
en_es_vocab = BilingualVocabulary(
    en_es_data.gen_tokens(),
    languages=('en', 'es'),
    size=100000
)

In [51]:
# length of corpus vocabulary (total number of ids, covering both languages)
en_es_vocab.size

200003

In [92]:
# transition between words in the bilingual vocab:
# print the entries on either side of the two boundaries
# (ids below 3 -> English words, English words -> Spanish words).
idx1 = 3
# Assumes the ids from 3 onwards are split evenly between the two languages.
# Floor division keeps idx2 an integer on both Python 2 and Python 3; with `/`
# Python 3 would produce a float here.
idx2 = (en_es_vocab.size - 3) // 2 + 3
print(idx1 - 1, en_es_vocab.index[idx1 - 1])
print(idx1, en_es_vocab.index[idx1])
print(idx2 - 1, en_es_vocab.index[idx2 - 1])
print(idx2, en_es_vocab.index[idx2])

2 <unk>
3 en_the
100002 en_pinnae
100003 es_de


In [55]:
# Number of corpus-vocabulary types that also have an entry in the Panlex dictionary.
sum(1 for w in en_es_vocab.types if w in en_es_dict)

78490

__Sample of orphaned words:__

In [79]:
def print_orphans(vocab, bi_dict, limit=20):
    """Print a sample of vocabulary words that have no dictionary entry.

    Args:
        vocab: iterable of words to check.
        bi_dict: container of known words; membership is tested with `in`.
        limit: maximum number of orphaned words to print (default 20).
    """
    if limit <= 0:
        return
    n_printed = 0
    for w in vocab:
        if w in bi_dict:
            continue
        print(w)
        n_printed += 1
        # stop right after the last allowed print so `vocab` is not consumed further
        if n_printed >= limit:
            break

In [24]:
# sample of en-es vocabulary words that are missing from the en-es dictionary
print_orphans(en_es_vocab.types, en_es_dict)

en_manuscripts
en_syro-hittite
en_western-hemisphere
en_migrating
en_conraua
en_aretē
en_tukey
en_trustees
en_240
en_241
en_privileges
en_arab-israeli
en_resembles
en_koyuk
en_schüttler
en_threatened
en_22.3
en_22.2
en_animātiō
en_gamm


__Saving the trained vocabulary__

In [83]:
# Persist the vocabulary index so the vocabulary can be rebuilt without re-reading the corpus.
with open(BASE + '/Data/vocab/en_es_index.pkl', 'wb') as pkl_file:
    pickle.dump(en_es_vocab.index, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [84]:
# Round-trip check: the saved index should load back as a dictionary.
with open(BASE + '/Data/vocab/en_es_index.pkl', 'rb') as pkl_file:
    en_es_index = pickle.load(pkl_file)

In [85]:
# confirm we can re-create the Vocab object:
# construct it with an empty token list, then populate it from the index loaded above
en_es_vocab2 = BilingualVocabulary([],languages=('en','es'))
en_es_vocab2.load_from_index(en_es_index)

In [87]:
# look up a few word ids with the re-created en-es vocabulary
en_es_vocab2.to_ids(['en_the', 'en_vocabulary', 'en_works'])

[3, 10754, 353]

---
# English - Italian

### Corpus

In [18]:
# load corpus (Corpus is imported from the project's `parsing` module)
en_it_data = Corpus(FULL_EN_IT)

In [19]:
# corpus stats: `wc` prints the line, word and byte counts of the corpus file
!wc {FULL_EN_IT}

  10000000  261678882 2270315455 /home/miwamoto/en_it_shuf.txt


In [37]:
# how many batches is 15 epochs? (w/ batch size 48)
# 261678882 is the word count reported by `wc` above; `//` and `*` evaluate
# left to right, so this is (words // 48) * 15.
261678882 // 48 * 15

81774645

### Panlex Dictionary

In [38]:
# Read the tab-separated English-Italian Panlex dictionary; both columns are kept as strings.
pld = pd.read_csv(EN_IT_DICT, sep='\t', names=['en', 'it'], dtype=str)
# Distinct entries on each side of the dictionary.
en_set = set(pld['en'].unique())
it_set = set(pld['it'].unique())

In [39]:
# dictionary vocab lengths, one line per language
for label, words in (('EN:', en_set), ('IT:', it_set)):
    print(label, len(words))

EN: 266450
IT: 258641


In [40]:
# Map each English entry to the array of its Italian translations, so that
# translating a word at runtime is a single dict lookup.
# WARNING: this is slow on the full dictionary.
bi_dict = (
    pld.groupby(['en'])['it']
       .unique()
       .to_dict()
)

In [41]:
# Add the reverse direction (Italian entry -> array of English translations)
# to the same dict.
# WARNING: this is slow on the full dictionary as well.
bi_dict.update(
    pld.groupby(['it'])['en']
       .unique()
       .to_dict()
)

__Save en-it Panlex Dict to file__

In [43]:
# Write the combined en<->it dictionary to disk - THIS TAKES A MIN
with open(BASE + '/Data/panlex/en_it_dict.pkl', 'wb') as pkl_file:
    pickle.dump(bi_dict, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [56]:
# Round-trip check: read the pickle back into a new name - THIS ALSO TAKES A MIN
with open(BASE + '/Data/panlex/en_it_dict.pkl', 'rb') as pkl_file:
    en_it_dict = pickle.load(pkl_file)

__Demo Words__

In [57]:
# demo en to it: first five Italian translations stored for 'en_the'
en_it_dict['en_the'][:5]

array(['it_della', 'it_gli', 'it_i', 'it_il', 'it_la'], dtype=object)

In [58]:
# demo it to en: first five English translations stored for 'it_ciao'
en_it_dict['it_ciao'][:5]

array(['en_adieu', 'en_bye-bye', 'en_bye', 'en_cheerio', 'en_ciao'], dtype=object)

### Vocabulary

In [59]:
# reload corpus: fresh Corpus object whose token stream feeds the vocabulary below
en_it_data = Corpus(FULL_EN_IT)

In [60]:
# Build the joint English/Italian vocabulary from the corpus token stream.
en_it_vocab = BilingualVocabulary(
    en_it_data.gen_tokens(),
    languages=('en', 'it'),
    size=100000
)

In [61]:
# length of corpus vocabulary (total number of ids, covering both languages)
en_it_vocab.size

200003

In [93]:
# transition between words in the bilingual vocab:
# print the entries on either side of the two boundaries
# (ids below 3 -> English words, English words -> Italian words).
idx1 = 3
# Assumes the ids from 3 onwards are split evenly between the two languages.
# Floor division keeps idx2 an integer on both Python 2 and Python 3; with `/`
# Python 3 would produce a float here.
idx2 = (en_it_vocab.size - 3) // 2 + 3
print(idx1 - 1, en_it_vocab.index[idx1 - 1])
print(idx1, en_it_vocab.index[idx1])
print(idx2 - 1, en_it_vocab.index[idx2 - 1])
print(idx2, en_it_vocab.index[idx2])

2 <unk>
3 en_the
100002 en_goldenberg
100003 it_,


In [63]:
# Number of corpus-vocabulary types that also have an entry in the Panlex dictionary.
sum(1 for w in en_it_vocab.types if w in en_it_dict)

77593

In [80]:
# take a look at orphaned words: en-it vocabulary types with no entry in the en-it dictionary
print_orphans(en_it_vocab.types, en_it_dict)

en_deactivated
it_interview
en_hensley
it_raffiguranti
it_triathlon
it_raffigurante
en_sp1
en_intuitionistic
en_androids
en_dilys
en_haters
en_migrating
en_30-27
it_classificherà
en_parris
it_cartoons
it_amazigh
it_intonaci
en_ghrelin
it_antonescu


__Saving the trained vocabulary__

In [75]:
from parsing import BilingualVocabulary

In [67]:
# Persist the vocabulary index so the vocabulary can be rebuilt without re-reading the corpus.
with open(BASE + '/Data/vocab/en_it_index.pkl', 'wb') as pkl_file:
    pickle.dump(en_it_vocab.index, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [69]:
# Round-trip check: the saved index should load back as a dictionary.
with open(BASE + '/Data/vocab/en_it_index.pkl', 'rb') as pkl_file:
    en_it_index = pickle.load(pkl_file)

In [76]:
# confirm we can re-create the Vocab object:
# construct it with an empty token list, then populate it from the index loaded above
en_it_vocab2 = BilingualVocabulary([],languages=('en','it'))
en_it_vocab2.load_from_index(en_it_index)

In [77]:
# size of the re-created vocabulary; compare with en_it_vocab.size above
en_it_vocab2.size

200003

In [78]:
# look up a few word ids with the re-created en-it vocabulary
en_it_vocab2.to_ids(['en_the', 'en_vocabulary', 'en_works'])

[3, 10754, 353]