# Preprocessing for NMT Model

This example was taken from the wonderful Cutting Edge Deep Learning for Coders course, as taught by Jeremy Howard (http://course.fast.ai/part2.html). The course is now live and I encourage you to check it out.


In [1]:
%matplotlib inline
import importlib
#import sutils; importlib.reload(sutils)
from sutils import *

import keras
import gensim
import re
import pickle
import collections
import keras.backend as K

from keras_tqdm import TQDMNotebookCallback
from keras import initializers
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback, ReduceLROnPlateau, LearningRateScheduler, EarlyStopping, TensorBoard
from keras.callbacks import LambdaCallback


from recurrentshop import *
import seq2seq
from seq2seq.models import AttentionSeq2Seq,SimpleSeq2Seq, Seq2Seq

import tensorflow as tf

Using TensorFlow backend.


We will use **gensim** and **word2vec** to load our embeddings for French; the English embeddings come from pre-trained GloVe vectors.

In [2]:
from gensim.models import word2vec

In [3]:
limit_gpu_mem()

In [4]:
# Root directory that holds the raw parallel corpus files.
# NOTE(review): absolute, machine-specific location — adjust for your environment.
path = '/home/samwit/ai_data_local/neural_translation/'
# Directory for the pickled artefacts written by this notebook. It is derived from
# `path` so the two locations cannot drift apart.
dpath = path + 'translate/'

## Preparing the Corpus

We will build a limited corpus of English questions paired with their French translations.

In [5]:
# Shared prefix of the two corpus files; they differ only by their extension.
fname=path+'giga-fren.release2.fixed'
en_fname = fname+'.en'  # English side of the corpus
fr_fname = fname+'.fr'  # French side of the corpus

In [6]:
# Regexes that keep only questions. Both are anchored at the start of the line.
# Raw strings are used so that `\?` reaches the regex engine exactly as written,
# instead of relying on Python passing an unrecognised string escape through
# (which emits a warning on newer interpreters).

# English: text that starts with "Wh" and runs up to the first "?", with no
# ".", "!" or "?" in between.
re_eq = re.compile(r'^(Wh[^?.!]+\?)')
# French: any text that runs up to the first "?", with no ".", "!" or "?" before it.
re_fq = re.compile(r'^([^?.!]+\?)')

In [7]:
# Run the question regexes over the full corpus, pairing each English line with the
# French line at the same position. The work is lazy: nothing is read until `lines`
# is iterated. The files are opened inside a generator function so that both handles
# are closed once the pairs have been consumed, instead of being left open.
def _iter_question_matches(en_file, fr_file):
    """Yield ``(english_match, french_match)`` for each aligned pair of corpus lines.

    Either element is ``None`` when its line is not matched by the corresponding regex.
    """
    # NOTE(review): the files are read with the platform default text encoding.
    with open(en_file) as en_lines, open(fr_file) as fr_lines:
        for eq, fq in zip(en_lines, fr_lines):
            yield re_eq.search(eq), re_fq.search(fq)

lines = _iter_question_matches(en_fname, fr_fname)

Now we want to put them all in a list so that we can easily access them

In [8]:
# Keep only the pairs where both the English and the French line matched, storing the
# matched text of each. Building the list consumes the lazy `lines` iterable.
questions = [(e.group(), f.group()) for e,f in lines if e and f]
# Number of question pairs that survived the filter.
len(questions)

52331

In [9]:
questions[5:10]

[('What is the major aboriginal group on Vancouver Island?',
  'Quel est le groupe autochtone principal sur l’île de Vancouver?'),
 ('What are the advantages and disadvantages of using an online atlas versus a paper atlas?',
  'Quels sont les avantages et les désavantages d’utiliser un atlas en ligne comparativement à un atlas en copie papier?'),
 ('What types of land cover are associated with the colours below?',
  'À quel type de couverture des terres associez-vous les couleurs ci-dessous?'),
 ('What is the population of Canada?', 'Quelle est la population du Canada ?'),
 ('Which province is the most populated?',
  'Quelle est la province la plus peuplée ?')]

Now let's save this so we can come back to it in the future.

In [10]:
dump(questions, dpath+'questions.pkl')

Load the saved question pairs and unzip them into separate English and French sequences.

In [7]:
# Read back the object that was written to 'questions.pkl' with `dump` earlier.
questions = load(dpath+'questions.pkl')
# Transpose the sequence of (English, French) pairs into two parallel tuples.
en_qs, fr_qs = zip(*questions)

Next we need to split the questions into tokens so that we can make sequences for the model

In [8]:
# One or more consecutive spaces; used to collapse runs of spaces into a single space.
re_mult_space = re.compile(r"  *")
# A word character followed by an apostrophe (’ or ') and then another word character,
# as in "qu’est"; used to split right after the apostrophe.
re_mw_punc = re.compile(r"(\w[’'])(\w)")
# A single punctuation character that should become a token of its own.
re_punc = re.compile("([\"().,;:/_?!—])")
# A trailing 's at a word boundary, as in "what's".
re_apos = re.compile(r"(\w)'s\b")

In [9]:
def simple_toks(sent):
    """Split a sentence into a list of lower-cased tokens.

    The sentence is rewritten so that token boundaries become spaces:
    a trailing ``'s`` is detached from its word, text after an in-word
    apostrophe is split off, each punctuation mark is surrounded by spaces
    and hyphens are turned into spaces. The result is lower-cased and split
    on whitespace.
    """
    # Regex substitutions, applied in this order.
    spacing_rules = (
        (re_apos, r"\1 's"),
        (re_mw_punc, r"\1 \2"),
        (re_punc, r" \1 "),
    )
    for pattern, replacement in spacing_rules:
        sent = pattern.sub(replacement, sent)

    without_hyphens = sent.replace('-', ' ')
    single_spaced = re_mult_space.sub(' ', without_hyphens)
    return single_spaced.lower().split()

In [10]:
fr_qtoks = list(map(simple_toks, fr_qs)); fr_qtoks[:4]

[['qu’', 'est', 'ce', 'que', 'la', 'lumière', '?'],
 ['où', 'sommes', 'nous', '?'],
 ["d'", 'où', 'venons', 'nous', '?'],
 ['que', 'ferions', 'nous', 'sans', 'elle', '?']]

In [11]:
en_qtoks = list(map(simple_toks, en_qs)); en_qtoks[:4]

[['what', 'is', 'light', '?'],
 ['who', 'are', 'we', '?'],
 ['where', 'did', 'we', 'come', 'from', '?'],
 ['what', 'would', 'we', 'do', 'without', 'it', '?']]

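Now we need to convert tokens to ids so that we can create lookup tables.

We also insert the `<PAD>` token here, at index 0.

This function returns:
- ids - each sentence as a list of token ids
- vocab - the list of tokens, ordered from most to least frequent (a token's id is its position in this list)
- w2id - a dictionary for looking up the id of a token
- voc_cnt - the vocab count (number of occurrences of each token)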

In [12]:
def toks2ids(sents):
    """Map tokenised sentences to integer ids.

    Returns a 4-tuple ``(ids, vocab, w2id, voc_cnt)``:

    * ``ids``     - the sentences with every token replaced by its id
    * ``vocab``   - ``"<PAD>"`` followed by the tokens, most frequent first
    * ``w2id``    - dictionary mapping each entry of ``vocab`` to its index
    * ``voc_cnt`` - ``collections.Counter`` of token occurrences
    """
    # Count every token occurrence across all sentences.
    voc_cnt = collections.Counter()
    for sentence in sents:
        for token in sentence:
            voc_cnt[token] += 1

    # Index 0 is reserved for padding; real tokens follow by descending count.
    vocab = ["<PAD>"] + [token for token, _ in voc_cnt.most_common()]
    w2id = dict(zip(vocab, range(len(vocab))))

    ids = []
    for sentence in sents:
        ids.append([w2id[token] for token in sentence])
    return ids, vocab, w2id, voc_cnt

In [13]:
fr_ids, fr_vocab, fr_w2id, fr_counts = toks2ids(fr_qtoks)
en_ids, en_vocab, en_w2id, en_counts = toks2ids(en_qtoks)
len(en_vocab), len(fr_vocab)

(19548, 26708)

#### Sentences converted to vectors

In [14]:
print(en_ids[1])
print(en_qtoks[1])

[18, 9, 41, 1]
['who', 'are', 'we', '?']


#### The look up tables / dictionaries 

In [15]:
en_vocab[18]

'who'

In [16]:
en_w2id['who']

18

## Word Embeddings

Here we are going to make lookup tables from words to embeddings.

The GloVe embeddings used here cover 400k words with 100 dimensions each.

In [17]:

def load_glove(loc):
    """Load a pre-processed embedding set stored under the path prefix ``loc``.

    Three sibling files are read, in this order:

    * ``loc + '.dat'``       - loaded with ``load_array``
    * ``loc + '_words.pkl'`` - unpickled with ``encoding='latin1'``
    * ``loc + '_idx.pkl'``   - unpickled with ``encoding='latin1'``

    Returns a 3-tuple with the three loaded objects in that same order.
    """
    vecs = load_array(loc+'.dat')
    # Open the pickle files in `with` blocks so the handles are closed as soon as
    # loading finishes, rather than being left open.
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(loc+'_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file, encoding='latin1')
    with open(loc+'_idx.pkl', 'rb') as idx_file:
        idx = pickle.load(idx_file, encoding='latin1')
    return (vecs, words, idx)


In [18]:
en_vecs, en_wv_word, en_wv_idx = load_glove('/home/samwit/ai_data_local/embeddings/glove/6B.100d')

In [19]:
en_w2v = {w: en_vecs[en_wv_idx[w]] for w in en_wv_word}

In [20]:
n_en_vec, dim_en_vec = en_vecs.shape
dim_fr_vec = 200

In [21]:
dim_en_vec
n_en_vec

400000

In [22]:
# Load the pickled polyglot French embeddings. The file is opened in a `with` block
# so the handle is closed once unpickling is done.
# NOTE(review): `fr_wik` is not referenced again in the visible part of this notebook.
with open('/home/samwit/ai_data_local/embeddings/french/polyglot-fr.pkl', 'rb') as polyglot_file:
    fr_wik = pickle.load(polyglot_file, encoding='latin1')

The French embeddings were trained by Jean-Philippe Fauconnier

- Word vectors: http://fauconnier.github.io/index.html#wordembeddingmodels
- frWac: http://wacky.sslmit.unibo.it/doku.php?id=corpora

In [23]:
# Location of the French word2vec binary (frWac; 200 dimensions going by the file name).
w2v_path='/home/samwit/ai_data_local/embeddings/french/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin'

# Load the vectors stored in word2vec binary format.
fr_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
# NOTE(review): `KeyedVectors.vocab` was removed in gensim 4 in favour of `key_to_index`;
# confirm the installed gensim version. `fr_voc` is not used later in the visible notebook.
fr_voc = fr_model.vocab

In [24]:
def create_emb(w2v, targ_vocab, dim_vec):
    """Build an embedding matrix with one row per entry of ``targ_vocab``.

    w2v        -- lookup supporting ``w2v[word]`` and raising ``KeyError`` for unknown words
    targ_vocab -- sequence of words; row ``i`` of the result belongs to ``targ_vocab[i]``
    dim_vec    -- length of each embedding vector

    Returns an array of shape ``(len(targ_vocab), dim_vec)``. Words missing from
    ``w2v`` get a random vector drawn with ``normal(scale=0.6)``.
    """
    emb = np.zeros((len(targ_vocab), dim_vec))

    for row, token in enumerate(targ_vocab):
        try:
            vector = w2v[token]
        except KeyError:
            # No pre-trained vector for this word: fall back to random initialisation.
            vector = normal(scale=0.6, size=(dim_vec,))
        emb[row] = vector

    return emb

In [25]:
en_embs = create_emb(en_w2v, en_vocab, dim_en_vec); en_embs.shape

(19548, 100)

In [26]:
fr_embs = create_emb(fr_model, fr_vocab, dim_fr_vec); fr_embs.shape

(26708, 200)

## Data checks

In [27]:
en_lengths = collections.Counter(len(s) for s in en_ids)

### Keras pad_sequences 

In [28]:
maxlen = 30

In [29]:
en_padded = pad_sequences(en_ids, maxlen, padding="post", truncating="post")

In [30]:
fr_padded = pad_sequences(fr_ids, maxlen, padding="post", truncating="post")

In [31]:
en_padded.shape, fr_padded.shape, en_embs.shape

((52331, 30), (52331, 30), (19548, 100))

In [32]:
# Shuffle the sentence indices once and split them 90% / 10% into train and test.
# The same permutation is applied to both languages so that pairs stay aligned.
# NOTE(review): no random seed is set here, so the split differs between runs.
n = int(len(en_ids)*0.9)
idxs = np.random.permutation(len(en_ids))
# Slice the permutation first, so each padded array is indexed once per split instead
# of building a full shuffled copy twice.
train_idxs, test_idxs = idxs[:n], idxs[n:]
fr_train, fr_test = fr_padded[train_idxs], fr_padded[test_idxs]
en_train, en_test = en_padded[train_idxs], en_padded[test_idxs]

In [33]:
en_train[0]

array([   18,    52,  3697, 12345,    10,     2,  3823,  2045,   765,
           2,  1901,    31,     2, 16686,     1,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0], dtype=int32)

## Saving the data

In [34]:
# Lookup tables to persist: the English word -> id dictionary, the French and
# English vocab lists, and both embedding matrices.
look_ups = {'en_w2id':en_w2id,'fr_vocab':fr_vocab,'en_vocab':en_vocab, 'en_embs':en_embs,'fr_embs':fr_embs}
dump(look_ups, dpath+'look_ups.pkl')

In [35]:
data={'fr_train':fr_train,'en_train':en_train,'fr_test':fr_test,'en_test':en_test,}
dump(data, dpath+'nmt_data.pkl')