In [56]:
from util import *
from words import *
import numpy as np
import os
import json
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from gensim.corpora import Dictionary
from keras.layers import *
import multiprocessing

In [54]:
# Model / data hyper-parameters
vocab_size = 100000  # keep_n for Dictionary.filter_extremes; the special tokens are added on top of this
max_length = 64  # every encoded sentence is padded / truncated to this many ids
batch_size = 64  # number of sentences per training batch
input_file = './datasets/code_mixed_es_en_tweets.txt' # wiki_es.txt is the other file
embedding_size = 100  # output dimension of the Embedding layer
hidden = 100  # number of units in each LSTM

# Logging / output locations
data_dir = './dumps/'
experiment_name ='en_es'
# Reserved ids for the special tokens; passed to Dictionary.patch_with_special_tokens.
extra_tokens = {'<PAD>':4, '<START>':2, '<UNK>':1, '<EOS>':3}

In [29]:
# Per-experiment output directory (model weights and JSON are written here).
# The empty last component makes os.path.join end the path with a separator,
# because later cells build file names with plain string concatenation
# (folder_path + "weights"). The previous `experiment_name + "./"` produced a
# directory literally named "en_es." instead of "en_es".
folder_path = os.path.join(data_dir, experiment_name, "")
os.makedirs(folder_path, exist_ok=True)

In [4]:
# Load the pre-built English and Spanish Wikipedia vocabularies and report
# how many distinct tokens each one contains (English first, then Spanish).
en_dict = Dictionary.load('./datasets/wiki_dataset/wiki_en.vocab')
es_dict = Dictionary.load('./datasets/wiki_dataset/wiki_es.vocab')
print(len(en_dict.token2id), len(es_dict.token2id), sep="\n")

2011141
2013873


In [5]:
# Merge the Spanish vocabulary into the English one. en_dict is modified in
# place (its size grows in the next cell while es_dict stays unchanged); the
# returned VocabTransform object is only displayed, not kept.
en_dict.merge_with(es_dict)

<gensim.models.VocabTransform at 0x7fdbcc031710>

In [6]:
# Vocabulary sizes after the merge: en_dict now holds the union, es_dict is untouched.
print(len(en_dict.token2id), len(es_dict.token2id), sep="\n")

3194833
2013873


In [7]:
# NOTE: combined_dict is an alias of en_dict (no copy is made), so the calls
# below also modify en_dict in place and re-running this cell is not idempotent.
combined_dict = en_dict
# Trim the merged vocabulary to at most vocab_size tokens.
# NOTE(review): no_below / no_above are left at gensim's defaults, which
# presumably filter rare / very frequent tokens as well — confirm this is intended.
combined_dict.filter_extremes(keep_n=vocab_size, keep_tokens=None)
# Give the special tokens the reserved ids from extra_tokens; the size printed
# below (100004) is vocab_size plus the four special tokens.
combined_dict.patch_with_special_tokens(extra_tokens)
print(len(combined_dict.token2id))
# Spot check: id assigned to an arbitrary token.
combined_dict.token2id['lol']

100004


54057

In [8]:
# Persist the filtered vocabulary, including the special tokens, to disk.
combined_dict.save('datasets/wiki_dataset/combined_vocab')

In [30]:
# Read the corpus, one sentence (tweet) per line.
# The encoding is given explicitly so that accented characters and emoji are
# decoded the same way on every platform instead of depending on the locale.
# split("\n") is kept on purpose: it only splits on newlines (str.splitlines
# would also split on other Unicode line separators), and it yields a trailing
# empty string when the file ends with a newline.
with open(input_file, encoding='utf-8') as f:
    sentences = f.read().split("\n")
len(sentences)

36866

In [32]:
# ! pip install git+https://github.com/erikavaris/tokenizer.git

In [37]:
from tokenizer import tokenizer

# Tweet-aware tokenizer. With every "preserve_*" option switched off, the
# example below comes back lower-cased, without the @handle and the URL, and
# with the '#' stripped from the hashtag (see the output of this cell).
T = tokenizer.TweetTokenizer(
    preserve_case=False,
    preserve_handles=False,
    preserve_hashes=False,
    regularize=True,
    preserve_len=False,
    preserve_emoji=False,
    preserve_url=False,
)

# Quick smoke test on a made-up tweet.
tweet = "Hey @NLPer! This is a #NLProc tweet :-D http://www.somelink.com"
tokens = T.tokenize(tweet)
tokens

['hey', '!', 'this', 'is', 'a', 'nlproc', 'tweet', ':-D']

In [43]:
def each(sentence):
    """Encode one raw sentence as a fixed-length vector of vocabulary ids.

    The sentence is tokenized with the tweet tokenizer ``T``, re-split with
    Keras' ``text_to_word_sequence``, mapped to ids through ``combined_dict``
    (unknown words become ``<UNK>``), terminated with ``<EOS>`` and finally
    padded / truncated at the end to ``max_length`` ids using ``<PAD>``.

    Args:
        sentence: raw text of a single sentence.

    Returns:
        1-D int32 array of length ``max_length``.
    """
    token2id = combined_dict.token2id
    words = text_to_word_sequence(' '.join(T.tokenize(sentence)))
    ids = combined_dict.doc2idx(words, unknown_word_index=token2id['<UNK>'])
    ids.append(token2id['<EOS>'])
    # pad_sequences works on a batch, so wrap the single sequence in a list
    # and unwrap the single row afterwards.
    padded = sequence.pad_sequences(
        [ids],
        maxlen=max_length,
        dtype='int32',
        padding='post',
        truncating='post',
        value=token2id['<PAD>'],
    )
    return padded[0]
each(sentences[50])

array([    1, 43740,  5827, 79058, 27455, 13075,     1, 21502, 27455,
        5778,     1, 47109, 76985,     1,     1,     1,     1,     1,
           3,     4,     4,     4,     4,     4,     4,     4,     4,
           4,     4,     4,     4,     4,     4,     4,     4,     4,
           4,     4,     4,     4,     4,     4,     4,     4,     4,
           4,     4,     4,     4,     4,     4,     4,     4,     4,
           4,     4,     4,     4,     4,     4,     4,     4,     4,
           4], dtype=int32)

In [44]:
# Worker pool used to encode the sentences in parallel.
# TODO: the worker processes apparently do not share the vocabulary's memory;
# if RAM becomes the bottleneck, switch to joblib or a thread-based pool.
pool = multiprocessing.Pool(processes=40)

In [45]:
# Encode every sentence in parallel; the results come back in input order.
x = pool.map(each, sentences)

In [46]:
# No more work will be submitted to the pool.
# NOTE(review): there is no pool.join() afterwards, so this cell does not
# wait for the worker processes to exit.
pool.close()

In [47]:
# Sanity-check the encoded corpus: longest encoded sentence and vocabulary size.
x_max_len = max(map(len, x))
numwords = len(combined_dict.token2id)
print('max sequence length ', x_max_len)
print(numwords, 'distinct words')

max sequence length  64
100004 distinct words


In [48]:
# Cut the encoded sentences into consecutive full batches of batch_size rows.
# The range starts at 0 (the previous start of 1 silently skipped the first
# sentence) and its end is len(x) - batch_size + 1 so that the last full batch
# is kept when len(x) is an exact multiple of batch_size. A trailing partial
# batch is still dropped so that every batch has the same shape.
batches = np.array([
    x[start:start + batch_size]
    for start in range(0, len(x) - batch_size + 1, batch_size)
])
batches.shape

(576, 64, 64)

In [49]:
def decode(seq):
    """Turn a sequence of vocabulary ids back into a space-separated string.

    Args:
        seq: iterable of ids known to ``combined_dict``.

    Returns:
        The corresponding tokens joined by single spaces.
    """
    words = [combined_dict[token_id] for token_id in seq]
    return ' '.join(words)
print('Finished loading. ', sum(b.shape[0] for b in batches), ' sentences')

Finished loading.  36864  sentences


In [21]:
import numpy as np

In [50]:
# Stack the list of per-sentence id arrays returned by pool.map into one array.
# NOTE: this rebinds x, so the cell is not idempotent with respect to the type of x.
x = np.array(x)

In [52]:
%%time
# Save the encoded corpus. The "100004" in the file name matches the
# vocabulary size (numwords) printed earlier.
np.save('datasets/wiki_dataset/twitter_en_es_100004_vocab.npy', x)

CPU times: user 4 ms, sys: 8 ms, total: 12 ms
Wall time: 12.7 ms


In [None]:
# Baseline language model: embedding -> LSTM -> per-timestep projection onto
# the vocabulary. Layer sizes come from the config cell (embedding_size,
# hidden) instead of repeating the literal 100, so the config stays the
# single source of truth.
input_ = Input(shape=(None, ))  # variable-length sequences of token ids
embedding = Embedding(numwords, embedding_size, input_length=None)
embedded = embedding(input_)

decoder_lstm = LSTM(hidden, return_sequences=True)
h = decoder_lstm(embedded)

# Linear activation: the model outputs unnormalised scores per token.
# NOTE(review): sparse_loss is not defined in this notebook (presumably it
# comes from `util`) and is assumed to expect such raw scores — confirm.
fromhidden = Dense(numwords, activation='linear')
out = TimeDistributed(fromhidden)(h)

model = Model(input_, out)

opt = keras.optimizers.Adam()
lss = sparse_loss

model.compile(opt, lss)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 100)         10000400  
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 100)         80400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 100004)      10100404  
Total params: 20,181,204
Trainable params: 20,181,204
Non-trainable params: 0
_________________________________________________________________


In [61]:
def train(epochs = 10):
    """Train the global ``model`` on ``batches`` and print samples after every epoch.

    Each batch of token ids is turned into an (input, target) pair by
    prepending the ``<START>`` id to the input and appending the ``<PAD>`` id
    to the target, so that the target at position t is the input token at
    position t + 1. Both ids are looked up in ``combined_dict`` so they always
    agree with the vocabulary. The previous hard-coded 1 and 0 were wrong here:
    1 is ``<UNK>`` and 0 is an ordinary word.

    Relies on names that must already be in scope and are not defined in this
    cell: ``model`` (compiled), ``batches``, ``combined_dict``, ``decode``,
    ``tqdm``, ``random``, ``CHECK`` and ``generate_seq``.

    Args:
        epochs: number of passes over ``batches``.
    """
    start_id = combined_dict.token2id['<START>']
    pad_id = combined_dict.token2id['<PAD>']

    instances_seen = 0
    for epoch in range(epochs):

        for batch in tqdm(batches):
            n = batch.shape[0]

            start_col = np.full((n, 1), start_id, dtype=batch.dtype)
            pad_col = np.full((n, 1), pad_id, dtype=batch.dtype)
            batch_shifted = np.concatenate([start_col, batch], axis=1)  # prepend start symbol
            batch_out = np.concatenate([batch, pad_col], axis=1)        # append pad symbol

            # The target gets a trailing axis of size 1 (one class index per timestep).
            loss = model.train_on_batch(batch_shifted, batch_out[:, :, None])

            instances_seen += n
            # Hook for scalar logging, e.g.:
            # tbw.add_scalar('lm/batch-loss', float(loss), instances_seen)

        # Show samples for some sentences from random batches
        for temp in [0.0, 0.9, 1, 1.1, 1.2]:
            print('### TEMP ', temp)
            for i in range(CHECK):
                b = random.choice(batches)

                # Up to the first 20 tokens of the batch's first sentence;
                # slicing already copes with shorter rows.
                seed = b[0, :20]

                seed = np.insert(seed, 0, start_id)
                gen = generate_seq(model, seed,  60, temperature=temp)

                print('*** [', decode(seed), '] ', decode(gen[len(seed):]))
train()

  0%|          | 0/576 [00:00<?, ?it/s]


RuntimeError: You must compile a model before training/testing. Use `model.compile(optimizer, loss)`.

In [None]:
# Write the weights of whatever `model` currently refers to into the experiment folder.
model.save_weights(folder_path+"weights")

In [None]:
# Store the model architecture as JSON next to the weights.
with open(folder_path+"model.json",'w') as f:
    f.write(str(model.to_json()))

# Notes

## Data Pre-processing Note

### Gensim
https://www.kdnuggets.com/2017/11/building-wikipedia-text-corpus-nlp.html uses [gensim's WikiCorpus](https://radimrehurek.com/gensim/corpora/wikicorpus.html). I wasn't able to find a way to preserve line endings with it, which is a significant limitation.

### wikiextractor
Using a modified version of https://tiefenauer.github.io/blog/wiki-n-gram-lm/. It uses https://github.com/attardi/wikiextractor as its first step, followed by a bash pipeline and a custom Python cleaning script.
The script is:

```
result=$(find ./cleaned_wiki/ -name '*bz2' -exec bzcat {} \+ \
        | pv \
        | tee >(    sed 's/<[^>]*>//g' \
                  | sed 's|["'\''„“‚‘]||g' \
                  | python3 ./wiki_cleaner2.py es >> wiki_es2.txt \
               ) \
        | grep -e "<doc" \
        | wc -l);

```

## news

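Note: run the second command first; it creates the `news.es.all` file that the first command reads.

```
cat news.es.all | ../normalize-punctuation.perl -l es | ../scripts/tokenizer.perl -l es -no-escape -threads 40 > new.es.all.tok
cat news*es.shuffled > news.es.all
```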


## Making Vocab

```
from gensim.corpora import WikiCorpus
wiki = WikiCorpus('datasets/wiki_dataset/raw/eswiki-latest-pages-articles-multistream.xml.bz2')
from gensim.corpora import MmCorpus
MmCorpus.serialize('datasets/wiki_dataset/wiki_es.mm', wiki)
wiki.dictionary.save('datasets/wiki_dataset/wiki_es.vocab')
```

In [53]:
class GiretTwoCell(keras.layers.Layer):
    """RNN cell that blends the state updates of two wrapped cells with per-step gates.

    The first two features of every timestep input are read as the gate values
    for ``cell_1`` and ``cell_2``; the remaining features are the actual input
    passed to both cells. Both cells start from the same previous state, and
    the new state is

        gate_1 * new_1 + gate_2 * new_2 + (1 - gate_1 - gate_2) * previous

    applied to the hidden state ``h`` and the carry state ``c`` alike.

    Args:
        cell_1: cell whose ``call(inputs, states)`` returns ``(output, [h, c])``.
        cell_2: second cell with the same interface.
        nHidden: size of the ``h`` and ``c`` states.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, cell_1 , cell_2 , nHidden , **kwargs):
        self.cell_1 = cell_1
        self.cell_2 = cell_2
        self.nHidden = nHidden
        # Two states (h and c), each of width nHidden.
        self.state_size = [nHidden,nHidden]
        super(GiretTwoCell, self).__init__(**kwargs)

    def build(self, input_shape):
        """Expose the wrapped cells' weights as this layer's own weights.

        The wrapped cells are not built here, so whatever weights they have
        when this method runs are the ones that get registered.
        NOTE(review): this presumably requires both cells to be built
        beforehand (e.g. taken from LSTM layers that were already called) —
        confirm.
        """
        self._trainable_weights += ( self.cell_1.trainable_weights )
        self._trainable_weights += ( self.cell_2.trainable_weights )

        self._non_trainable_weights += (  self.cell_1.non_trainable_weights )
        self._non_trainable_weights += (  self.cell_2.non_trainable_weights )

        self.built = True

    def call(self, inputs, states):
        """Run one timestep.

        Args:
            inputs: timestep input; columns 0 and 1 hold the two gate values,
                the remaining columns are fed to the wrapped cells.
            states: previous ``[h, c]``.

        Returns:
            ``(h, [h, c])`` — the blended hidden state as output plus the new states.
        """
        nHidden = self.nHidden

        # Broadcast each scalar gate across the hidden dimension.
        gate_val_1 = K.repeat_elements(inputs[ : , 0:1] , nHidden , -1 ) # shape # bs , hidden
        gate_val_2 = K.repeat_elements(inputs[ : , 1:2] , nHidden , -1 ) # shape # bs , hidden

        cell_inputs = inputs[ : , 2: ]

        _ , [h1 , c1 ]  = self.cell_1.call( cell_inputs , states )
        _ , [h2 , c2 ]  = self.cell_2.call( cell_inputs , states )

        # Share of the previous state that is carried over unchanged.
        gate_keep = 1 - gate_val_1 - gate_val_2

        h = gate_val_1*h1 + gate_val_2*h2  + gate_keep*states[0]
        c = gate_val_1*c1 + gate_val_2*c2  + gate_keep*states[1]

        return h, [h , c ]

In [60]:
# Three-branch model: two monolingual LSTM language models plus a code-mixed
# branch that mixes the two LSTM cells through GiretTwoCell.
#
# The vocabulary dimension uses numwords (the dictionary size including the
# special tokens) rather than vocab_size: patch_with_special_tokens leaves the
# dictionary with vocab_size + 4 entries, so ids go up to numwords - 1 and
# would be out of range for layers sized with vocab_size.
#
# Intermediate tensors get their own names so that the encoded data array `x`
# from the earlier cells is not overwritten.
#
# NOTE(review): mask_zero=True masks id 0, but extra_tokens maps <PAD> to 4 —
# confirm which id should be treated as padding.
embed = Embedding(numwords, embedding_size, mask_zero=True)

rnn_en = LSTM(hidden, return_sequences=True)
rnn_hi = LSTM(hidden , return_sequences=True)


# en
inp_en = Input((None , ))
h_en = rnn_en(embed(inp_en))
out_en = TimeDistributed(Dense(numwords, activation='softmax'))(h_en)


# hi
inp_hi = Input((None, ))
h_hi = rnn_hi(embed(inp_hi))
out_hi = TimeDistributed(Dense(numwords, activation='softmax'))(h_hi)


# The combined cell reuses the cells of the two LSTMs above, in this order:
# cell_1 = rnn_hi.cell, cell_2 = rnn_en.cell.
cell_combined = GiretTwoCell(rnn_hi.cell , rnn_en.cell , hidden)


inp_enhi = Input((None, ))
emb_enhi = embed(inp_enhi )

# Gate network: a BiLSTM followed by a 3-way softmax per timestep. The first
# probability is dropped and the remaining two are used as the gate values.
bider_h = Bidirectional(LSTM(32 , return_sequences=True))( emb_enhi )
x_att = TimeDistributed( Dense(3, activation='softmax') )(bider_h)
x_att = Lambda(lambda t : t[... , 1: ])(x_att)

# GiretTwoCell reads the gates from the first two features, so they go first.
gated_input = Concatenate(-1)([x_att , emb_enhi ])

h_enhi =  RNN(cell_combined , return_sequences=True )( gated_input )
out_enhi = TimeDistributed(Dense( numwords , activation='softmax'))(h_enhi)

# NOTE(review): this rebinds `model` to a new, uncompiled three-input model;
# it must be compiled before it can be trained.
model = Model( [inp_hi , inp_en , inp_enhi  ] , [ out_hi , out_en , out_enhi ] ) 

In [None]:
model.