In [1]:
from util import *
from words import *
import numpy as np
import os
import json
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from keras.layers import *
from gensim.corpora import Dictionary
import multiprocessing

Using TensorFlow backend.


In [2]:
# model parameters
vocab_size = 100000
max_length = 64
batch_size = 8
embedding_size = 50
hidden = 50
input_file = './datasets/wiki_dataset/wiki_en.txt' # wiki_es.txt is the other file

# loging info
data_dir = './dumps/'
experiment_name ='en'
extra_tokens = {'<PAD>':4, '<START>':2, '<UNK>':1, '<EOS>':3}

In [3]:
folder_path = data_dir+experiment_name+"./"
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [9]:
en_dict = Dictionary.load('./datasets/wiki_dataset/wiki_en.vocab')
print(len(en_dict.token2id))
es_dict = Dictionary.load('./datasets/wiki_dataset/wiki_es.vocab')
print(len(es_dict.token2id))

2011141
2013873


In [8]:
en_dict.merge_with(es_dict)

<gensim.models.VocabTransform at 0x7f1b7c4e48d0>

In [9]:
print(len(en_dict.token2id))
print(len(es_dict.token2id))

3194833
2013873


In [10]:
combined_dict = en_dict
combined_dict.filter_extremes(keep_n=vocab_size, keep_tokens=None)
combined_dict.patch_with_special_tokens(extra_tokens)
print(len(combined_dict.token2id))
combined_dict.token2id['lol']

100004


54058

In [11]:
combined_dict.save('datasets/wiki_dataset/combined_vocab')

In [4]:
combined_dict = Dictionary.load('datasets/wiki_dataset/combined_vocab')

In [5]:
with open(input_file) as f:
    sentences = f.read().split("\n")
len(sentences)

In [141]:
def each(sentence):
    x_ = combined_dict.doc2idx(text_to_word_sequence(sentence), unknown_word_index=combined_dict.token2id['<UNK>'])
    x_.append(combined_dict.token2id['<EOS>'])
    return sequence.pad_sequences([x_], maxlen=max_length, dtype='int32', padding='post', truncating='post',value=combined_dict.token2id['<PAD>'])[0]
each(sentences[45])

In [14]:
# Why can't it share its memory. Use joblib or something thread base if RAM is a bottle-neck
pool = multiprocessing.Pool(processes=40)

In [15]:
x = pool.map(each, (sentence for sentence in sentences))

In [16]:
pool.close()

In [22]:
x = np.array(x)

In [23]:
%%time
np.save('datasets/wiki_dataset/wiki_en_100004_vocab.npy', x)

CPU times: user 0 ns, sys: 28.6 s, total: 28.6 s
Wall time: 31.3 s


In [6]:
x_en = np.load('datasets/wiki_dataset/wiki_en_100004_vocab.npy')
print(x_en.shape)
x_es = np.load('datasets/wiki_dataset/wiki_es_100004_vocab.npy')
print(x_es.shape)
x_en_es = np.load('datasets/wiki_dataset/twitter_en_es_100004_vocab.npy')
print(x_en_es.shape)

(102435443, 64)
(22362530, 64)
(36866, 64)


In [7]:
x_en_es_combined = np.concatenate([x_en_es for _ in range(min(x_es.shape[0],x_en.shape[0])//x_en_es.shape[0] -1)], axis=0)

In [8]:
numwords = len(combined_dict.token2id)

In [138]:
def batch_x(x):
    batches = []
    for i in range(1,len(x)-batch_size,batch_size):
        batches.append(x[i:i+batch_size])
    batches = np.array(batches)
    print(batches.shape)
    return batches

In [98]:
np.random.shuffle(x_en)
np.random.shuffle(x_es)

In [97]:
np.random.shuffle(x_en_es_combined)

In [140]:
# x_en_batched = batch_x(x_en)
# x_es_batched = batch_x(x_es)
# x_en_es_batched = batch_x(x_en_es)

In [46]:
def decode(seq):
    return ' '.join(combined_dict[id_] for id_ in seq)
# print('Finished loading. ', sum([b.shape[0] for b in batches]), ' sentences')

In [185]:
input_ = Input(shape=(None, ))
embedding = Embedding(numwords, 100, input_length=None)
embedded = embedding(input_)

decoder_lstm = LSTM(100, return_sequences=True)
h = decoder_lstm(embedded)

fromhidden = Dense(numwords, activation='linear')
out = TimeDistributed(fromhidden)(h)

model = Model(input_, out)

opt = keras.optimizers.Adam()
lss = sparse_loss

model.compile(opt, lss)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_38 (InputLayer)        (None, None)              0         
_________________________________________________________________
embedding_14 (Embedding)     (None, None, 100)         10000400  
_________________________________________________________________
lstm_37 (LSTM)               (None, None, 100)         80400     
_________________________________________________________________
time_distributed_48 (TimeDis (None, None, 100004)      10100404  
Total params: 20,181,204
Trainable params: 20,181,204
Non-trainable params: 0
_________________________________________________________________


In [53]:
class GiretTwoCell(keras.layers.Layer):

    def __init__(self, cell_1 , cell_2 , nHidden , **kwargs):
        self.cell_1 = cell_1
        self.cell_2 = cell_2
        self.nHidden = nHidden
        self.state_size = [nHidden,nHidden]
        super(GiretTwoCell, self).__init__(**kwargs)

    def build(self, input_shape):
        
        nHidden = self.nHidden
        
        input_shape_n = ( input_shape[0] , input_shape[1]- 2 )
#         print "pp", input_shape_n
        
#         self.cell_1.build(input_shape_n)
#         self.cell_2.build(input_shape_n)
        
        self._trainable_weights += ( self.cell_1.trainable_weights )
        self._trainable_weights += ( self.cell_2.trainable_weights )
        
        self._non_trainable_weights += (  self.cell_1.non_trainable_weights )
        self._non_trainable_weights += (  self.cell_2.non_trainable_weights )
        
        self.built = True

    def call(self, inputs, states):
        
        nHidden = self.nHidden
        
        gate_val_1 = inputs[ : , 0:1]
        gate_val_2 = inputs[ : , 1:2]
        
        inputs  = inputs[ : , 2: ]
                
        gate_val_1 = K.repeat_elements(gate_val_1 , nHidden , -1 ) # shape # bs , hidden
        gate_val_2 = K.repeat_elements(gate_val_2 , nHidden , -1 ) # shape # bs , hidden
        
        _ , [h1 , c1 ]  = self.cell_1.call( inputs , states )
        _ , [h2 , c2 ]  = self.cell_2.call( inputs , states )
        
        h = gate_val_1*h1 + gate_val_2*h2  + (1 - gate_val_1 -  gate_val_2 )*states[0]
        c = gate_val_1*c1 + gate_val_2*c2  + (1 - gate_val_1 -  gate_val_2 )*states[1]
        
        return h, [h , c ]

In [100]:
embed = Embedding(numwords, embedding_size, mask_zero=True)

rnn_en = LSTM(hidden, return_sequences=True)
rnn_hi = LSTM(hidden , return_sequences=True)

       
# en
inp_en = Input((None, ))
x = embed(inp_en)
x = rnn_en(x)
out_en = TimeDistributed(Dense(numwords, activation='linear'))(x)


# hi
inp_hi = Input((None, ))
x = embed(inp_hi)
x = rnn_hi( x )
out_hi = TimeDistributed(Dense(numwords, activation='linear'))(x)


cell_combined = GiretTwoCell(rnn_hi.cell , rnn_en.cell , hidden)

        
inp_enhi = Input((None, ))
x = embed(inp_enhi )

x_att = x
x_att = Bidirectional(LSTM(32 , return_sequences=True))( x )
bider_h = x_att 
x_att = TimeDistributed(Dense(3, activation='softmax') )(x_att)
x_att = Lambda(lambda x : x[... , 1: ])(x_att)

x = Concatenate(-1)([x_att , x ])

x =  RNN(cell_combined , return_sequences=True)(x)
out_enhi = TimeDistributed(Dense(numwords , activation='linear'))(x)
        
model = Model( [inp_hi , inp_en , inp_enhi  ] , [ out_hi , out_en , out_enhi ] ) 

In [109]:
opt = keras.optimizers.Adam(clipvalue=0.4)
lss = sparse_loss

model.compile(loss=sparse_loss, optimizer=opt)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_20 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_19 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, None, 50)     5000200     input_19[0][0]                   
                                                                 input_20[0][0]                   
          

In [110]:
tb = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=16, write_graph=False, write_grads=False, write_images=False, embeddings_freq=1, embeddings_layer_names=['embedding_6'], embeddings_metadata=None, embeddings_data=None, update_freq='batch')

In [142]:
model.load_weights(folder_path+"weights")

In [None]:
n = x_en_es_combined.shape[0]
model.fit(
    [x_en[:n,:-1], x_es[:n,:-1], x_en_es_combined[:n,:-1]],
    [x_en[0:n,1:], x_es[:n,1:], x_en_es_combined[:n,1:]], 
    batch_size=16, 
    epochs=1, 
    validation_split=0.1,
    callbacks=[tb],
    shuffle=True
)

Train on 20073537 samples, validate on 2230393 samples
Epoch 1/1
    4192/20073537 [..............................] - ETA: 301:36:11 - loss: 3.9649 - time_distributed_26_loss: 1.6498 - time_distributed_25_loss: 1.7732 - time_distributed_28_loss: 0.5418

In [115]:
model.save_weights(folder_path+"weights")
with open(folder_path+"model.json",'w') as f:
    f.write(str(model.to_json()))

## Text Generation

In [116]:
def sample_logits(preds, temperature=1.0):
    """
    Sample an index from a logit vector.

    :param preds:
    :param temperature:
    :return:
    """
    preds = np.asarray(preds).astype('float64')

    if temperature == 0.0:
        return np.argmax(preds)

    preds = preds / temperature
    preds = preds - logsumexp(preds)

    choice = np.random.choice(len(preds), 1, p=np.exp(preds))

    return choice

In [137]:
def generate_seq(model : Model, seed, size, temperature=1.0):

    ls = seed.shape[0]

    # Due to the way Keras RNNs work, we feed the model a complete sequence each time. At first it's just the seed,
    # zero-padded to the right length. With each iteration we sample and set the next character.

    tokens = np.concatenate([seed, np.zeros(size - ls)])
    token_combined = [tokens, tokens, tokens]

    for i in range(ls, size):

        probs_ = model.predict(token_combined)
        # Extract the i-th probability vector and sample an index from it
        for j, probs in enumerate(probs_):
            probs = probs.reshape((1,probs.shape[0],-1))
            next_token = sample_logits(probs[0, i-1, :], temperature=temperature)
            token_combined[j][i] = next_token

    return [tokens.astype('int') for tokens in token_combined]

In [138]:
seed = x_en[-2][:2]
# seed = np.insert(seed, 0, 2)
a = generate_seq(model, seed, 50, temperature=0.5)

Importing `logsumexp` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.logsumexp` instead.
  from ipykernel import kernelapp as app


In [139]:
print(decode(a[0]))
print(decode(a[1]))
print(decode(a[2]))

meanwhile madhu stereotypes <UNK> <UNK> <UNK> <UNK> <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
meanwhile madhu stereotypes <UNK> <UNK> <UNK> <UNK> <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
meanwhile madhu stereotypes <UNK> <UNK> <UNK> <UNK> <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# Notes

## Data Pre-processing Note

### Genism
https://www.kdnuggets.com/2017/11/building-wikipedia-text-corpus-nlp.html uses [WikiCorpus of gensim](https://radimrehurek.com/gensim/corpora/wikicorpus.html). I wasn't able to find a way to preseve line ending and that sucks.

### wikiextractor
Using a modified version of https://tiefenauer.github.io/blog/wiki-n-gram-lm/. Uses https://github.com/attardi/wikiextractor in its first step followed by a bash and weird script.
The script being

```
result=$(find ./cleaned_wiki/ -name '*bz2' -exec bzcat {} \+ \
        | pv \
        | tee >(    sed 's/<[^>]*>//g' \
                  | sed 's|["'\''„“‚‘]||g' \
                  | python3 ./wiki_cleaner2.py es >> wiki_es2.txt \
               ) \
        | grep -e "<doc" \
        | wc -l);

```

## news

cat news.es.all | ../normalize-punctuation.perl -l es | ../scripts/tokenizer.perl -l es -no-escape -threads 40 > new.es.all.tok
cat news*es.shuffled > news.es.all 


## Making Vocab

```
from gensim.corpora import WikiCorpus
wiki = WikiCorpus('datasets/wiki_dataset/raw/eswiki-latest-pages-articles-multistream.xml.bz2')
from gensim.corpora import MmCorpus
MmCorpus.serialize('datasets/wiki_dataset/wiki_es.mm', wiki)
wiki.dictionary.save('datasets/wiki_dataset/wiki_es.vocab')
```