In [13]:
from util import *
from words import *
import numpy as np
import os
import json
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from keras.layers import *
from gensim.corpora import Dictionary
import multiprocessing
import random

In [14]:
import tensorflow as tf
# TF1-style session configuration with the GPU `allow_growth` option switched on.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
# NOTE(review): `sess` is not handed to Keras anywhere in the visible code
# (no K.set_session call) — confirm this configuration actually takes effect.
sess = tf.Session(config=config)

In [15]:
# model parameters
vocab_size = 100000  # passed as keep_n when the gensim dictionary is filtered
max_length = 64  # row length used by the gensim-based `each` encoder
batch_size = 8  # read as a global by batch_x
embedding_size = 50
hidden = 50  # NOTE: overwritten with 256 in the model-definition cell
input_file = './datasets/wiki_dataset/wiki_en.txt' # wiki_es.txt is the other file

# logging info
data_dir = './dumps/'
experiment_name ='en_en_MUSE2'
# Ids given to the special tokens when they are patched into the gensim dictionary.
extra_tokens = {'<PAD>':4, '<START>':2, '<UNK>':1, '<EOS>':3}

In [16]:
# Directory that receives the vocabulary, weights and model JSON of this experiment.
# NOTE(review): the "./" suffix produces './dumps/en_en_MUSE2./' (directory name ends
# with a dot) — probably meant "/". Left untouched because existing artefacts
# were saved under this exact path.
folder_path = data_dir+experiment_name+"./"
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [5]:
# Load the pre-built gensim dictionaries for English and Spanish Wikipedia
# and print their vocabulary sizes.
en_dict = Dictionary.load('./datasets/wiki_dataset/wiki_en.vocab')
print(len(en_dict.token2id))
es_dict = Dictionary.load('./datasets/wiki_dataset/wiki_es.vocab')
print(len(es_dict.token2id))

2011141
2013873


In [None]:
# Fold the Spanish vocabulary into en_dict in place; es_dict itself is unchanged
# (see the sizes printed in the next cell). The return value is discarded.
en_dict.merge_with(es_dict)

In [9]:
# Vocabulary sizes after the merge: en_dict now also holds the Spanish tokens.
print(len(en_dict.token2id))
print(len(es_dict.token2id))

3194833
2013873


In [10]:
# `combined_dict` is an alias of en_dict (not a copy), so the calls below mutate en_dict too.
combined_dict = en_dict
# Keep at most vocab_size tokens. NOTE(review): filter_extremes also applies its
# default no_below / no_above thresholds here — confirm that is intended.
combined_dict.filter_extremes(keep_n=vocab_size, keep_tokens=None)
# Add the special tokens with the ids given in extra_tokens.
combined_dict.patch_with_special_tokens(extra_tokens)
print(len(combined_dict.token2id))
# Spot check: id of an arbitrary token.
combined_dict.token2id['lol']

100004


54058

In [11]:
# Persist the filtered, patched dictionary so later sessions can skip the cells above.
combined_dict.save('datasets/wiki_dataset/combined_vocab')

In [4]:
# Reload the saved dictionary (entry point when the build cells above are skipped).
combined_dict = Dictionary.load('datasets/wiki_dataset/combined_vocab')

In [5]:
# Read the whole corpus into memory and treat every line as one sentence.
# A trailing newline in the file yields a final empty string in `sentences`.
with open(input_file) as f:
    sentences = f.read().split("\n")
len(sentences)

In [141]:
def each(sentence):
    """Encode one raw sentence as a fixed-length row of `combined_dict` ids.

    The sentence is tokenised with Keras' `text_to_word_sequence`, unknown words
    map to the '<UNK>' id, '<EOS>' is appended, and the row is padded with '<PAD>'
    or truncated at the end to `max_length` (truncation may drop '<EOS>').

    :param sentence: raw sentence string
    :return: 1-D int32 array of length `max_length`
    """
    token2id = combined_dict.token2id
    words = text_to_word_sequence(sentence)
    ids = combined_dict.doc2idx(words, unknown_word_index=token2id['<UNK>'])
    ids.append(token2id['<EOS>'])
    padded = sequence.pad_sequences(
        [ids],
        maxlen=max_length,
        dtype='int32',
        padding='post',
        truncating='post',
        value=token2id['<PAD>'],
    )
    return padded[0]
each(sentences[45])

In [14]:
# Worker pool for encoding sentences in parallel with `each`.
# NOTE(review): the original remark here asked why the workers cannot share memory;
# consider joblib or a thread-based pool if RAM becomes the bottleneck.
pool = multiprocessing.Pool(processes=40)

In [15]:
# Encode every sentence; the result is a list with one id row per sentence.
x = pool.map(each, (sentence for sentence in sentences))

In [16]:
# Stop accepting new work on the pool (note: the pool is not joined afterwards).
pool.close()

In [22]:
# Stack the per-sentence rows into one 2-D array. This rebinds `x`, so re-running the cell is a no-op.
x = np.array(x)

In [23]:
%%time
# Cache the encoded English corpus on disk; the file name records the vocabulary size.
np.save('datasets/wiki_dataset/wiki_en_100004_vocab.npy', x)

CPU times: user 0 ns, sys: 28.6 s, total: 28.6 s
Wall time: 31.3 s


In [6]:
# Load the cached id matrices: English, Spanish and the code-mixed Twitter set.
# Only the English file is written by this notebook; the other two must already exist.
x_en = np.load('datasets/wiki_dataset/wiki_en_100004_vocab.npy')
print(x_en.shape)
x_es = np.load('datasets/wiki_dataset/wiki_es_100004_vocab.npy')
print(x_es.shape)
x_en_es = np.load('datasets/wiki_dataset/twitter_en_es_100004_vocab.npy')
print(x_en_es.shape)

(102435443, 64)
(22362530, 64)
(36866, 64)


In [7]:
# Oversample the small code-mixed set by tiling it (k - 1) times, where k is the number
# of times it fits into the smaller of the two monolingual corpora.
x_en_es_combined = np.concatenate([x_en_es for _ in range(min(x_es.shape[0],x_en.shape[0])//x_en_es.shape[0] -1)], axis=0)

In [8]:
# Vocabulary size of the gensim dictionary (re-assigned from word2id in the model-definition cell).
numwords = len(combined_dict.token2id)

In [138]:
def batch_x(x, size=None):
    """Split `x` into consecutive, non-overlapping full batches.

    The original implementation started slicing at index 1, which silently dropped
    the first row, and its end bound could discard a final full batch. Batches now
    start at row 0 and every full batch is kept; a trailing partial batch is still
    dropped so that all batches have the same shape.

    :param x: sequence or array of rows
    :param size: rows per batch; defaults to the notebook-level `batch_size`
    :return: array of shape (n_batches, size, ...)
    :raises ValueError: if the batch size is not positive
    """
    if size is None:
        size = batch_size
    if size <= 0:
        raise ValueError("batch size must be positive")
    n_full = len(x) // size
    batches = np.array([x[k * size:(k + 1) * size] for k in range(n_full)])
    print(batches.shape)
    return batches

In [326]:
# Shuffle the rows of both monolingual corpora in place (no seed is set, so the order is not reproducible).
np.random.shuffle(x_en)
np.random.shuffle(x_es)

In [327]:
# Shuffle the tiled code-mixed rows in place.
np.random.shuffle(x_en_es_combined)

In [140]:
# x_en_batched = batch_x(x_en)
# x_es_batched = batch_x(x_es)
# x_en_es_batched = batch_x(x_en_es)

In [46]:
def decode(seq):
    """Turn a sequence of `combined_dict` ids back into a space-separated string."""
    words = [combined_dict[token_id] for token_id in seq]
    return ' '.join(words)
# print('Finished loading. ', sum([b.shape[0] for b in batches]), ' sentences')

## Loading MUSE Embeddings

In [18]:
import io
import numpy as np

In [19]:
## essential functions
def load_vec(emb_path, nmax=50000):
    """Load word embeddings from a MUSE/fastText-style text file.

    The first line of the file is a header and is not treated as a word. Four special
    tokens ('<pad>', '<unk>', '<sos>', '<eos>') take ids 0-3 and get all-zero vectors.
    Their dimensionality is taken from the header ("<count> <dim>") when it can be
    parsed, otherwise from the first vector read, and only falls back to 300 when the
    file holds no vectors. The original hard-coded 300, which made `np.vstack` fail
    for any other embedding size.

    :param emb_path: path to the embedding text file
    :param nmax: stop once the vocabulary (special tokens included) reaches this size
    :return: (embeddings, id2word, word2id), with embeddings of shape (n_words, dim)
    """
    special_tokens = ('<pad>', '<unk>', '<sos>', '<eos>')
    word2id = {token: idx for idx, token in enumerate(special_tokens)}
    vectors = []
    dim = None
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        header = next(f, '').split()
        if len(header) == 2 and header[1].isdigit():
            dim = int(header[1])
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            # '>=' so that a very small nmax still terminates the loop
            if len(word2id) >= nmax:
                break
    if dim is None:
        dim = vectors[0].shape[0] if vectors else 300
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack([np.zeros((len(special_tokens), dim))] + vectors)
    return embeddings, id2word, word2id

In [20]:
# Aligned MUSE vectors: English is the source side, Spanish the target side.
# NOTE(review): tgt_path contains a doubled slash ('..//MUSE'); harmless on POSIX but inconsistent with src_path.
src_path = '../MUSE/dumped/6pzywzu6yg/vectors-en.txt'
tgt_path = '..//MUSE/dumped/6pzywzu6yg/vectors-es.txt'
nmax = 100000  # maximum number of word embeddings to load

# These module-level vocabularies are read as globals by merge_embeddings.
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

### merge embedding

In [21]:
from keras.preprocessing import sequence
from keras.utils import to_categorical

In [22]:
def merge_embeddings(src_embeddings, tgt_embeddings, *,
                     src_i2w=None, src_w2i=None, tgt_i2w=None):
    """Merge source and target embedding tables into one shared vocabulary.

    Words found in both vocabularies keep their source id, and their vector becomes
    the mean of the source and target vectors. Words found only in the target
    vocabulary are appended after the source words in ascending target-id order.

    Changes from the original:
    * the vocabularies can be passed explicitly; when omitted, the notebook globals
      `src_id2word`, `src_word2id` and `tgt_id2word` are used as before;
    * target rows are located by id (`base + tgt_id`) and visited in sorted order,
      so the result no longer depends on dict iteration order;
    * the matrices are concatenated directly instead of via Python lists.

    :param src_embeddings: array (n_src, dim), rows indexed by source id
    :param tgt_embeddings: array (n_tgt, dim), rows indexed by target id
    :param src_i2w: optional source id -> word mapping
    :param src_w2i: optional source word -> id mapping
    :param tgt_i2w: optional target id -> word mapping
    :return: (embedding_matrix, id2word, word2id, common_words)
    """
    if src_i2w is None:
        src_i2w = src_id2word
    if src_w2i is None:
        src_w2i = src_word2id
    if tgt_i2w is None:
        tgt_i2w = tgt_id2word

    # float64 matches what the original list round-trip produced
    embedding_matrix = np.concatenate(
        [np.asarray(src_embeddings, dtype=np.float64),
         np.asarray(tgt_embeddings, dtype=np.float64)],
        axis=0,
    )

    id2word = dict(src_i2w)
    word2id = dict(src_w2i)

    base = len(id2word)  # row offset of the target block inside embedding_matrix
    next_id = base

    to_be_removed_id = []
    common_words = []

    for tgt_id in sorted(tgt_i2w):
        word = tgt_i2w[tgt_id]
        row = base + tgt_id
        if word in word2id:
            # shared word: average into the source row and drop the target row later
            shared_row = word2id[word]
            to_be_removed_id.append(row)
            common_words.append(word)
            embedding_matrix[shared_row] = (embedding_matrix[shared_row] + embedding_matrix[row]) / 2
        else:
            id2word[next_id] = word
            word2id[word] = next_id
            next_id += 1

    # After deletion the surviving target rows line up with the ids assigned above.
    embedding_matrix = np.delete(embedding_matrix, to_be_removed_id, axis=0)

    return embedding_matrix, id2word, word2id, common_words

In [23]:
# Build the shared English+Spanish embedding table and vocabulary used by the model below.
embedding_matrix, id2word, word2id, common_words = merge_embeddings(src_embeddings, tgt_embeddings)

In [24]:
# Sanity check: merged matrix shape and how many words the two vocabularies share
# (the four special tokens are part of that count because both vocabularies contain them).
print("embedding size: ", str(embedding_matrix.shape))
print("Number of common words in both the embedding %d" % len(common_words))

embedding size:  (161829, 300)
Number of common words in both the embedding 38171


In [25]:
import json
# Save the merged word -> id mapping next to the experiment's weights.
with open(folder_path+"vocab", "w") as f:
        json.dump(word2id,f)

In [26]:
def decode(seq):
    """Map ids of the merged MUSE vocabulary back to a space-separated string.

    Redefines the earlier gensim-based `decode`; after this cell runs, `decode`
    looks ids up in `id2word`.
    """
    words = [id2word[token_id] for token_id in seq]
    return ' '.join(words)

## Reading FILES

In [13]:
def each(sentence, max_length=32, tokeniser=text_to_word_sequence):
    """Encode a raw sentence as a fixed-length list of merged-vocabulary ids.

    Layout: '<sos>', the word ids ('<unk>' for out-of-vocabulary words), '<eos>',
    then '<pad>' up to `max_length`. Sentences that are too long are cut at the
    end, which also removes '<eos>' (unchanged from the original behaviour).

    The original used a bare `except:` around the vocabulary lookup, which hid
    every error (including KeyboardInterrupt); `dict.get` expresses the intended
    "fall back to <unk>" directly.

    :param sentence: raw sentence string
    :param max_length: length of the returned list
    :param tokeniser: callable turning the sentence into a list of words
    :return: list of `max_length` ints
    """
    unk_id = word2id['<unk>']
    new_sent = [word2id['<sos>']]
    new_sent.extend(word2id.get(word, unk_id) for word in tokeniser(sentence))
    new_sent.append(word2id['<eos>'])

    if len(new_sent) > max_length:
        return new_sent[:max_length]
    new_sent.extend([word2id['<pad>']] * (max_length - len(new_sent)))
    return new_sent
each("Lol this is not fun is it ? hahh what", 20)

[2, 13989, 30, 17, 36, 3829, 17, 25, 1, 123, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [14]:
def process_file(file_name, doc2ids=each, pool=None):
    """Read a text file and encode every line with `doc2ids` in parallel.

    :param file_name: path of a text file with one sentence per line
    :param doc2ids: picklable callable mapping one sentence to its id row
    :param pool: optional multiprocessing pool to reuse; the caller keeps ownership
        and must close it. When omitted, a temporary 40-process pool is created
        and is always closed and joined, even if encoding raises (the original
        leaked it on error and never joined it).
    :return: list with one encoded row per line of the file
    """
    with open(file_name) as f:
        sentences = f.read().split("\n")
    print("no of sentencecs: ", len(sentences))

    if pool is not None:
        return pool.map(doc2ids, sentences)

    own_pool = multiprocessing.Pool(processes=40)
    try:
        return own_pool.map(doc2ids, sentences)
    finally:
        own_pool.close()
        own_pool.join()

### Europarl files

In [15]:
# Encode the English, Spanish and code-mixed corpora with one shared worker pool.
# NOTE: the results are bound to sentence_en / sentence_es / sentence_es_en, which the
# training generator reads as globals.
pool = multiprocessing.Pool(processes=40)
sentence_en = process_file('./datasets/wmt11/training-monolingual/europarl-v6.en',pool=pool)
sentence_es = process_file('./datasets/wmt11/training-monolingual/europarl-v6.es',pool=pool)
sentence_es_en = process_file('./datasets/wmt11/code_mixed_es_en.txt.tok',pool=pool)
pool.close()

no of sentencecs:  2015441
no of sentencecs:  1927758
no of sentencecs:  39317


In [16]:
## WiKi Files

In [17]:
# Convert the encoded lists to 2-D arrays of shape (n_sentences, 32); 32 is `each`'s default max_length.
sentence_en = np.array(sentence_en)
print(sentence_en.shape)
sentence_es = np.array(sentence_es)
print(sentence_es.shape)
sentence_es_en = np.array(sentence_es_en)
print(sentence_es_en.shape)

(2015441, 32)
(1927758, 32)
(39317, 32)


In [18]:
# Tile the code-mixed set (k - 1) times, where k is how often it fits into the smaller
# monolingual corpus. `es_en_c` and `n` are only referenced by the commented-out model.fit call.
es_en_c = np.concatenate([sentence_es_en for _ in range(min(sentence_en.shape[0],sentence_es.shape[0])//sentence_es_en.shape[0] -1)], axis=0)
print(es_en_c.shape)
n = es_en_c.shape[0]

(1887216, 32)


## Model Definition

In [19]:
# input_ = Input(shape=(None, ))
# embedding = Embedding(numwords, 100, input_length=None)
# embedded = embedding(input_)

# decoder_lstm = LSTM(100, return_sequences=True)
# h = decoder_lstm(embedded)

# fromhidden = Dense(numwords, activation='linear')
# out = TimeDistributed(fromhidden)(h)

# model = Model(input_, out)

# opt = keras.optimizers.Adam()
# lss = sparse_loss

# model.compile(opt, lss)
# model.summary()

In [27]:
class GiretTwoCell(keras.layers.Layer):
    """RNN cell that mixes the updates of two wrapped recurrent cells with per-step gates.

    The first two features of each time-step input are read as gate values for
    `cell_1` and `cell_2`; the remaining features are passed to both cells. The new
    hidden and cell states are the gate-weighted sum of the two cells' states plus
    the previous state weighted by (1 - gate_1 - gate_2).
    """

    def __init__(self, cell_1 , cell_2 , nHidden , **kwargs):
        """Store the two wrapped cells and declare two states of size `nHidden` each."""
        self.cell_1 = cell_1
        self.cell_2 = cell_2
        self.nHidden = nHidden
        # Two states (h, c), unpacked as `[h, c]` from each wrapped cell in call().
        self.state_size = [nHidden,nHidden]
        super(GiretTwoCell, self).__init__(**kwargs)

    def build(self, input_shape):
        """Register the wrapped cells' weights on this layer.

        The wrapped cells are not built here (those calls are commented out).
        NOTE(review): this presumably relies on both cells having been built before
        this layer is — confirm, otherwise their weight lists are empty at this point.
        """
        
        nHidden = self.nHidden
        
        # Input shape without the two leading gate features; only used by the
        # commented-out build calls below.
        input_shape_n = ( input_shape[0] , input_shape[1]- 2 )
#         print "pp", input_shape_n
        
#         self.cell_1.build(input_shape_n)
#         self.cell_2.build(input_shape_n)
        
        self._trainable_weights += ( self.cell_1.trainable_weights )
        self._trainable_weights += ( self.cell_2.trainable_weights )
        
        self._non_trainable_weights += (  self.cell_1.non_trainable_weights )
        self._non_trainable_weights += (  self.cell_2.non_trainable_weights )
        
        self.built = True

    def call(self, inputs, states):
        """Run one time step.

        :param inputs: 2-D tensor whose columns 0 and 1 are the gates and whose
            remaining columns are the features fed to both cells
        :param states: previous [h, c]; the same states are given to both cells
        :return: (h, [h, c]) with the mixed hidden and cell states
        """
        
        nHidden = self.nHidden
        
        # Slices keep the column axis so the gates can be repeated below.
        gate_val_1 = inputs[ : , 0:1]
        gate_val_2 = inputs[ : , 1:2]
        
        inputs  = inputs[ : , 2: ]
                
        gate_val_1 = K.repeat_elements(gate_val_1 , nHidden , -1 ) # shape # bs , hidden
        gate_val_2 = K.repeat_elements(gate_val_2 , nHidden , -1 ) # shape # bs , hidden
        
        # The cells' own outputs are discarded; only their new states are used.
        _ , [h1 , c1 ]  = self.cell_1.call( inputs , states )
        _ , [h2 , c2 ]  = self.cell_2.call( inputs , states )
        
        # Whatever gate mass is left over carries the previous state forward.
        h = gate_val_1*h1 + gate_val_2*h2  + (1 - gate_val_1 -  gate_val_2 )*states[0]
        c = gate_val_1*c1 + gate_val_2*c2  + (1 - gate_val_1 -  gate_val_2 )*states[1]
        
        return h, [h , c ]

In [28]:
# Overrides the `hidden = 50` set in the parameter cell.
hidden = 256
numwords = len(word2id)
hidden_emd_dim = 300
# One embedding layer shared by all three branches, initialised from the merged
# MUSE matrix and left trainable.
embed = Embedding(numwords,
                     hidden_emd_dim,
                     weights=[embedding_matrix],trainable=True)

# NOTE: the variable is called rnn_hi but the layer is named 'es_lstm'.
rnn_en = LSTM(hidden, return_sequences=True, name='en_lstm', recurrent_dropout=0.3, dropout=0.3)
rnn_hi = LSTM(hidden , return_sequences=True, name='es_lstm', recurrent_dropout=0.3, dropout=0.3)

       
# en
inp_en = Input((None, ))
x = embed(inp_en)
x = rnn_en(x)
out_en = TimeDistributed(Dense(numwords, activation='linear'), name='en')(x)


# es
inp_hi = Input((None, ))
x = embed(inp_hi)
x = rnn_hi( x )
out_hi = TimeDistributed(Dense(numwords, activation='linear'), name='es')(x)


# The mixed branch reuses the cell objects of the two monolingual LSTMs
# (cell_1 = 'es_lstm' cell, cell_2 = 'en_lstm' cell).
cell_combined = GiretTwoCell(rnn_hi.cell , rnn_en.cell , hidden)

        
inp_enhi = Input((None, ))
x = embed(inp_enhi )

# The next assignment is immediately overwritten by the Bidirectional output.
x_att = x
x_att = Bidirectional(LSTM(32 , return_sequences=True, recurrent_dropout=0.3, dropout=0.3))( x )
bider_h = x_att 
# 3-way softmax per time step; dropping channel 0 leaves the two gate values.
x_att = TimeDistributed(Dense(3, activation='softmax') )(x_att)
x_att = Lambda(lambda x : x[... , 1: ])(x_att)

# Gates go first so that GiretTwoCell finds them in columns 0 and 1 of its input.
x = Concatenate(-1)([x_att , x ])

x =  RNN(cell_combined , return_sequences=True)(x)
out_enhi = TimeDistributed(Dense(numwords , activation='linear'), name='en_es')(x)
        
# NOTE(review): inputs/outputs are ordered [es, en, mixed] here, while batch_generator
# yields [sentence_en, sentence_es, mixed] — English data therefore feeds the branch
# named 'es_lstm' and vice versa. Confirm whether this swap is intended.
model = Model( [inp_hi , inp_en , inp_enhi  ] , [ out_hi , out_en , out_enhi ] ) 

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [29]:
# `keras` and `sparse_loss` are not imported by name in the visible cells;
# presumably they come from the star imports at the top — TODO confirm.
opt = keras.optimizers.Adam()
lss = sparse_loss  # unused: compile() below is given sparse_loss directly

# The same loss is applied to each of the three outputs.
model.compile(loss=sparse_loss, optimizer=opt)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    48548700    input_1[0][0]                    
                                                                 input_2[0][0]                    
          

In [23]:
# TensorBoard logging after every batch into ./logsMUSE2.
tb = keras.callbacks.TensorBoard(log_dir='./logsMUSE2', histogram_freq=0, batch_size=8, write_graph=True, write_grads=True, write_images=True, update_freq='batch')

In [30]:
# Resume from previously saved weights; this fails on a fresh run where the file does not exist yet.
model.load_weights(folder_path+"weights")

In [25]:
def batch_generator(batch_size=8):
    """Yield endless (inputs, targets) batches for the three-branch language model.

    Each batch holds `batch_size` randomly drawn rows (with replacement) from the
    English corpus, the Spanish corpus and a third source. The third source is the
    code-mixed corpus with probability 0.6, otherwise English or Spanish with
    probability 0.2 each. Inputs are the rows without their last token; targets
    are the rows shifted left by one token.

    Fix: row indices are now drawn from the full range [0, n_rows). The original
    upper bound `n_rows - batch_size` excluded the last `batch_size` rows of every
    corpus, although indices are sampled independently and no slicing is involved.

    NOTE(review): batches are ordered [English, Spanish, mixed], while the model is
    built with inputs [inp_hi, inp_en, inp_enhi]. The order is left as is to stay
    compatible with existing weights — confirm whether the swap is intended.

    :param batch_size: number of rows per source in every batch
    :return: generator of (list of 3 input arrays, list of 3 target arrays)
    """
    while True:
        n1 = np.random.randint(0, sentence_en.shape[0], batch_size)
        n2 = np.random.randint(0, sentence_es.shape[0], batch_size)

        # Two independent draws: 0.6 mixed, then a 50/50 split of the remaining 0.4.
        if random.random() <= 0.6:
            es_en_temp = sentence_es_en
        elif random.random() <= 0.5:
            es_en_temp = sentence_en
        else:
            es_en_temp = sentence_es

        n3 = np.random.randint(0, es_en_temp.shape[0], batch_size)

        x = [sentence_en[n1, :-1], sentence_es[n2, :-1], es_en_temp[n3, :-1]]
        y = [sentence_en[n1, 1:], sentence_es[n2, 1:], es_en_temp[n3, 1:]]

        yield x, y

In [26]:
# Continue training from epoch 15 up to epoch 20 (weights were loaded above).
# One "epoch" is sized as one pass over the code-mixed corpus at batch size 8;
# the literal 8 here must match the batch size given to batch_generator.
train_gen = batch_generator(8)
model.fit_generator(
    generator=train_gen,
    steps_per_epoch=sentence_es_en.shape[0] // 8, 
    epochs=20, 
    callbacks=[tb], 
    initial_epoch=15
)

# model.fit( 
#     [sentence_en[:n,:-1], sentence_en[:n,:-1], es_en_c[:n,:-1]],
#     [sentence_en[0:n,1:], sentence_es[:n,1:], es_en_c[:n,1:]], 
#     batch_size=8, 
#     epochs=1, 
#     callbacks=[tb],
#     shuffle=True
# )
# model.fit(
#     [x_en[:n,:-1], x_es[:n,:-1]],
#     [x_en[0:n,1:], x_es[:n,1:]]1, 
#     batch_size=16, 
#     epochs=1, 
#     validation_split=0.1,
#     callbacks=[tb],
#     shuffle=True
# )

Instructions for updating:
Use tf.cast instead.
Epoch 16/20
Epoch 17/20
Epoch 18/20

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fed9420e4a8>

In [27]:
# Persist the trained weights and the architecture (as JSON) in the experiment folder.
model.save_weights(folder_path+"weights")
with open(folder_path+"model.json",'w') as f:
    f.write(str(model.to_json()))

# Evaluation

- General statistics

## SentEval
https://github.com/facebookresearch/SentEval

In [44]:
# ! pip3 install torchvision
# ! pip3 install sklearn

In [31]:
def each(sentence, max_length=32, tokeniser=text_to_word_sequence):
    """Encode an already tokenised sentence as a fixed-length list of vocabulary ids.

    This evaluation variant redefines the earlier `each`: `sentence` is iterated
    directly and `tokeniser` is ignored (kept only so the signature stays
    compatible). Passing a plain string would therefore encode it character by
    character.

    Layout: '<sos>', the word ids ('<unk>' for out-of-vocabulary words), '<eos>',
    then '<pad>' up to `max_length`; longer inputs are cut at the end, which also
    removes '<eos>'.

    The original bare `except:` around the lookup is replaced by `dict.get`, so
    unrelated errors are no longer silently mapped to '<unk>'.

    :param sentence: iterable of word strings
    :param max_length: length of the returned list
    :param tokeniser: unused
    :return: list of `max_length` ints
    """
    unk_id = word2id['<unk>']
    new_sent = [word2id['<sos>']]
    new_sent.extend(word2id.get(word, unk_id) for word in sentence)
    new_sent.append(word2id['<eos>'])

    if len(new_sent) > max_length:
        return new_sent[:max_length]
    new_sent.extend([word2id['<pad>']] * (max_length - len(new_sent)))
    return new_sent

In [32]:
# Sub-model exposing the per-time-step output of the layer named 'rnn_1'.
# NOTE(review): 'rnn_1' looks like an auto-generated layer name (presumably the mixed-branch
# RNN); it may differ if the layers are created in another order — consider naming it explicitly.
sent_embedding_model = Model(inputs=model.input, outputs=model.get_layer('rnn_1').output)

In [33]:
def get_sent_embeddings(sents):
    """Return one embedding per sentence by mean-pooling the RNN outputs over time.

    Every sentence is encoded with `each`, the same encoded batch is fed to all
    three model inputs, and the per-step outputs of `sent_embedding_model` are
    averaged along the time axis (padding positions included).

    :param sents: iterable of tokenised sentences
    :return: array with one row per sentence
    """
    encoded = [each(sent) for sent in sents]
    step_outputs = sent_embedding_model.predict([encoded, encoded, encoded], batch_size=64)
    return np.mean(step_outputs, axis=1)

In [34]:
# Set paths: location of the SentEval checkout and of its downloaded task data
PATH_TO_SENTEVAL = './SentEval/'
PATH_TO_DATA = './SentEval/data/'

# import SentEval from the local checkout rather than from an installed package
import sys
sys.path.insert(0, PATH_TO_SENTEVAL)
import senteval

In [35]:
# SentEval prepare and batcher
def prepare(params, samples):
    """SentEval `prepare` hook; nothing to set up, so it does nothing."""
    return None

def batcher(params, batch):
    """SentEval `batcher` hook: embed a batch of sentences (`params` is ignored)."""
    embeddings = get_sent_embeddings(batch)
    return embeddings

In [36]:
# Set params for SentEval: task data location, no PyTorch classifier, and a small
# classifier configuration (nhid=0) with few epochs to keep evaluation fast.
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False}
params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 32,
                                 'tenacity': 3, 'epoch_size': 4}

In [37]:
# Evaluation engine wired to the batcher/prepare hooks defined above.
se = senteval.engine.SE(params_senteval, batcher, prepare)

In [38]:
# transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
#                       'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
#                       'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
#                       'Length', 'WordContent', 'Depth', 'TopConstituents',
#                       'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
#                       'OddManOut', 'CoordinationInversion']

In [43]:
# Tasks to evaluate in the next cell; the full list is kept commented out in the cell above.
transfer_tasks = ['TREC']

In [44]:
# Run the selected tasks; `results` maps each task name to its metrics.
results = se.eval(transfer_tasks)
results





{'TREC': {'acc': 67.8, 'devacc': 58.73, 'ndev': 5452, 'ntest': 500}}

In [42]:
# Output of an earlier run (execution count 42) with a larger task list; the pasted
# copy below is the complete version of the truncated output.
results

{'MR': {'acc': 65.61, 'devacc': 65.59, 'ndev': 10662, 'ntest': 10662},
 'SICKEntailment': {'acc': 72.74, 'devacc': 73.8, 'ndev': 500, 'ntest': 4927},
 'SICKRelatedness': {'devpearson': 0.6288833057276259,
  'mse': 0.6256310523376608,
  'ndev': 500,
  'ntest': 4927,
  'pearson': 0.6283092600845445,
  'spearman': 0.5837184862936838,
  'yhat': array([3.67965652, 3.8552741 , 1.12607261, ..., 3.38653953, 3.89661515,
         4.70715554])},
 'STS12': {'MSRpar': {'nsamples': 750,
   'pearson': (0.14258749810529842, 8.912221970582458e-05),
   'spearman': SpearmanrResult(correlation=0.15110999162593566, pvalue=3.248632461976632e-05)},
  'MSRvid': {'nsamples': 750,
   'pearson': (0.4007830218587774, 2.6189184388944674e-30),
   'spearman': SpearmanrResult(correlation=0.42664203634904674, pvalue=1.5750695641919273e-34)},
  'SMTeuroparl': {'nsamples': 459,
   'pearson': (0.3271387330066187, 6.558578218961659e-13),
   'spearman': SpearmanrResult(correlation=0.44585313264930493, pvalue=8.423250166894

```
{'MR': {'acc': 65.61, 'devacc': 65.59, 'ndev': 10662, 'ntest': 10662},
 'SICKEntailment': {'acc': 72.74, 'devacc': 73.8, 'ndev': 500, 'ntest': 4927},
 'SICKRelatedness': {'devpearson': 0.6288833057276259,
  'mse': 0.6256310523376608,
  'ndev': 500,
  'ntest': 4927,
  'pearson': 0.6283092600845445,
  'spearman': 0.5837184862936838,
  'yhat': array([3.67965652, 3.8552741 , 1.12607261, ..., 3.38653953, 3.89661515,
         4.70715554])},
 'STS12': {'MSRpar': {'nsamples': 750,
   'pearson': (0.14258749810529842, 8.912221970582458e-05),
   'spearman': SpearmanrResult(correlation=0.15110999162593566, pvalue=3.248632461976632e-05)},
  'MSRvid': {'nsamples': 750,
   'pearson': (0.4007830218587774, 2.6189184388944674e-30),
   'spearman': SpearmanrResult(correlation=0.42664203634904674, pvalue=1.5750695641919273e-34)},
  'SMTeuroparl': {'nsamples': 459,
   'pearson': (0.3271387330066187, 6.558578218961659e-13),
   'spearman': SpearmanrResult(correlation=0.44585313264930493, pvalue=8.423250166894389e-24)},
  'all': {'pearson': {'mean': 0.2958716812971679, 'wmean': 0.3028944887220307},
   'spearman': {'mean': 0.3631602533857541, 'wmean': 0.36154137754353355}},
  'surprise.OnWN': {'nsamples': 750,
   'pearson': (0.4010845876707513, 2.349634411380406e-30),
   'spearman': SpearmanrResult(correlation=0.483258535961669, pvalue=3.785359493165831e-45)},
  'surprise.SMTnews': {'nsamples': 399,
   'pearson': (0.2077645658443937, 2.8793575626173877e-05),
   'spearman': SpearmanrResult(correlation=0.30893757034281394, pvalue=2.852474352467182e-10)}},
 'STS13': {'FNWN': {'nsamples': 189,
   'pearson': (0.1796998687694542, 0.013351592089388827),
   'spearman': SpearmanrResult(correlation=0.17312087397497752, pvalue=0.0172074164275323)},
  'OnWN': {'nsamples': 561,
   'pearson': (-0.036391665423154564, 0.38961542391537196),
   'spearman': SpearmanrResult(correlation=-0.0064082359237449225, pvalue=0.8796247798086678)},
  'all': {'pearson': {'mean': 0.1506315417021541,
    'wmean': 0.16332491147677275},
   'spearman': {'mean': 0.15573074065761325, 'wmean': 0.1696563418461701}},
  'headlines': {'nsamples': 750,
   'pearson': (0.30858642176016265, 5.202433717123146e-18),
   'spearman': SpearmanrResult(correlation=0.30047958392160706, pvalue=4.087419263459101e-17)}},
 'STS14': {'OnWN': {'nsamples': 750,
   'pearson': (0.13495201475831842, 0.00021004588531773239),
   'spearman': SpearmanrResult(correlation=0.19518669302333042, pvalue=7.10871057435418e-08)},
  'all': {'pearson': {'mean': 0.23573767456033798,
    'wmean': 0.2615492418576007},
   'spearman': {'mean': 0.2741601729177288, 'wmean': 0.292553611332301}},
  'deft-forum': {'nsamples': 450,
   'pearson': (0.156784942801156, 0.0008458976964767222),
   'spearman': SpearmanrResult(correlation=0.18466330729597857, pvalue=8.136498539192537e-05)},
  'deft-news': {'nsamples': 300,
   'pearson': (0.07327643492260316, 0.20566038816830512),
   'spearman': SpearmanrResult(correlation=0.18054609654412698, pvalue=0.0016898349432055404)},
  'headlines': {'nsamples': 750,
   'pearson': (0.26463587403452904, 1.7451281547738147e-13),
   'spearman': SpearmanrResult(correlation=0.245732305648832, pvalue=8.895765565765384e-12)},
  'images': {'nsamples': 750,
   'pearson': (0.4466387586578476, 4.70476518566636e-38),
   'spearman': SpearmanrResult(correlation=0.46627739473892865, pvalue=9.440809367485879e-42)},
  'tweet-news': {'nsamples': 750,
   'pearson': (0.33813802218757344, 1.6208217071906041e-21),
   'spearman': SpearmanrResult(correlation=0.3725552402551761, pvalue=4.164134071189017e-26)}},
 'STS15': {'all': {'pearson': {'mean': 0.2724451880320972,
    'wmean': 0.31685872675093085},
   'spearman': {'mean': 0.2882838350353617, 'wmean': 0.3327150079550635}},
  'answers-forums': {'nsamples': 375,
   'pearson': (0.039706422734896954, 0.4432914403188235),
   'spearman': SpearmanrResult(correlation=0.04068902993621121, pvalue=0.43208160190150313)},
  'answers-students': {'nsamples': 750,
   'pearson': (0.28705987824827633, 1.0775132175080352e-15),
   'spearman': SpearmanrResult(correlation=0.30579921985230657, pvalue=1.0645513662148402e-17)},
  'belief': {'nsamples': 375,
   'pearson': (0.14987564357862884, 0.0036248415408096798),
   'spearman': SpearmanrResult(correlation=0.1804292567768978, pvalue=0.0004461019865200121)},
  'headlines': {'nsamples': 750,
   'pearson': (0.3818721603396246, 1.8961210851164007e-27),
   'spearman': SpearmanrResult(correlation=0.3751911032601324, pvalue=1.7553466470148097e-26)},
  'images': {'nsamples': 750,
   'pearson': (0.5037118352590594, 1.6789112333633647e-49),
   'spearman': SpearmanrResult(correlation=0.5393105653512604, pvalue=8.059509148311837e-58)}},
 'STS16': {'all': {'pearson': {'mean': 0.3037491915973627,
    'wmean': 0.31286456501095683},
   'spearman': {'mean': 0.3748289458274702, 'wmean': 0.3827874432860581}},
  'answer-answer': {'nsamples': 254,
   'pearson': (0.33687184761736494, 3.7108326715175343e-08),
   'spearman': SpearmanrResult(correlation=0.3881588272050183, pvalue=1.4670713101796024e-10)},
  'headlines': {'nsamples': 249,
   'pearson': (0.37639387167679833, 8.426999310924058e-10),
   'spearman': SpearmanrResult(correlation=0.3761729728472852, pvalue=8.635789491144841e-10)},
  'plagiarism': {'nsamples': 230,
   'pearson': (0.32884426285695534, 3.3498363949391107e-07),
   'spearman': SpearmanrResult(correlation=0.47740807184303585, pvalue=1.702272150989612e-14)},
  'postediting': {'nsamples': 244,
   'pearson': (0.4719643243971815, 6.13050772417937e-15),
   'spearman': SpearmanrResult(correlation=0.5642863914496445, pvalue=6.455918799404908e-22)},
  'question-question': {'nsamples': 209,
   'pearson': (0.00467165143851349, 0.9464761414626301),
   'spearman': SpearmanrResult(correlation=0.06811846579236705, pvalue=0.3270811925573087)}},
 'STSBenchmark': {'devpearson': 0.5512091403813993,
  'mse': 2.0149316274007862,
  'ndev': 1500,
  'ntest': 1379,
  'pearson': 0.49412586914630235,
  'spearman': 0.47672368959625055,
  'yhat': array([2.34865983, 2.34609645, 2.41522208, ..., 3.80854183, 3.47457961,
         3.54439748])}}
```

## MultiNLI
https://www.kaggle.com/takahirokubo0/multinli-dataset-analysis

More code mixed: http://mt-archive.info/EMNLP-2008-Solorio.pdf
Miami Dataset
http://bangortalk.org.uk/chats/
https://github.com/SeedlingsBabylab/parse_clan2
https://github.com/SeedlingsBabylab/clancomments

## Text Generation

In [62]:
def sample_logits(preds, temperature=1.0):
    """
    Draw a token index from a vector of unnormalised log-probabilities (logits).

    :param preds: 1-D sequence of logits
    :param temperature: divisor applied to the logits before normalising; 0.0 selects
        the arg-max deterministically
    :return: the arg-max index for temperature 0.0, otherwise a length-1 array holding
        the sampled index
    """
    logits = np.asarray(preds).astype('float64')

    # Greedy decoding: no sampling at all.
    if temperature == 0.0:
        return np.argmax(logits)

    scaled = logits / temperature
    # Normalise in log space, then exponentiate to get probabilities.
    log_probs = scaled - logsumexp(scaled)
    probabilities = np.exp(log_probs)

    return np.random.choice(len(logits), 1, p=probabilities)

In [63]:
def generate_seq(model : Model, seed, size, out_num=3, temperature=1.0):
    """
    Generate one sequence per model output head, all starting from the same seed.

    :param model: The complete multi-input / multi-output RNN language model
    :param seed: 1-D array with the first few token ids of the sequence
    :param size: The total length of each generated sequence (seed included)
    :param out_num: Number of sequences kept in parallel; one is fed to each model input,
        and sequence j is extended using the model's j-th output
    :param temperature: Sampling temperature. 1.0 samples according to the predicted
        distribution, lower values favour the most likely words, higher values make
        unlikely words more frequent, and 0.0 is greedy decoding.
    :return: A list of `out_num` integer arrays of length `size`
    """

    seed_len = seed.shape[0]

    # Keras RNNs are fed the complete sequence at every step: each buffer starts as the
    # seed followed by zeros, and one more position is filled in per iteration.
    tokens_all = [np.concatenate([seed, np.zeros(size - seed_len)]) for _ in range(out_num)]

    for pos in range(seed_len, size):

        # One (1, size) batch per model input.
        model_inputs = [tokens_all[k][None, :] for k in range(out_num)]
        all_probs = model.predict(model_inputs)

        # The prediction made at position pos-1 determines the token at position pos.
        for k, probs in enumerate(all_probs):
            tokens_all[k][pos] = util.sample_logits(probs[0, pos-1, :], temperature=temperature)

    return [tokens.astype('int') for tokens in tokens_all]

In [64]:
# Seed with only the first token of one encoded Spanish sentence (the '<sos>' id that
# `each` prepends), then generate 25-token sequences from all three heads.
seed = sentence_es[600][:1]
# seed = np.insert(seed, 0, 2)
a = generate_seq(model, seed, 25, out_num=3, temperature=0.9)

In [65]:
# One decoded sample per model output, in the model's output order [out_hi, out_en, out_enhi].
print(decode(a[0]))
print()
print(decode(a[1]))
print()
print(decode(a[2]))

<sos> i welcome the rapporteur <unk> d <unk> <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

<sos> por lo tanto sólo me gustaría entender que de esta razón se trata de un tema que cabe decir a los problemas medioambientales en

<sos> now that unfortunately has buy up using drugs as many nature many rights including anti self competition could help simplify imports against each state


# Notes

## Data Pre-processing Note

### Gensim
https://www.kdnuggets.com/2017/11/building-wikipedia-text-corpus-nlp.html uses [WikiCorpus of gensim](https://radimrehurek.com/gensim/corpora/wikicorpus.html). I wasn't able to find a way to preserve line endings with it, which is a problem because this notebook splits the corpus into sentences on newlines.

### wikiextractor
Using a modified version of https://tiefenauer.github.io/blog/wiki-n-gram-lm/. It uses https://github.com/attardi/wikiextractor as its first step, followed by a bash pipeline and a custom Python cleaning script.
The bash pipeline is:

```
result=$(find ./cleaned_wiki/ -name '*bz2' -exec bzcat {} \+ \
        | pv \
        | tee >(    sed 's/<[^>]*>//g' \
                  | sed 's|["'\''„“‚‘]||g' \
                  | python3 ./wiki_cleaner2.py es >> wiki_es2.txt \
               ) \
        | grep -e "<doc" \
        | wc -l);

```

## news

cat news.es.all | ../normalize-punctuation.perl -l es | ../scripts/tokenizer.perl -l es -no-escape -threads 40 > new.es.all.tok
cat news*es.shuffled > news.es.all 


## Making Vocab

```
from gensim.corpora import WikiCorpus
wiki = WikiCorpus('datasets/wiki_dataset/raw/eswiki-latest-pages-articles-multistream.xml.bz2')
from gensim.corpora import MmCorpus
MmCorpus.serialize('datasets/wiki_dataset/wiki_es.mm', wiki)
wiki.dictionary.save('datasets/wiki_dataset/wiki_es.vocab')
```

In [235]:
def generate_seq(model : Model, seed, size, temperature=1.0):
    """
    Generate a single sequence with a two-input model, reading only its first output.

    This definition replaces the `out_num` variant defined earlier in the notebook when
    the cells are run top to bottom.

    :param model: The complete RNN language model (fed the same tokens on two inputs)
    :param seed: 1-D array with the first few token ids of the sequence
    :param size: The total length of the generated sequence (seed included)
    :param temperature: Sampling temperature. 1.0 samples according to the predicted
        distribution, lower values favour the most likely words, higher values make
        unlikely words more frequent, and 0.0 is greedy decoding.
    :return: A list of `size` integers representing the sampled sentence
    """

    seed_len = seed.shape[0]

    # Keras RNNs are fed the complete sequence at every step: the buffer starts as the
    # seed followed by zeros, and one more position is filled in per iteration.
    tokens = np.concatenate([seed, np.zeros(size - seed_len)])

    for pos in range(seed_len, size):

        batch = tokens[None,:]
        probs = model.predict([batch, tokens[None,:]])

        # The first output's prediction at position pos-1 determines the token at pos.
        tokens[pos] = util.sample_logits(probs[0][0, pos-1, :], temperature=temperature)

    return [int(t) for t in tokens]

In [236]:
# Generate 60 tokens from the first 10 ids of one gensim-encoded English Wikipedia row.
# NOTE(review): execution count 235 — this cell belongs to an earlier two-input model session.
gen = generate_seq(model, x_en[21][:10], 60, temperature=0.9)

In [237]:
# The recorded output uses upper-case '<UNK>'/'<EOS>'/'<PAD>' tokens, so this presumably ran
# with the gensim-based `decode` (combined_dict), not the MUSE one — TODO confirm.
decode(gen)

'<UNK> southern portion <UNK> <UNK> county has <UNK> strong history <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> times <UNK> any or birds lakers transposition <UNK> <UNK> gerolamo <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> his <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> detected games protection <UNK> music <UNK> <UNK> an <UNK> num <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'