## Making Vocab

In [1]:
# installation
! pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/96/2f/168da118beb6eef637e5f5af955a017a0bf83cff496832aa5a6b24bb01c5/sentencepiece-0.1.81-cp35-cp35m-manylinux1_x86_64.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 1.9MB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.81


In [1]:
import sentencepiece as spm

### Vocab Size choice

- BERT uses a vocab of 30k wordpieces for English and 110k for its 102-language model
- FB models use a vocab size of 40k-50k BPE (FB paper: https://arxiv.org/pdf/1811.01136.pdf) 
- Following the experiments in the SentencePiece repo (https://github.com/google/sentencepiece/blob/master/doc/experiments.md), I'm going to use 8k to accommodate two different languages

In [None]:
# train
# Earlier run, kept for reference: WMT11 en/es text with an 8k vocabulary.
# spm.SentencePieceTrainer.Train(
#     '--input=./datasets/wmt11/en_es.txt \
#     --model_prefix=en_es_txt_word_piece \
#      --bos_id=2 --eos_id=3 --unk_id=4 --pad_id=0 \
#     --vocab_size=8000')

# Active run: trains on the wiki en/es text with a 20000-piece vocabulary
# (note this differs from the 8k mentioned in the markdown cell above).
# The special ids are fixed explicitly; pad_id=0 matches the value=0 that
# pad_sequences uses when padding the tokenised sentences later on.
# The model_prefix determines the file name loaded in the next cell
# ("wiki_en_es_txt_word_piece.model").
spm.SentencePieceTrainer.Train(
    '--input=./datasets/wiki_dataset/en_es.txt \
    --model_prefix=wiki_en_es_txt_word_piece \
     --bos_id=2 --eos_id=3 --unk_id=4 --pad_id=0 \
    --vocab_size=20000')

In [None]:
# load
# `sp` is the module-level tokenizer used by tokenise() and len(sp) below.
sp = spm.SentencePieceProcessor()
# File name is the training cell's model_prefix plus ".model".
sp.Load("wiki_en_es_txt_word_piece.model")
# "bos:eos" is an encode-time option string; presumably it makes every
# encoded sentence start with BOS and end with EOS — confirm against the
# sentencepiece Python README for the installed version.
sp.SetEncodeExtraOptions("bos:eos")

In [None]:
# for details take a look at: https://github.com/google/sentencepiece/blob/master/python/README.md
# Show the same sample sentence first as subword pieces, then as integer ids.
sample_text = "This is a test, what?.... lolol lolol"
for encode in (sp.EncodeAsPieces, sp.EncodeAsIds):
    print(encode(sample_text))

## Preprocessing File

In [4]:
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import multiprocessing

Using TensorFlow backend.


In [5]:
# Length (in token ids) that every tokenised sentence is padded or truncated
# to; passed as `maxlen` to process_file / pad_sequences below.
max_len = 32

In [6]:
def tokenise(sent):
    """Encode one sentence string into a list of SentencePiece ids.

    Uses the module-level processor ``sp``; kept as a plain top-level
    function so it can be handed to ``multiprocessing.Pool.map``.
    """
    piece_ids = sp.EncodeAsIds(sent)
    return piece_ids

In [7]:
def process_file(file_name, maxlen, tokenise=tokenise, pool=None):
    """Tokenise every line of a text file and return a padded id matrix.

    Args:
        file_name: path of a text file with one sentence per line.
        maxlen: number of ids per row; longer sentences are truncated and
            shorter ones padded (both at the end) with 0.
        tokenise: callable mapping one sentence string to a list of ids.
        pool: optional ``multiprocessing.Pool`` to reuse. When omitted a
            40-process pool is created for this call and released afterwards.

    Returns:
        int32 array of shape (number_of_lines, maxlen).

    Note:
        The file is split on "\n", so a file ending with a newline yields one
        trailing empty sentence, which is tokenised like any other line.
    """
    # Explicit encoding so the result does not depend on the machine locale.
    with open(file_name, encoding="utf-8") as f:
        sentences = f.read().split("\n")
    print("no of sentencecs: ", len(sentences))

    # Only a pool created here is ours to shut down; a caller-supplied pool
    # is left open so it can be reused for further files.
    owns_pool = pool is None
    if owns_pool:
        pool = multiprocessing.Pool(processes=40)
    try:
        answer = pool.map(tokenise, sentences)
    finally:
        if owns_pool:
            # Release the workers even if tokenisation raised.
            pool.close()
            pool.join()

    return pad_sequences(answer, maxlen=maxlen, dtype='int32', padding='post', truncating='post', value=0)

In [8]:
# One worker pool is shared by the three corpora so the processes are only
# started once. try/finally guarantees the workers are shut down (and reaped
# with join) even if one of the files is missing or tokenisation fails.
pool = multiprocessing.Pool(processes=40)
try:
    sentence_en = process_file('./datasets/wmt11/training-monolingual/europarl-v6.en', max_len, pool=pool)
    sentence_es = process_file('./datasets/wmt11/training-monolingual/europarl-v6.es', max_len, pool=pool)
    sentence_es_en = process_file('./datasets/wmt11/code_mixed_es_en.txt.tok', max_len, pool=pool)
finally:
    pool.close()
    pool.join()
del pool

no of sentencecs:  2015441
no of sentencecs:  1927758
no of sentencecs:  39317


In [5]:
## Model Language

In [None]:
import keras
from keras.layers import *
from keras.models import Model
from keras import backend as K

In [None]:
# Units of each monolingual LSTM; also passed to GiretTwoCell as its state size.
hidden = 128
# Size reported by the loaded SentencePiece processor; used as the embedding
# input dimension and as the width of every output layer.
numwords = len(sp)
# Output dimension of the shared Embedding layer.
hidden_emd_dim = 100

In [None]:
def sparse_loss(y_true, y_pred):
    """Sparse categorical cross-entropy computed on raw logits.

    ``y_pred`` is treated as unnormalised scores (``from_logits=True``),
    which matches the linear output layers used by the model in this notebook.
    """
    loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return loss

class GiretTwoCell(Layer):
    """RNN cell that blends two wrapped LSTM cells with per-step gate values.

    Each step input is expected to carry two gate values in its first two
    features, followed by the features that are forwarded to both wrapped
    cells. The new state is

        gate_1 * cell_1_state + gate_2 * cell_2_state
            + (1 - gate_1 - gate_2) * previous_state

    applied to both the hidden and the carry state.

    Args:
        cell_1: first wrapped cell; its ``call`` must return
            ``(output, [h, c])``.
        cell_2: second wrapped cell, same contract as ``cell_1``.
        nHidden: state width shared by both cells.
        **kwargs: forwarded to the base ``Layer``.
    """

    def __init__(self, cell_1 , cell_2 , nHidden , **kwargs):
        # Initialise the base layer first so attribute assignment happens on
        # a fully set-up Layer object.
        super(GiretTwoCell, self).__init__(**kwargs)
        self.cell_1 = cell_1
        self.cell_2 = cell_2
        self.nHidden = nHidden
        # [hidden state, carry state], as for an LSTM cell.
        self.state_size = [nHidden, nHidden]

    def build(self, input_shape):
        """Register the wrapped cells' weights as weights of this layer.

        Cells that were already built (e.g. taken from LSTM layers that have
        been called) are used as they are, so their weights stay shared.
        A cell that has not been built yet is built here on the step input
        shape without the two gate features.
        """
        for cell in (self.cell_1, self.cell_2):
            if not cell.built:
                cell.build((input_shape[0], input_shape[1] - 2))

        self._trainable_weights += self.cell_1.trainable_weights
        self._trainable_weights += self.cell_2.trainable_weights

        self._non_trainable_weights += self.cell_1.non_trainable_weights
        self._non_trainable_weights += self.cell_2.non_trainable_weights

        self.built = True

    def call(self, inputs, states):
        """Run both cells on one time step and mix their states.

        Args:
            inputs: step input whose features 0 and 1 are the gate values and
                whose remaining features are passed to the wrapped cells.
            states: ``[h, c]`` from the previous step.

        Returns:
            ``(h, [h, c])`` with the blended hidden and carry states.
        """
        # Broadcast each scalar gate across the hidden dimension.
        gate_1 = K.repeat_elements(inputs[:, 0:1], self.nHidden, -1)  # (batch, nHidden)
        gate_2 = K.repeat_elements(inputs[:, 1:2], self.nHidden, -1)  # (batch, nHidden)

        features = inputs[:, 2:]

        _, [h1, c1] = self.cell_1.call(features, states)
        _, [h2, c2] = self.cell_2.call(features, states)

        # Whatever weight the two gates leave over keeps the previous state.
        h = gate_1*h1 + gate_2*h2 + (1 - gate_1 - gate_2)*states[0]
        c = gate_1*c1 + gate_2*c2 + (1 - gate_1 - gate_2)*states[1]

        return h, [h, c]

In [None]:
# Three-branch language model: one branch per monolingual corpus plus a
# code-mixed branch that reuses the two monolingual LSTM cells.

# One embedding layer shared by all three branches.
embed = Embedding(numwords, hidden_emd_dim)

# One LSTM per monolingual stream. The "hi" names belong to the branch whose
# output layer is named 'es'.
rnn_en = LSTM(hidden, return_sequences=True)
rnn_hi = LSTM(hidden , return_sequences=True)


# en
inp_en = Input((None, ))
x = embed(inp_en)
x = rnn_en(x)
# Linear activation: the outputs are logits (sparse_loss uses from_logits=True).
out_en = TimeDistributed(Dense(numwords, activation='linear'), name='en')(x)


# es
inp_hi = Input((None, ))
x = embed(inp_hi)
x = rnn_hi( x )
out_hi = TimeDistributed(Dense(numwords, activation='linear'), name='es')(x)


# The combined cell receives the cell objects of the two LSTMs above, so the
# code-mixed branch runs on those same cells (cell_1 = es cell, cell_2 = en cell).
# NOTE(review): this relies on both LSTMs having been called already so that
# their cells hold weights — keep this line after the two branches.
cell_combined = GiretTwoCell(rnn_hi.cell , rnn_en.cell , hidden)


# en_es (code-mixed)
inp_enhi = Input((None, ))
x = embed(inp_enhi )

# Gate network: a bidirectional LSTM followed by a 3-way softmax per step.
# The first assignment below is immediately overwritten.
x_att = x
x_att = Bidirectional(LSTM(32 , return_sequences=True))( x )
# bider_h is not used again in the visible notebook.
bider_h = x_att 
x_att = TimeDistributed(Dense(3, activation='softmax') )(x_att)
# Drop softmax component 0 and keep components 1 and 2 as the two gate values.
x_att = Lambda(lambda x : x[... , 1: ])(x_att)

# Gates go first: GiretTwoCell reads features 0 and 1 as gates and forwards
# the rest (the embedding) to the wrapped cells.
x = Concatenate(-1)([x_att , x ])

x =  RNN(cell_combined , return_sequences=True)(x)
out_enhi = TimeDistributed(Dense(numwords , activation='linear'), name='en_es')(x)

# Input and output order is es, en, en_es — data passed to fit/predict must
# follow the same order.
model = Model( [inp_hi , inp_en , inp_enhi  ] , [ out_hi , out_en , out_enhi ] ) 

In [None]:
# The same loss is applied to all three outputs; element-wise gradient
# clipping is set on the optimizer.
lss = sparse_loss
opt = keras.optimizers.Adam(lr=0.01, clipvalue=0.4)

model.compile(optimizer=opt, loss=lss)
model.summary()

In [None]:
from keras.callbacks import TensorBoard

# The model was built as Model([inp_hi, inp_en, inp_enhi], [out_hi, out_en,
# out_enhi]), i.e. es, en, code-mixed. Each branch is a language model on its
# own corpus, so inputs and targets must come from the same array, in that order.
corpora = [sentence_es, sentence_en, sentence_es_en]

# Keras requires the same number of samples for every input and target, so
# train on as many rows as the smallest corpus provides.
n = min(len(corpus) for corpus in corpora)

# Training-curve logging; "./logs" is the callback's default directory.
tb = TensorBoard(log_dir="./logs")

# Next-token prediction: inputs drop the last id, targets drop the first.
# NOTE(review): depending on the Keras version, integer targets for a custom
# sparse loss may need a trailing axis (shape (n, T, 1)) — confirm on first run.
model.fit(
    [corpus[:n, :-1] for corpus in corpora],
    [corpus[:n, 1:] for corpus in corpora],
    batch_size=8,
    epochs=1,
    validation_split=0.1,
    callbacks=[tb],
    shuffle=True
)

In [6]:
## Text Generations

In [None]:
def sample_logits(preds, temperature=1.0):
    """Sample an index from a vector of unnormalised log-probabilities.

    Args:
        preds: 1-D sequence of logits.
        temperature: 0.0 selects the arg-max deterministically; larger
            values flatten the distribution before sampling.

    Returns:
        ``np.argmax`` result (a scalar index) when ``temperature == 0.0``,
        otherwise the length-1 array returned by ``np.random.choice``.
    """
    preds = np.asarray(preds).astype('float64')

    if temperature == 0.0:
        return np.argmax(preds)

    scaled = preds / temperature
    # Log-softmax computed with NumPy only; subtracting the maximum first
    # keeps exp() from overflowing on large logits.
    peak = np.max(scaled)
    log_probs = scaled - (peak + np.log(np.sum(np.exp(scaled - peak))))

    choice = np.random.choice(len(log_probs), 1, p=np.exp(log_probs))

    return choice
def generate_seq(model : Model, seed, size, out_num=3, temperature=1.0):
    """Generate ``out_num`` token sequences, one per model input/output pair.

    The model is called with a list of ``out_num`` arrays of shape
    (1, size) and is expected to return one array per output, indexed as
    ``[0, position, :]`` to obtain the scores for the next token.

    Args:
        model: model whose ``predict`` accepts and returns ``out_num`` arrays.
        seed: 1-D array of token ids that every sequence starts with; it
            should contain at least one id and fewer than ``size`` ids.
        size: total length of each generated sequence, seed included.
        out_num: number of sequences (must match the model's inputs/outputs).
        temperature: sampling temperature passed to ``sample_logits``.

    Returns:
        List of ``out_num`` integer arrays of length ``size``.
    """
    ls = seed.shape[0]

    # Keras RNNs are fed a complete sequence each time: start from the seed
    # zero-padded to the full length, then fill one position per iteration.
    tokens_all = [np.concatenate([seed, np.zeros(size - ls)]) for _ in range(out_num)]

    for i in range(ls, size):
        # One (1, size) batch per model input.
        batch = [tokens[None, :] for tokens in tokens_all]
        all_probs = model.predict(batch)

        # The scores at position i-1 describe the token at position i.
        for j, probs in enumerate(all_probs):
            next_token = sample_logits(probs[0, i-1, :], temperature=temperature)
            # sample_logits returns either a scalar or a length-1 array;
            # reduce both to a single element before storing it.
            tokens_all[j][i] = np.asarray(next_token).reshape(-1)[0]

    return [tokens.astype('int') for tokens in tokens_all]

In [None]:
# Seed generation with the first four token ids of one English training row.
seed = sentence_en[122][:4]
# One generated id sequence of length 50 per model input/output (3 here).
a = generate_seq(model, seed, 50, out_num=3, temperature=0.9)

In [5]:
# ! pip install pandas
import pandas as pd
from keras.utils import to_categorical

In [9]:
# Label string -> integer class id, used to build the one-hot targets below.
class2id = {'contradiction':0, 'entailment':1, 'neutral':2}
# lines=True: the file is read as one JSON record per line.
df = pd.read_json("./MultiNLO/XNLI-1.0/xnli.dev.jsonl", lines=True)

In [10]:
# Sentence pairs as two parallel arrays of strings.
x = [df['sentence1'].to_numpy(), df['sentence2'].to_numpy()]
# Build the label ids in a fresh array instead of overwriting the array
# returned by to_numpy() element by element: that array can share memory with
# `df`, so the in-place loop rewrote df['gold_label'] and made this cell fail
# with KeyError when run a second time. Unknown labels still raise KeyError.
y = np.array([class2id[label] for label in df['gold_label']], dtype='int64')
y = to_categorical(y, num_classes=3, dtype='int8')

In [13]:
from keras.layers import *
from keras.models import Sequential, Model

In [21]:
# Single softmax layer mapping a 512-dimensional (256*2) feature vector to
# three classes. Note that this rebinds `model`, replacing the language model
# defined earlier in the notebook.
model = Sequential([
    Dense(3, input_shape=(256*2,), activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])