In [1]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
tf.enable_eager_execution()

In [2]:
import numpy as np

In [3]:
tf.set_random_seed(27)
DEBUG_MODE = True

In [4]:
TRAIN_FILE = "./gensen/data/corpora/allnli.train.txt.clean.noblank"
EN_FILE = "./english.tok"
DE_FILE = "./german.tok"

#Const. Parsing FILES
CPT_X_FILE = "./tree_data/en.txt.tok.out"
CPT_Y_FILE  = "./tree_data/pt.out"

#Vocab Files
COMMON_VOCAB_FILE = "./words.txt"
GERMAN_VOCAB_FILE = "./german_words.txt"
TREE_VOCAB_FILE = "./tree_words.txt"

In [2]:
import Pipeline as Pipeline

ModuleNotFoundError: No module named 'Pipeline'

In [6]:
import time
import os
logging = tf.logging
logging.set_verbosity(logging.INFO)

def log_msg(msg):
    logging.info( '{}: {}'.format(time.ctime(),msg ) )
    

In [7]:
class Decoder(tf.keras.Model):
    def __init__(self,V):
        super(Decoder, self).__init__()
        self.cell = tf.nn.rnn_cell.GRUCell(num_units=512)
        self.dense_layer = tf.keras.layers.Dense(V, activation=None)
        
    def frwrd(self, context_vector,max_len):
        
        state = self.cell.zero_state(batch_size=int(context_vector.shape[0]), dtype=tf.float32)
        predictions = []
        
        for i in range(max_len): 
            output, state = self.cell(context_vector,state)
            predictions.append(output)
            
        outputs = tf.stack(predictions,axis=1)
        outputs = self.dense_layer(outputs)
        
        return outputs 

In [8]:
class RNN(tf.keras.Model):
    def __init__(self,V):
        super(RNN, self).__init__()
        self.EMBED_DIM = 256
        self.W = tfe.Variable(tf.random_uniform(minval=-1.0, maxval=1.0, shape=[V, self.EMBED_DIM]))
        self.cell = tf.nn.rnn_cell.GRUCell(num_units=512)
        #self.cell = tf.contrib.cudnn_rnn.CudnnGRU(num_layers=1, num_units=512)
        
    def frwrd(self, X, num_words,train=True,dropout=0.3):
        u = tf.nn.embedding_lookup(self.W, X)
        u_time = tf.unstack(u, axis=1)
        #define layer
        outputs, final_state = tf.nn.static_rnn(cell=self.cell, inputs=u_time, sequence_length=num_words, dtype=tf.float32)
        
        if train:
            final_state = tf.nn.dropout(final_state, keep_prob=1-dropout)
            
        return final_state #tf.stack(outputs,axis=-1)

In [9]:
class MLP(tf.keras.Model):
    def __init__(self):
        super(MLP, self).__init__()
        self.NUM_CLASSES = 4
        self.hidden_layer = tf.keras.layers.Dense(256, activation=tf.nn.relu)
        self.output_layer = tf.keras.layers.Dense(4, activation=None)
        
    def frwrd(self,X):
        hidden_output = self.hidden_layer(X)
        output = self.output_layer(hidden_output)
        return output

In [10]:
class Model(tf.keras.Model):
    
    def __init__(self, V,VG ,VT):
        """
        Accepts vocab size 
        V - English Vocab Size (30002) 
        VG - German Vocab Size (30002)
        VT - Tree Vocab Size (76)
        """
        super(Model, self).__init__()
        self.rnn = RNN(V)
        self.MLP = MLP()
        self.decoder = Decoder(VG)
        self.tree_decoder = Decoder(VT)
        
    def frwrd_pass_nli(self, X_Hyp,L_H, X_prem,L_P,train,dropout):
        u = self.rnn.frwrd(X_Hyp,L_H,train,dropout)
        v = self.rnn.frwrd(X_prem,L_P,train,dropout)
        inp = tf.concat( [u,v,tf.abs(u-v), u*v], axis = 1 )
        output = self.MLP.frwrd(inp)
        return output
    
    def frwrd_pass_nmt(self, X_En,L_En,maxLen,train,dropout):
        context_vector = self.rnn.frwrd(X_En,L_En,train,dropout) #final state from RNN
        dec = self.decoder.frwrd(context_vector,maxLen) #maxLen of the the batch going to decoder, from German Dataset
        return dec
    
    def frwrd_pass_cpt(self, X_En,L_En,maxLen,train,dropout):
        context_vector = self.rnn.frwrd(X_En,L_En,train,dropout) #final state from RNN
        dec = self.tree_decoder.frwrd(context_vector,maxLen) #maxLen of the the batch going to decoder, from German Dataset
        return dec

In [11]:
model = Model(30002,30002,76) 

In [12]:
def loss_nli(predicted_y, desired_y):
    return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predicted_y, labels=desired_y) )

In [13]:
def loss_nmt(predicted_y,desired_y,desired_y_len,max_len):
    desired_y = tf.one_hot(desired_y,depth= predicted_y.shape[-1])
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=predicted_y,labels=desired_y)
    mask = tf.sequence_mask(desired_y_len,max_len, dtype=tf.float32)
    loss = loss * mask
    loss = tf.reduce_sum(loss) / tf.cast(tf.reduce_sum(desired_y_len), dtype=tf.float32)
    return loss

In [14]:
def train_nli(epochs,restore=False,dropout=0.3):
    
    #Load the dataset
    dataset  = Pipeline.load_NLI_dataset(TRAIN_FILE)
    
    optimizer = tf.train.AdamOptimizer(learning_rate = 0.002)
    checkpoint_directory = 'models_checkpoints/common/'
    checkpoint = tfe.Checkpoint(optimizer=optimizer,
                            model=model,
                            optimizer_step=tf.train.get_or_create_global_step())
    
     #Restore latest checkpoint 
    if(restore):
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
        return

        
    for i in range(epochs):
        cnt = 0
        batch_mean_loss = 0.0
        for x in iter(dataset):
            cnt += 1
        
            with tf.GradientTape() as tape:
                predicted = model.frwrd_pass_nli(x[0]['sentence'],x[0]['len'],x[1]['sentence'],x[1]['len'],True,dropout)
                desired = x[2]
                curr_loss = loss_nli(predicted, desired)
                batch_mean_loss += curr_loss
            
            grads = tape.gradient( curr_loss, model.variables[:-1] )
            optimizer.apply_gradients(zip(grads, model.variables[:-1]),
                                        global_step=tf.train.get_or_create_global_step())
            
            
            if(cnt%100==0):
                log_msg('Epoch {:d}: Batch Id {:d} NLI Batch Loss: {:.4f}'.format(i,cnt,batch_mean_loss/100.0))
                batch_mean_loss = 0.0
            
            if(DEBUG_MODE):
                if(cnt==400):
                    break
                    
        log_msg("NLI Training Completed for 1 epoch, Saving Final Checkpoint")            
        checkpoint.save(file_prefix=checkpoint_directory)
            


In [15]:
def train_nmt(epochs,restore=False,dropout=0.3):
    
    #Load the Dataset
    dataset  = Pipeline.load_NMT_dataset(EN_FILE,DE_FILE,batch_size=8)
    
    
    #Create Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate = 0.002)
    
    
    #Init Check Points Directory
    checkpoint_directory = 'models_checkpoints/common/'
    checkpoint = tfe.Checkpoint(optimizer=optimizer,
                            model=model,
                            optimizer_step=tf.train.get_or_create_global_step())
    
    #Restore latest checkpoint 
    if(restore):
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))

    
    for i in range(epochs):
        
        cnt=0
        batch_mean_loss  = 0.0
        for x in iter(dataset):
            
            curr_loss = 0.0
            cnt += 1
            
            with tf.GradientTape() as tape:
                max_len_of_batch_german = x[1]['sentence'].shape[1]
                
                max_len_of_batch_english = x[0]['sentence'].shape[1]
                if(max_len_of_batch_english>200):
                    continue
                #print(max_len_of_batch_english)
                
                input_sentence_english,input_sentence_english_len  = x[0]['sentence'],x[0]['len']
                
                
                predicted = model.frwrd_pass_nmt(input_sentence_english,input_sentence_english_len,max_len_of_batch_german,True,dropout)
                #Convert Labels into One Hot
                max_batch_len_german = x[1]['sentence'].shape[1]
                german_sentence = x[1]['sentence']#tf.one_hot(x[1]['x'],depth=30002)
                german_sentence_len  = x[1]['len']
                #print(german)
                
                
                curr_loss = loss_nmt(predicted,german_sentence,german_sentence_len,max_batch_len_german)
                batch_mean_loss+= curr_loss
                #print(('Epoch %d : Batch Id %d Batch Loss: %.4f'%(i,batch_id,curr_loss)))
                if(cnt%100==0):
                    log_msg('Epoch {:d}: Batch Id {:d} NMT Batch Loss: {:.4f}'.format(i,cnt,batch_mean_loss/100.0))
                    checkpoint.save(file_prefix=checkpoint_directory)
                    batch_mean_loss = 0 
                    
                if(DEBUG_MODE):
                    if(cnt==400):
                        break
                        
                    
        
            grads = tape.gradient( curr_loss, model.variables )
            optimizer.apply_gradients(zip(grads, model.variables),
                                        global_step=tf.train.get_or_create_global_step())
         
    #Save the Final Checkpoint
    log_msg("NMT Training Completed, Saving Final Checkpoint")
    checkpoint.save(file_prefix=checkpoint_directory)

In [16]:
def train_cpt(epochs,restore=False,dropout=0.3):
    
    #Load the Dataset
    dataset  = Pipeline.load_tree_dataset(CPT_X_FILE,CPT_Y_FILE)
    
    
    #Create Optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate = 0.002)
    
    
    #Init Check Points Directory
    checkpoint_directory = 'models_checkpoints/common/'
    checkpoint = tfe.Checkpoint(optimizer=optimizer,
                            model=model,
                            optimizer_step=tf.train.get_or_create_global_step())
    
    #Restore latest checkpoint 
    if(restore):
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
        log_msg("Restored CPT Previous Checkpoint")
        

    for i in range(epochs):
        
        cnt=0
        batch_mean_loss  = 0.0
        for x in iter(dataset):
            curr_loss = 0.0
            cnt += 1
            
            with tf.GradientTape() as tape:
                max_len_of_batch = x[1]['sentence'].shape[1]
                input_sentence_english,input_sentence_english_len  = x[0]['sentence'],x[0]['len']
                
               
                predicted = model.frwrd_pass_cpt(input_sentence_english,input_sentence_english_len,max_len_of_batch,True,dropout)
                
                tree_sentence = x[1]['sentence']#tf.one_hot(x[1]['x'],depth=30002)
                tree_sentence_len  = x[1]['len']
                
                
                #Use the same LOSS FUNCTION as NMT
                curr_loss = loss_nmt(predicted,tree_sentence,tree_sentence_len,max_len_of_batch)
                batch_mean_loss+= curr_loss
               
                if(cnt%100==0):
                    log_msg('Epoch {:d}: Batch Id {:d} CPT Batch Loss: {:.4f}'.format(i,cnt,batch_mean_loss/100.0))
                    checkpoint.save(file_prefix=checkpoint_directory)
                    batch_mean_loss = 0.0
                
                if(DEBUG_MODE):
                    if(cnt==200):
                        break
        
            grads = tape.gradient(curr_loss, model.variables )
            optimizer.apply_gradients(zip(grads, model.variables),
                                        global_step=tf.train.get_or_create_global_step())
         
    #Save the Final Checkpoint
    log_msg("CPT Training Completed for 1 epoch, Saving Final Checkpoint")
    checkpoint.save(file_prefix=checkpoint_directory)

In [17]:
def train(epochs, model):
    if model == 'nli':
        train_nli(epochs,False,0.3)
    elif model == 'nmt':
        train_nmt(epochs,False,0.3)
    else:
        train_cpt(epochs,False,0.3)

In [18]:
log_msg("Training Started on CPT")
train(1,"cpt")
log_msg("Training Completed on CPT")

INFO:tensorflow:Thu Oct 11 16:28:31 2018: Training Started on CPT
INFO:tensorflow:Thu Oct 11 16:28:31 2018: Training Completed on CPT


In [24]:
#print(model.rnn.W)

In [20]:
"""
my_model = Model(30002,30002,76) 
optimizer2 = tf.train.AdamOptimizer(learning_rate = 0.002)

chkpnt = tfe.Checkpoint(optimizer=optimizer2, model=my_model, optimizer_step=tf.train.get_or_create_global_step())
checkpoint_directory = 'models_checkpoints/common/'
print(chkpnt.restore(tf.train.latest_checkpoint(checkpoint_directory)))
log_msg("Restored CPT Previous Checkpoint")
print(my_model.rnn.W)
"""

'\nmy_model = Model(30002,30002,76) \noptimizer2 = tf.train.AdamOptimizer(learning_rate = 0.002)\n\nchkpnt = tfe.Checkpoint(optimizer=optimizer2, model=my_model, optimizer_step=tf.train.get_or_create_global_step())\ncheckpoint_directory = \'models_checkpoints/common/\'\nprint(chkpnt.restore(tf.train.latest_checkpoint(checkpoint_directory)))\nlog_msg("Restored CPT Previous Checkpoint")\nprint(my_model.rnn.W)\n'

In [21]:

log_msg("Training Started on NLI")
train(1,"nli")
log_msg("Training Completed on NLI")


INFO:tensorflow:Thu Oct 11 16:28:32 2018: Training Started on NLI
INFO:tensorflow:Thu Oct 11 16:28:32 2018: Training Completed on NLI


In [22]:
log_msg("Training Started on NMT")
train(1,'nmt')
log_msg("Training Completed on NMT")

INFO:tensorflow:Thu Oct 11 16:28:32 2018: Training Started on NMT
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.

INFO:tensorflow:Thu Oct 11 16:29:18 2018: Epoch 0: Batch Id 100 NMT Batch Loss: 7.2978
INFO:tensorflow:Thu Oct 11 16:30:06 2018: Epoch 0: Batch Id 200 NMT Batch Loss: 6.9562
INFO:tensorflow:Thu Oct 11 16:30:50 2018: Epoch 0: Batch Id 300 NMT Batch Loss: 6.9820
INFO:tensorflow:Thu Oct 11 16:31:30 2018: Epoch 0: Batch Id 400 NMT Batch Loss: 6.9277
INFO:tensorflow:Thu Oct 11 16:31:31 2018: NMT Training Completed, Saving Final Checkpoint
INFO:tensorflow:Thu Oct 11 16:31:35 2018: Training Started on NMT


In [23]:
!nvidia-smi

Thu Oct 11 16:31:35 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44                 Driver Version: 396.44                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           On   | 0000EA82:00:00.0 Off |                    0 |
| N/A   54C    P0    56W / 149W |  10898MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    