### Pipeline to load Datasets
- NLI Dataset
- NMT Dataset
- Tree Dataset

In [1]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
from collections import OrderedDict
import numpy as np

In [2]:
# Run TensorFlow eagerly: helpers further down (createLabelNumpy, lookup)
# call `.numpy()` on tensors returned by iterators and lookups.
tf.enable_eager_execution()

In [3]:
# NLI training file; passed to load_NLI_dataset below.
TRAIN_FILE = "./gensen/data/corpora/allnli.train.txt.clean.noblank"
# English / German text files; the (commented-out) NMT cell passes them to
# load_NMT_dataset as file1 / file2.
EN_FILE = "./english.tok"
DE_FILE = "./german.tok"

# Parsing (tree dataset) files; the (commented-out) tree cell passes them to
# load_tree_dataset as file1 (X) / file2 (Y).
CPT_X_FILE = "./tree_data/en.txt.tok.out"
CPT_Y_FILE  = "./tree_data/pt.out"

# Vocabulary files used to build the lookup tables in the next cell.
COMMON_VOCAB_FILE = "./words.txt"
GERMAN_VOCAB_FILE = "./german_words.txt"
TREE_VOCAB_FILE = "./tree_words.txt"


In [4]:
# Create the string -> integer-id lookup tables, one per vocabulary file.
# num_oov_buckets=1 gives every table a single extra bucket for tokens that
# are not listed in its file.
vocab = tf.contrib.lookup.index_table_from_file(COMMON_VOCAB_FILE, num_oov_buckets=1)
vocab_german = tf.contrib.lookup.index_table_from_file(GERMAN_VOCAB_FILE,num_oov_buckets=1)
vocab_tree  = tf.contrib.lookup.index_table_from_file(TREE_VOCAB_FILE,num_oov_buckets=1)

In [10]:
#vocab.lookup(tf.constant(["hello","a","great"]))

<tf.Tensor: id=30, shape=(3,), dtype=int64, numpy=array([7844,    7,  197])>

In [60]:
def view_data(dataset,n_rows=4):
    """Print the first `n_rows` elements of `dataset`, one element per print call."""
    iterator = dataset.make_one_shot_iterator()
    for _ in range(n_rows):
        print(iterator.get_next())

In [15]:
def convertToTokens(dataset,vocab):
    """Tokenize every sentence of `dataset` and encode the tokens with `vocab`.

    Sentences are split with tf.string_split's default delimiter. Each element
    of the returned dataset is a dict with two entries:
      'sentence': result of `vocab.lookup` on the sentence's tokens
      'len':      number of tokens in the sentence (tf.size of the token tensor)
    """
    def _tokenize(sentence):
        # string_split expects a batch of strings, hence the one-element list
        return tf.string_split([sentence]).values

    def _encode(tokens):
        return { 'sentence': vocab.lookup(tokens), 'len': tf.size(tokens) }

    return dataset.map(_tokenize).map(_encode)

def load_dataset(filepath,vocab):
    """Load a tab-separated text file as three per-column datasets.

    Every line of `filepath` is split on tabs. Column 0 and column 1 are
    tokenized and encoded with `vocab` through convertToTokens; column 2 is
    returned as the raw string field.

    Returns:
        (HypothesisData, PremisesData, LabelsData): datasets built from
        columns 0, 1 and 2 respectively.
    """
    lines = tf.data.TextLineDataset(filepath)

    def _column(index):
        # Dataset holding the `index`-th tab-separated field of every line.
        return lines.map(lambda line: tf.string_split([line],"\t").values[index])

    hypotheses = convertToTokens(_column(0),vocab)
    premises = convertToTokens(_column(1),vocab)
    labels = _column(2)

    return (hypotheses,premises,labels)
    
    

In [16]:
def createLabelNumpy(L):
    """Convert a dataset of NLI label strings into a NumPy array of class ids.

    The whole dataset is iterated eagerly, so this requires eager execution.

    Args:
        L: dataset whose elements are scalar byte-string tensors, each holding
           one of 'neutral', 'contradiction', 'entailment' or '-'.

    Returns:
        1-D int64 np.ndarray with one class id per element of `L`, in order
        (neutral=0, contradiction=1, entailment=2, '-'=3).

    Raises:
        ValueError: if an element is not one of the known label strings.
    """
    label_to_id = { 'neutral':0,
                    'contradiction':1,
                    'entailment':2,
                    '-':3
                  }

    itrH = L.make_one_shot_iterator()
    ids = []

    while True:
        try:
            raw_label = itrH.get_next()
        except tf.errors.OutOfRangeError:
            # Iterator exhausted: every label has been consumed.
            break

        # Only end-of-data stops the loop. A bad label must not be swallowed,
        # otherwise the label array is silently truncated and no longer lines
        # up with the sentence datasets it is zipped with.
        label = raw_label.numpy().decode('utf-8')
        if label not in label_to_id:
            raise ValueError(
                "Unknown NLI label %r at position %d; expected one of %s"
                % (label, len(ids), sorted(label_to_id)))
        ids.append(label_to_id[label])

    # Explicit dtype so an empty input still yields an integer array.
    return np.array(ids, dtype=np.int64)


In [71]:
def load_NLI_dataset(file,batch_size=32,buffer_size=1024,prefetch_size=5):
    """Build a shuffled, padded and batched NLI dataset from a train file.

    The file is read with load_dataset using the module-level `vocab`. The
    label column is converted to integer ids by createLabelNumpy (which reads
    the whole column eagerly) and then turned into one-hot vectors.

    Args:
        file: path of the NLI train file.
        batch_size: number of examples per batch.
        buffer_size: shuffle buffer size.
        prefetch_size: number of batches to prefetch.

    Returns:
        Dataset of (hypothesis, premise, label) batches. `hypothesis` and
        `premise` are dicts with 'sentence' (padded within each batch) and
        'len'; `label` is a one-hot vector of depth 4.
    """
    # Number of label classes; must match the ids createLabelNumpy emits (0..3).
    num_classes = 4

    hypotheses,premises,label_strings = load_dataset(file,vocab)

    label_ids = createLabelNumpy(label_strings)
    labels = tf.data.Dataset.from_tensor_slices(label_ids).map(
        lambda label_id: tf.one_hot(label_id, num_classes))

    padded_shapes = ({'sentence':[None],'len':[]},   # hypothesis
                     {'sentence':[None],'len':[]},   # premise
                     [num_classes])                  # one-hot label, fixed size

    dataset = tf.data.Dataset.zip((hypotheses,premises,labels))
    dataset = (dataset
               .shuffle(buffer_size=buffer_size)
               .padded_batch(batch_size=batch_size,padded_shapes=padded_shapes)
               .prefetch(prefetch_size)
              )

    return dataset

In [72]:
# Build the batched NLI dataset from TRAIN_FILE with the default
# batch_size / buffer_size / prefetch_size.
dataset = load_NLI_dataset(TRAIN_FILE)

In [69]:
#view_data(dataset)

In [61]:
def lookup(word,vocab):
    """Look `word` up in `vocab` and return the resulting id as a NumPy value."""
    word_tensor = tf.constant(word)
    word_id = vocab.lookup(word_tensor)
    return word_id.numpy()

In [22]:
# Create Dataset for NMT Task

In [48]:
def load_NMT_dataset(file1,file2,buffer_size=1024,prefetch_size=5,batch_size=32):
    """Build a shuffled, padded and batched dataset of sentence pairs for NMT.

    Lines of `file1` are encoded with the module-level `vocab` and lines of
    `file2` with `vocab_german`; the two are zipped line by line. Each batch
    is a pair of dicts with 'sentence' (padded within the batch) and 'len'.
    """
    source_lines = tf.data.TextLineDataset(file1)
    target_lines = tf.data.TextLineDataset(file2)

    source = convertToTokens(source_lines,vocab)
    target = convertToTokens(target_lines,vocab_german)

    # 'sentence' is variable-length and gets padded; 'len' is a scalar.
    source_shapes = {'sentence':[None],'len':[]}
    target_shapes = {'sentence':[None],'len':[]}

    pairs = tf.data.Dataset.zip((source,target))
    shuffled = pairs.shuffle(buffer_size=buffer_size)
    batched = shuffled.padded_batch(batch_size=batch_size,
                                    padded_shapes=(source_shapes,target_shapes))
    return batched.prefetch(prefetch_size)

In [49]:
#dataset_nmt  = load_NMT_dataset(EN_FILE,DE_FILE)

In [66]:
#view_data(dataset_nmt)

In [56]:
def load_tree_dataset(file1,file2,buffer_size=1024,prefetch_size=5,batch_size=32):
    """Build a shuffled, padded and batched dataset for the tree task.

    Lines of `file1` are encoded with the module-level `vocab` and lines of
    `file2` with `vocab_tree`; the two are zipped line by line. Each batch is
    a pair of dicts with 'sentence' (padded within the batch) and 'len'.
    """
    raw_inputs = tf.data.TextLineDataset(file1)
    raw_trees = tf.data.TextLineDataset(file2)

    encoded_inputs = convertToTokens(raw_inputs,vocab)
    encoded_trees = convertToTokens(raw_trees,vocab_tree)

    # Pad the variable-length 'sentence' ids; 'len' stays a scalar.
    shapes = ({'sentence':[None],'len':[]},
              {'sentence':[None],'len':[]})

    dataset_tree = tf.data.Dataset.zip((encoded_inputs,encoded_trees))
    dataset_tree = dataset_tree.shuffle(buffer_size=buffer_size)
    dataset_tree = dataset_tree.padded_batch(batch_size=batch_size,padded_shapes=shapes)
    dataset_tree = dataset_tree.prefetch(prefetch_size)
    return dataset_tree

In [63]:
#dataset_tree = load_tree_dataset(CPT_X_FILE,CPT_Y_FILE)

In [70]:
#view_data(dataset)  # the NLI dataset is bound to `dataset` above; `dataset_nli` is never defined