In [1]:
import pandas as pd
import numpy as np
import gensim
from nltk.corpus import stopwords
import re
import tensorflow as tf

#-----------Global variables
# Word -> embedding lookup (each value is a list of floats); filled in by loadEmbeddings().
GloveEmbeddings = {}
# Number of words kept per query: shorter queries are padded with "zerovec", longer ones are trimmed.
max_query_words = 12
# Number of words kept per passage: shorter passages are padded with "zerovec", longer ones are trimmed.
max_passage_words = 50
# Length of the all-zero "zerovec" padding vector.
# NOTE(review): presumably must equal the dimension of the embedding file that is loaded — confirm when switching files.
emb_dim = 100
# Number of rows written to one output TFRecord file before the next file is started.
totalRowsInFile = 500

  from ._conv import register_converters as _register_converters


In [None]:
# Load the raw tab-separated dataset (no header row) into a DataFrame.
# NOTE(review): `data` is not referenced by any other cell visible here — presumably exploratory; confirm before relying on it.
data = pd.read_csv("./data.tsv",sep="\t",header=None)

In [2]:
def loadEmbeddings(embeddingfile):
    """Load word embeddings from a text file into the global ``GloveEmbeddings`` dict.

    Every non-blank line is expected to hold a word followed by its
    whitespace-separated vector components. Each word is stored as a key of
    ``GloveEmbeddings`` with its vector as a list of floats. Blank lines are
    skipped. After loading, the key ``"zerovec"`` is set to a list of
    ``emb_dim`` zeros (overwriting any word of that name from the file).

    Args:
        embeddingfile: Path of the embedding text file. It is read as UTF-8
            and undecodable bytes are ignored.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    # `with` guarantees the handle is released even when a line fails to parse.
    with open(embeddingfile,"r",encoding="utf-8",errors="ignore") as fe:
        for line in fe:
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            GloveEmbeddings[tokens[0]] = [float(component) for component in tokens[1:]]
    #Add Zerovec, this will be useful to pad zeros, it is better to experiment with padding any non-zero constant values also.
    GloveEmbeddings["zerovec"] = [0.0] * emb_dim

In [None]:
#[float(word) for word in GloveEmbeddings["word"].split()]
#GloveEmbeddings["word"] + GloveEmbeddings["test"]

In [3]:
#----------Create multiple TFRecord files, each holding up to totalRowsInFile rows
def removeStopWordsAndGenerateEmbeddings(inputfile,outputfile,isEvaluation):
    """Convert a tab-separated query/passage file into ZLIB-compressed TFRecord files.

    Every non-blank input row is lower-cased and split on tabs. The query
    (second column) and the passage (third column) are each split on non-word
    characters, stripped of English stop words, padded with ``"zerovec"`` or
    trimmed to ``max_query_words`` / ``max_passage_words`` words, and replaced
    by the concatenation of the words' ``GloveEmbeddings`` vectors. Words that
    have no embedding contribute the ``"zerovec"`` vector.

    Rows are written to ``outputfile.format("1")``, ``outputfile.format("2")``
    and so on, with at most ``totalRowsInFile`` rows per file. The first file
    is always created; a further file is only created once there is a row to
    put in it.

    ``GloveEmbeddings`` must already contain the ``"zerovec"`` entry, which
    loadEmbeddings() adds.

    Args:
        inputfile: Path of the tab-separated input file. The first column
            (query_id) and the fourth column must be integers.
        outputfile: Output path template with one ``{}`` placeholder that is
            filled with the 1-based file number.
        isEvaluation: When falsy, the fourth column is used as a label and
            written one-hot (two classes) under ``label``. When truthy,
            ``query_id`` is written and the fourth column is written under
            ``passage_id``.

    Raises:
        ValueError: if the first or fourth column is not an integer.
        IndexError: if a row has fewer than four columns, or if the label
            cannot index the two-element one-hot vector.
    """
    def _int64_feature(value):
        # Wrap a list of ints as an Int64List feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

    def _floatList_feature(value):
        # Wrap a list of floats as a FloatList feature.
        return tf.train.Feature(float_list=tf.train.FloatList(value=value))

    def _text_to_feature_vector(text,max_words):
        # Turn a text into the flat concatenation of exactly max_words embedding vectors.
        wordsWithoutPunctuation = re.split(r'\W+', text)
        words = [word for word in wordsWithoutPunctuation if word and word not in stop_words]
        if not words:
            # Nothing but stop words / punctuation: fall back to the unfiltered tokens.
            words = wordsWithoutPunctuation
        # Pad with "zerovec" up to max_words (a negative count pads nothing), then trim extra words.
        words = (words + ["zerovec"] * (max_words - len(words)))[:max_words]
        zero_vector = GloveEmbeddings["zerovec"]
        feature_vector = []
        for word in words:
            # OOV terms get the zero vector.
            feature_vector.extend(GloveEmbeddings.get(word, zero_vector))
        return feature_vector

    stop_words = set(stopwords.words('english'))
    opts = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.ZLIB)

    fileCounter = 1
    currentRowNum = 0

    # Format of the file : query_id \t query \t passage \t label \t passage_id
    # NOTE(review): in evaluation mode the fourth column is written as passage_id —
    # presumably the unlabelled file carries the passage id there; confirm against the data.
    with open(inputfile,"r",encoding="utf-8",errors="ignore") as f:
        #---------Create writer for first file
        writer = tf.python_io.TFRecordWriter(outputfile.format(str(fileCounter)),opts)
        try:
            #---------Start reading input file
            for line in f:
                row = line.strip()
                if not row:
                    # Skip blank lines instead of failing on the integer parse below.
                    continue
                tokens = row.lower().split("\t")
                query_id,query,passage,label = [int(tokens[0])],tokens[1],tokens[2],int(tokens[3])

                query_feature_vector = _text_to_feature_vector(query,max_query_words)
                passage_feature_vector = _text_to_feature_vector(passage,max_passage_words)

                if(not isEvaluation):
                    #----------label Processing: one-hot encode over two classes
                    labelFeatureVector = [0,0]
                    labelFeatureVector[label] = 1
                    feature = {
                        'query': _floatList_feature(query_feature_vector),
                        'passage': _floatList_feature(passage_feature_vector),
                        'label': _int64_feature(labelFeatureVector)
                    }
                else:
                    feature = {
                        'query': _floatList_feature(query_feature_vector),
                        'passage': _floatList_feature(passage_feature_vector),
                        'query_id': _int64_feature(query_id),
                        'passage_id': _int64_feature([label])
                    }
                example = tf.train.Example(features=tf.train.Features(feature=feature))

                if writer is None:
                    # The previous file is full: open the next one only now that
                    # there is a row for it, so no empty trailing file is produced.
                    fileCounter += 1
                    writer = tf.python_io.TFRecordWriter(outputfile.format(str(fileCounter)),opts)
                writer.write(example.SerializeToString())
                currentRowNum += 1

                #-------Close the file once it holds totalRowsInFile rows
                if currentRowNum == totalRowsInFile:
                    writer.close()
                    writer = None
                    currentRowNum = 0
        finally:
            # Flush and close the open file even if a row failed to parse.
            if writer is not None:
                writer.close()

In [None]:
def splitDataset(dataFileName,trainFilename, validationFileName, test_size=0.10, random_state=42):
    """Split a tab-separated dataset into a training file and a validation file.

    The input is read without a header row, split with scikit-learn's
    ``train_test_split``, and both parts are written back as tab-separated
    files without header or index column.

    Args:
        dataFileName: Path of the tab-separated input file.
        trainFilename: Path the training part is written to.
        validationFileName: Path the held-out part is written to.
        test_size: Passed to ``train_test_split`` as ``test_size``; the
            default 0.10 is the value that used to be hard-coded.
        random_state: Passed to ``train_test_split`` as ``random_state``; the
            default 42 is the value that used to be hard-coded.
    """
    from sklearn.model_selection import train_test_split

    # NOTE(review): pandas' default quote handling is applied on read and write;
    # presumably the data contains no stray double quotes — confirm, since the
    # embedding step splits the written rows on tabs by hand.
    data = pd.read_csv(dataFileName, header=None, sep="\t")
    train, validation = train_test_split(data, test_size=test_size, random_state=random_state)
    # header=False is the documented way to suppress the header row.
    train.to_csv(trainFilename, index=False, header=False, sep="\t")
    validation.to_csv(validationFileName, index=False, header=False, sep="\t")

In [5]:
# Fill GloveEmbeddings first: the conversion step below looks every word up in it
# and needs the "zerovec" entry that loadEmbeddings() adds.
loadEmbeddings("./glove.6B.100d.txt")
# Disabled steps: split data.tsv into train/validation files and convert each of
# them to labelled TFRecords (isEvaluation=False).
#splitDataset("./data.tsv","./trainData.tsv","./ValidationData.tsv")
#removeStopWordsAndGenerateEmbeddings("./trainData.tsv","./TrainData/trainEmbeddings_{}.tfrecords",isEvaluation=False)
#removeStopWordsAndGenerateEmbeddings("./ValidationData.tsv","./ValidationData/validationEmbeddings_{}.tfrecords",isEvaluation=False)
# Convert the unlabelled evaluation file; with isEvaluation=True each record carries
# query_id and passage_id instead of a label.
# NOTE(review): presumably the ./TestData directory must already exist — confirm.
removeStopWordsAndGenerateEmbeddings("./eval1_unlabelled.tsv","./TestData/evalUnlabelledEmbeddings_{}.tfrecords",isEvaluation=True)