In [1]:
!pip install PyDrive

import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a Drive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# (Drive file id, local file name) for each input the notebook needs.
for file_id, local_name in (
    ('1NVQj3GqWFC1i6iLFjjKBFo1XbXm4tv2J', '10k.tsv'),
    ('1hQU8hmAgBTDDzvYJkl5QvWBrJv2mNlko', '2ke.tsv'),
    ('1Gr7goLc6blrTl_xTlpvkuCsp_vSyZwD7', 'glove.6B.50d.txt'),
):
    download = drive.CreateFile({'id': file_id})
    download.GetContentFile(local_name)

Collecting PyDrive
  Downloading https://files.pythonhosted.org/packages/6b/2d/c8e052ba51099faee0bfe71d84f35bb1576e6910483cad46b840a122ca6c/PyDrive-1.3.1-py2-none-any.whl
Installing collected packages: PyDrive
Successfully installed PyDrive-1.3.1


In [0]:
# Upload the generated CTF files to Drive, using the local file name as the
# Drive title.
# NOTE(review): these files are written by the TextDataToCTF calls in a later
# cell, so this cell presumably has to run after that conversion - confirm the
# intended cell order.
for ctf_name in ('EvaluationData.ctf', 'TrainData.ctf', 'ValidationData.ctf'):
    upload = drive.CreateFile({'title': ctf_name})
    upload.SetContentFile(ctf_name)
    upload.Upload()

In [2]:
!pip  install cntk

!head -9000 10k.tsv > traindata.tsv
!wc -l traindata.tsv
!tail -1000 10k.tsv > validationdata.tsv
!mv 2ke.tsv eval1_unlabelled.tsv

import re

#Initialize Global variables
GloveEmbeddings = {}     # word -> embedding stored as a space-separated string
max_query_words = 12     # queries are padded/trimmed to this many words
max_passage_words = 50   # passages are padded/trimmed to this many words
emb_dim = 50             # number of values in the padding vector; expected to match the embedding file

def loadEmbeddings(embeddingfile):
    """Load word embeddings from a text file into the global GloveEmbeddings dict.

    Each non-blank line is split on whitespace: the first token is the word and
    the remaining tokens are re-joined with single spaces and stored as that
    word's vector string. Blank lines are skipped. A later line for the same
    word overwrites the earlier one.

    After loading, the key "zerovec" is set to emb_dim repetitions of "0.0 "
    (note the trailing space); the converter uses it for padding and for
    out-of-vocabulary words.

    Args:
        embeddingfile: path of the embedding text file.
    """
    # `with` guarantees the handle is closed even if reading fails part-way.
    with open(embeddingfile,"r") as fe:
        for line in fe:
            tokens = line.split()
            if not tokens:
                continue  # blank line: nothing to store
            GloveEmbeddings[tokens[0]] = " ".join(tokens[1:])
    #Add Zerovec, this will be useful to pad zeros, it is better to experiment with padding any non-zero constant values also.
    GloveEmbeddings["zerovec"] = "0.0 "*emb_dim


def _textToFeatureVector(text, max_words):
    """Turn `text` into one space-separated string of exactly max_words embeddings.

    The text is split on runs of non-word characters, empty pieces are dropped,
    the word list is trimmed to max_words and then padded with "zerovec".
    Words missing from GloveEmbeddings also map to the "zerovec" entry.
    """
    words = [w for w in re.split(r'\W+', text) if w]  # raw string: '\W' is not a valid str escape
    words = words[:max_words]                          # trim extra words
    words += ["zerovec"] * (max_words - len(words))    # pad short inputs
    vectors = (
        GloveEmbeddings[w] if w in GloveEmbeddings else GloveEmbeddings["zerovec"]
        for w in words
    )
    return " ".join(vectors).strip()


def TextDataToCTF(inputfile,outputfile,isEvaluation):
    """Convert a tab-separated query/passage file to CNTK Text Format (CTF).

    Input columns: query_id \\t query \\t passage \\t label \\t passage_id.
    Each line is stripped and lower-cased before splitting on tabs. The label
    column is only read when isEvaluation is False. Blank lines are skipped.

    Output, one line per input row:
      training/validation: |qfeatures <q> |pfeatures <p> |labels <one-hot>
      evaluation:          |qfeatures <q> |pfeatures <p> |qid <query_id>
    where <q> and <p> hold max_query_words / max_passage_words embeddings and
    the one-hot label is " 1 0 " for label "0" and " 0 1 " for anything else.

    Args:
        inputfile: path of the TSV file to read.
        outputfile: path of the CTF file to write (overwritten).
        isEvaluation: True to emit |qid instead of |labels.

    Raises:
        IndexError: if a non-blank row has fewer columns than required.
        KeyError: if GloveEmbeddings has no "zerovec" entry (embeddings not loaded).
    """
    # `with` closes (and flushes) both files even if a row fails to convert.
    with open(inputfile,"r") as f, open(outputfile,"w") as fw:
        for line in f:
            line = line.strip()
            if not line:
                continue  # blank line: no row to convert
            tokens = line.lower().split("\t")
            query_id,query,passage = tokens[0],tokens[1],tokens[2]

            query_feature_vector = _textToFeatureVector(query, max_query_words)
            passage_feature_vector = _textToFeatureVector(passage, max_passage_words)

            if(not isEvaluation):
                #convert label
                label_str = " 1 0 " if tokens[3]=="0" else " 0 1 "
                fw.write("|qfeatures "+query_feature_vector+" |pfeatures "+passage_feature_vector+" |labels "+label_str+"\n")
            else:
                # Space before "|qid" keeps the field separation consistent with the training row.
                fw.write("|qfeatures "+query_feature_vector+" |pfeatures "+passage_feature_vector+" |qid "+str(query_id)+"\n")

# Input locations for the three data splits and the embedding table.
trainFileName = "traindata.tsv"
validationFileName = "validationdata.tsv"
EvaluationFileName = "eval1_unlabelled.tsv"

embeddingFileName = "glove.6B.50d.txt"

# The embedding dictionary must be populated before any conversion runs.
loadEmbeddings(embeddingFileName)

# Convert Query,Passage Text Data to CNTK Text Format(CTF) using 50-Dimension Glove word embeddings
# Each entry: (input TSV, output CTF, isEvaluation flag, progress message).
for source_tsv, target_ctf, is_eval, done_message in (
    (trainFileName, "TrainData.ctf", False, "Train Data conversion is done"),
    (validationFileName, "ValidationData.ctf", False, "Validation Data conversion is done"),
    (EvaluationFileName, "EvaluationData.ctf", True, "Evaluation Data conversion is done"),
):
    TextDataToCTF(source_tsv, target_ctf, is_eval)
    print(done_message)

Collecting cntk
[?25l  Downloading https://files.pythonhosted.org/packages/fe/26/f4b6ed23df63ff64b5921b02125f6d0ee4ea92262ee146bce56ed6b449f9/cntk-2.6-cp27-cp27mu-manylinux1_x86_64.whl (74.8MB)
[K    100% |████████████████████████████████| 74.8MB 304kB/s 
Installing collected packages: cntk
Successfully installed cntk-2.6
9000 traindata.tsv
Train Data conversion is done
Validation Data conversion is done
Evaluation Data conversion is done


In [12]:
from __future__ import print_function
import numpy as np
import sys
import os
import cntk as C
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from sklearn.metrics import precision_recall_fscore_support
from sklearn.exceptions import UndefinedMetricWarning
import warnings
# Silence every UndefinedMetricWarning, whatever its message or origin.
warnings.simplefilter("ignore", category=UndefinedMetricWarning)

ImportError: ignored

In [0]:
# `!export ...` runs in a throwaway subshell, so the variable never reached the
# kernel or any later `!` command. Set it on the kernel process instead, so
# child processes started from the notebook inherit it. Existing entries are
# kept rather than overwritten.
# NOTE(review): the dynamic loader normally reads LD_LIBRARY_PATH at process
# start, so this may not change how the already-running kernel resolves shared
# libraries - confirm whether the cntk import still needs a different fix.
import os

_lib_dir = "/usr/local/lib"
_ld_paths = [p for p in os.environ.get("LD_LIBRARY_PATH", "").split(os.pathsep) if p]
if _lib_dir not in _ld_paths:
    os.environ["LD_LIBRARY_PATH"] = os.pathsep.join([_lib_dir] + _ld_paths)

In [22]:
# Debug aid: print the module search path of the `python` executable that the
# shell resolves. This runs as a separate process, not inside the notebook kernel.
!python -c "import sys; print(sys.path)"

['', '/env/python', '/usr/lib/python2.7', '/usr/lib/python2.7/plat-x86_64-linux-gnu', '/usr/lib/python2.7/lib-tk', '/usr/lib/python2.7/lib-old', '/usr/lib/python2.7/lib-dynload', '/usr/local/lib/python2.7/dist-packages', '/usr/lib/python2.7/dist-packages']
