In [1]:
import numpy as np
import pandas as pd
import csv
from tempfile import mkdtemp
import os.path as path
from scipy.sparse import csr_matrix, isspmatrix
import json
import io
import re
from time import gmtime, strftime
from datetime import datetime
import shlex, subprocess
import time
import glob
from krovetzstemmer import Stemmer
import string
from collections import Counter
import itertools
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

## Build Methods

In [2]:
def prepare_question_text(question_text):
    question_text = question_text.replace('?','') #remove ?
    question_text = question_text.replace("'",'')
    question_text = question_text.replace('"','')
    question_text = question_text.replace('-',' ')
    question_text = question_text.replace('(','')
    question_text = question_text.replace(')','')
    question_text = question_text.replace(',','')
    question_text = question_text.replace('.','')
    question_text = question_text.replace('&',' and ')
    question_text = question_text.replace(':','')
    question_text = question_text.replace('>','')#error in dataset
    question_text = question_text.replace('/',' ')

    return question_text

In [3]:
## Build a dataframe of query number and query for convenience 
queryDF = pd.DataFrame(columns=['queryNum', 'query'])
i = 0
with open('query.txt', 'w') as file_to_write:
    with open('Topic', 'r') as file_to_read:
        for line in file_to_read:
            if line.startswith("<num>"):
                queryNum = re.search(r'\d+', line)
            if line.startswith("<title>"):
                if(line.split("<title>",1)[1][:-1] == ""):
                    queryDF.loc[i] = [int(queryNum.group()), prepare_question_text(next(file_to_read)[:-1])]
                    i = i+1
                else:
                    queryDF.loc[i] = [int(queryNum.group()), prepare_question_text(line.split("<title> ",1)[1][:-1])]
                    i = i+1
    file_to_read.close()
file_to_write.close()


Unnamed: 0,queryNum,query
0,301,International Organized Crime
1,302,Poliomyelitis and Post Polio
2,303,Hubble Telescope Achievements
3,304,Endangered Species Mammals
4,305,Most Dangerous Vehicles
5,306,African Civilian Deaths
6,307,New Hydroelectric Projects
7,308,Implant Dentistry
8,309,Rap and Crime
9,310,Radio Waves and Brain Cancer


## Make QUERY File and run Baseline Retrieval (in cmd)

In [38]:
with open('query.txt', 'w') as file_to_write:
    file_to_write.write("<parameter>\n")
    with open('Topic', 'r') as file_to_read:
#         table = str.maketrans('', '', string.punctuation)
#         sentences = [[w.translate(table) for w in x if not w in stop] for x in tokens]
        for line in file_to_read:
            
            if line.startswith("<num>"):
                file_to_write.write("<query>\n")
                queryNum = re.search(r'\d+', line)
                file_to_write.write('<number>'+ queryNum.group()+'</number>\n')
#                 print(queryNum.group())
            if line.startswith("<title>"):
                if(line.split("<title>",1)[1][:-1] == ""):
                   file_to_write.write('<text>'+  prepare_question_text(next(file_to_read)[:-1])  + '</text>\n')
                else:
                    
                    file_to_write.write('<text>'+ line.split("<title>",1)[1][:-1] + '</text>\n')
                file_to_write.write("</query>\n")
    # if it starts with <num>, extract the qnum
            # if it starts with <title>, extract the title
            # break from while loop
    file_to_read.close()
    file_to_write.write("</parameter>\n")
file_to_write.close()
        # write <query>
        # write the num
        # write the text
        # write </query>

## GLOBAL EMBEDDING

In [4]:
words = pd.read_table(r'C:\Users\nroy0\Downloads\Thesis\glove.6B\glove.6B.50d.txt', sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
words100 = pd.read_table(r'C:\Users\nroy0\Downloads\Thesis\glove.6B\glove.6B.100d.txt', sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
words200 = pd.read_table(r'C:\Users\nroy0\Downloads\Thesis\glove.6B\glove.6B.200d.txt', sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
words300 = pd.read_table(r'C:\Users\nroy0\Downloads\Thesis\glove.6B\glove.6B.300d.txt', sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

In [5]:
stop = json.load(open(r'C:\Users\nroy0\Downloads\Thesis\glove.6B\SMARTstop.json'))

In [6]:
def vec(w):
    return words.loc[w].as_matrix()

In [7]:
wordList = words.index.values
word100List = words100.index.values
word200ListS = words200.index.values
word300ListS = words300.index.values
U = words100.as_matrix()


In [8]:
def candidateTerms( question,k):
    filtered_text = removeStopWords(question)
    q = binaryEncoding( filtered_text)
    tmp = np.matmul(U.T,q)
    candidate = np.matmul(U,tmp)
    index = np.argsort(-candidate)[:k]
    weights = -np.sort(-candidate)[:k]
    normW = normalise(weights)
    expansionTerms = np.asarray(word100List)[index]
    pqPlus = dict(zip(expansionTerms,normW))
    return pqPlus

In [9]:
def removeStopWords(questionText):
    return [w.lower() for w in questionText.split()  if not w in stop]

In [10]:
def binaryEncoding(qText):
    questionList = qText
    binaryEncoded =np.array([ 1 if i in questionList else 0 for i in word100List])
    return binaryEncoded

In [11]:
def normalise(weights):
    sumW = np.sum(weights)
    normW = weights/sumW
    return normW

In [12]:
def queryFormat(lamda, question_text, arr):
    expansionTerms = convertExpansionTerms(question_text, arr)
    filtered_text = removeStopWords(question_text)
    finalQueryText = '<text>#weight( ' + str(lamda)+' #combine ( '+ ' '.join(word for word in filtered_text) + " ) " + str(1 - lamda) + " #weight( " + expansionTerms + ") )</text>"
    return finalQueryText

In [13]:
def convertExpansionTerms(question_text, arr):
    onlyUniqueWords = [i for i in list(arr) if i.isalpha()]
#     onlyUniqueWords = np.setdiff1d(onlyWords, question_text.split())
    weightO = normalise([arr[x] for x in onlyUniqueWords])
    testStr = ""
    for item in range(len(onlyUniqueWords)):
        testStr = testStr + str(round(weightO[item],4))+" "+str(onlyUniqueWords[item]) + " "
    return testStr

In [100]:
arr = candidateTerms('International Organized Crime', 50)
queryFormat(0.5, 'International Organized Crime', arr)

'<text>#weight( 0.5 #combine ( international organized crime ) 0.5 #weight( 0.0232 international 0.0223 crime 0.0217 organizations 0.0215 groups 0.0214 activities 0.0212 organization 0.0211 criminal 0.021 countries 0.0207 terrorism 0.0207 crimes 0.0206 economic 0.0206 political 0.0205 government 0.0203 trafficking 0.0203 terrorist 0.0203 military 0.0201 security 0.02 corruption 0.02 rights 0.02 cooperation 0.0199 national 0.0199 anti 0.0198 group 0.0198 department 0.0198 law 0.0198 police 0.0197 public 0.0197 violence 0.0197 world 0.0197 enforcement 0.0196 nations 0.0196 agencies 0.0196 islamic 0.0195 regional 0.0194 organized 0.0193 social 0.0193 efforts 0.0193 officials 0.0192 laws 0.0192 authorities 0.0192 drug 0.0192 investigation 0.0191 european 0.0191 commission 0.0191 global 0.019 country 0.019 financial 0.019 action 0.019 china 0.019 civil ) )</text>'

In [15]:
def originalGlobal(expansion_terms, lamda, X_train, mode, cv):
    if mode == 'train':
        queryFile = 'C:/Users/nroy0/Downloads/Thesis/aquaint/Vol45/Global100Queries/train/' + str(cv)+'/query_global_'+str(lamda) + '_' + str(expansion_terms) +'.txt'
    elif mode == 'test':
        queryFile = 'C:/Users/nroy0/Downloads/Thesis/aquaint/Vol45/Global100Queries/test/query_global_'+str(lamda) + '_' + str(expansion_terms) + '_' + str(cv) +'.txt'

     
    with io.open(queryFile, 'w',  encoding="utf-8") as file_to_write:
        file_to_write.write("<parameter>\n")
        for i in range(len(X_train)):
#             print(i)
            file_to_write.write("<query>\n")
            file_to_write.write('<number>'+ str(X_train['queryNum'][i])+'</number>\n')
            question_text =prepare_question_text(X_train['query'][i])
            arr = candidateTerms(question_text , expansion_terms)
            finalQueryText = queryFormat(lamda, question_text, arr)
            file_to_write.write(finalQueryText + '\n')
            with open(r'C:\Users\nroy0\Downloads\Thesis\trec_eval.9.0\original\output\baselineQL_[1000].indri') as indrifile:
                for output in indrifile:
                    if output[:3] == str(X_train['queryNum'][i]):
                        doc = re.search(r'Q0 (.*?) ', output).group(1)
                        file_to_write.write('<workingSetDocno>'+doc+ '</workingSetDocno>\n')
            indrifile.close()
            file_to_write.write("</query>\n")

#         file_to_read.close()
        file_to_write.write("</parameter>\n")
    file_to_write.close()
    return queryFile

## Cross Validation

In [16]:
kf = KFold( n_splits = 5, shuffle = True)
cv = 0
queryTestindex = {}
for train_index,test_index in kf.split(queryDF):
    cv = cv+1
    X_train, X_test = queryDF.loc[train_index].reset_index(), queryDF.loc[test_index].reset_index()
    queryTestindex[cv]= test_index
    for expansion_terms in [ 50, 100, 200]:
        for lamda in [  0.3, 0.45, 0.6, 0.75]: 
            startTime = time.time()
            queryTrainFile = originalGlobal(expansion_terms, lamda, X_train, 'train', cv)
            
#             queryTestFile = originalGlobal(expansion_terms, lamda, X_test, 'test', cv)
            endTime = time.time()
            t_sec = round(endTime - startTime)
            (t_min, t_sec) = divmod(t_sec,60)
            print('Query File for Lamda {} and expansion terms {} created, time taken {} minutes and {} seconds'.format(lamda, expansion_terms, t_min, t_sec))
            for mu in [ 500, 750, 1000, 1500, 2000 ]:
                startTime = time.time()
                outputFile = str(cv)+'/QL_['+str(mu)+'_'+str(lamda)+'_'+str(expansion_terms)+'].indri'
                command_input= r'"C:\Users\nroy0\Downloads\Thesis\Lemur\bin\IndriRunQuery.exe" '+str(queryTrainFile)+' -index=Vol45/index -count=1000 -trecFormat=true -queryOffset=-1 -rule=method:dirichlet,mu:' +str(mu)+' -stemmer.name=krovetz > C:/Users/nroy0/Downloads/Thesis/trec_eval.9.0/original/global100/'+str(outputFile) 
                args = shlex.split(command_input)
                p = subprocess.run(args, stderr=subprocess.STDOUT,shell=True)
                endTime = time.time()
                t_sec = round(endTime - startTime)
                (t_min, t_sec) = divmod(t_sec,60)
                print("Mu {} complete, time take {} min and {} sec".format(mu, t_min, t_sec ))
        print("Cross Validation {} Complete".format(cv))

Query File for Lamda 0.3 and expansion terms 50 created, time taken 8 minutes and 49 seconds
Mu 500 complete, time take 0 min and 25 sec
Mu 750 complete, time take 0 min and 23 sec
Mu 1000 complete, time take 0 min and 23 sec
Mu 1500 complete, time take 0 min and 23 sec
Mu 2000 complete, time take 0 min and 23 sec
Query File for Lamda 0.45 and expansion terms 50 created, time taken 8 minutes and 45 seconds
Mu 500 complete, time take 0 min and 23 sec
Mu 750 complete, time take 0 min and 23 sec
Mu 1000 complete, time take 0 min and 23 sec
Mu 1500 complete, time take 0 min and 23 sec
Mu 2000 complete, time take 0 min and 23 sec
Query File for Lamda 0.6 and expansion terms 50 created, time taken 8 minutes and 31 seconds
Mu 500 complete, time take 0 min and 22 sec
Mu 750 complete, time take 0 min and 23 sec
Mu 1000 complete, time take 0 min and 21 sec
Mu 1500 complete, time take 0 min and 23 sec
Mu 2000 complete, time take 0 min and 21 sec
Query File for Lamda 0.75 and expansion terms 50 cr

Mu 750 complete, time take 0 min and 24 sec
Mu 1000 complete, time take 0 min and 23 sec
Mu 1500 complete, time take 0 min and 22 sec
Mu 2000 complete, time take 0 min and 22 sec
Query File for Lamda 0.6 and expansion terms 50 created, time taken 8 minutes and 28 seconds
Mu 500 complete, time take 0 min and 23 sec
Mu 750 complete, time take 0 min and 23 sec
Mu 1000 complete, time take 0 min and 22 sec
Mu 1500 complete, time take 0 min and 22 sec
Mu 2000 complete, time take 0 min and 22 sec
Query File for Lamda 0.75 and expansion terms 50 created, time taken 8 minutes and 30 seconds
Mu 500 complete, time take 0 min and 22 sec
Mu 750 complete, time take 0 min and 22 sec
Mu 1000 complete, time take 0 min and 22 sec
Mu 1500 complete, time take 0 min and 21 sec
Mu 2000 complete, time take 0 min and 23 sec
Cross Validation 3 Complete
Query File for Lamda 0.3 and expansion terms 100 created, time taken 8 minutes and 15 seconds
Mu 500 complete, time take 0 min and 42 sec
Mu 750 complete, time 

Mu 2000 complete, time take 0 min and 23 sec
Query File for Lamda 0.75 and expansion terms 50 created, time taken 8 minutes and 14 seconds
Mu 500 complete, time take 0 min and 23 sec
Mu 750 complete, time take 0 min and 23 sec
Mu 1000 complete, time take 0 min and 23 sec
Mu 1500 complete, time take 0 min and 23 sec
Mu 2000 complete, time take 0 min and 23 sec
Cross Validation 5 Complete
Query File for Lamda 0.3 and expansion terms 100 created, time taken 8 minutes and 16 seconds
Mu 500 complete, time take 0 min and 41 sec
Mu 750 complete, time take 0 min and 41 sec
Mu 1000 complete, time take 0 min and 41 sec
Mu 1500 complete, time take 0 min and 41 sec
Mu 2000 complete, time take 0 min and 41 sec
Query File for Lamda 0.45 and expansion terms 100 created, time taken 8 minutes and 14 seconds
Mu 500 complete, time take 0 min and 41 sec
Mu 750 complete, time take 0 min and 41 sec
Mu 1000 complete, time take 0 min and 41 sec
Mu 1500 complete, time take 0 min and 41 sec
Mu 2000 complete, ti

In [29]:

path = r'C:\Users\nroy0\Downloads\Thesis\trec_eval.9.0\original\global100\5'
filenames = glob.glob(path + "/*.csv")
dfs = []
count = 0
for filename in filenames:
    match = re.search(r'\w\w\_\[\d\.?\d*\_\d\.?\d*\_\d+\]', filename)
    count = count + 1
#     print(count)
    colname = match.group()
    df = pd.read_table(filename,sep='\t', index_col=None, header=None, usecols=[2], skiprows =1, names =  [colname] , quoting=csv.QUOTE_NONE)
    dfs.append(df)
resultQL = pd.concat(dfs, axis=1, ignore_index = False)

resultQL['index']=[ 'num_q',
'num_ret',
'num_rel',
'num_rel_ret',
'map',
'gm_map',
'Rprec',
'bpref',
'recip_rank',
'iprec_at_recall_0.00',
'iprec_at_recall_0.10',
'iprec_at_recall_0.20',
'iprec_at_recall_0.30',
'iprec_at_recall_0.40',
'iprec_at_recall_0.50',
'iprec_at_recall_0.60',
'iprec_at_recall_0.70',
'iprec_at_recall_0.80',
'iprec_at_recall_0.90',
'iprec_at_recall_1.00',
'P_5',
'P_10',
'P_15',
'P_20',
'P_30',
'P_100',
'P_200',
'P_500',
'P_1000',
'recall_5',
'recall_10',
'recall_15',
'recall_20',
'recall_30',
'recall_100',
'recall_200',
'recall_500',
'recall_1000',
'infAP',
'gm_bpref',
'Rprec_mult_0.20',
'Rprec_mult_0.40',
'Rprec_mult_0.60',
'Rprec_mult_0.80',
'Rprec_mult_1.00',
'Rprec_mult_1.20',
'Rprec_mult_1.40',
'Rprec_mult_1.60',
'Rprec_mult_1.80',
'Rprec_mult_2.00',
'utility',
'11pt_avg',
'binG',
'G',
'ndcg',
'ndcg_rel',
'Rndcg',
'ndcg_cut_5',
'ndcg_cut_10',
'ndcg_cut_15',
'ndcg_cut_20',
'ndcg_cut_30',
'ndcg_cut_100',
'ndcg_cut_200',
'ndcg_cut_500',
'ndcg_cut_1000',
'map_cut_5',
'map_cut_10',
'map_cut_15',
'map_cut_20',
'map_cut_30',
'map_cut_100',
'map_cut_200',
'map_cut_500',
'map_cut_1000',
'relative_P_5',
'relative_P_10',
'relative_P_15',
'relative_P_20',
'relative_P_30',
'relative_P_100',
'relative_P_200',
'relative_P_500',
'relative_P_1000',
'success_1',
'success_5',
'success_10',
'set_P',
'set_relative_P',
'set_recall',
'set_map',
'set_F',
'num_nonrel_judged_ret']
resultQL = resultQL.set_index('index')
resultQL

Unnamed: 0_level_0,QL_[1000_0.3_100],QL_[1000_0.3_200],QL_[1000_0.3_50],QL_[1000_0.45_100],QL_[1000_0.45_200],QL_[1000_0.45_50],QL_[1000_0.6_100],QL_[1000_0.6_200],QL_[1000_0.6_50],QL_[1000_0.75_100],...,QL_[750_0.3_50],QL_[750_0.45_100],QL_[750_0.45_200],QL_[750_0.45_50],QL_[750_0.6_100],QL_[750_0.6_200],QL_[750_0.6_50],QL_[750_0.75_100],QL_[750_0.75_200],QL_[750_0.75_50]
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
num_q,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,...,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000,199.0000
num_ret,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,...,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000,192035.0000
num_rel,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,...,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000,13369.0000
num_rel_ret,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,...,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000,7909.0000
map,0.2423,0.2472,0.2329,0.2556,0.2562,0.2532,0.2590,0.2577,0.2586,0.2591,...,0.2313,0.2538,0.2536,0.2518,0.2580,0.2568,0.2583,0.2578,0.2572,0.2587
gm_map,0.1416,0.1443,0.1339,0.1514,0.1510,0.1505,0.1525,0.1509,0.1529,0.1514,...,0.1318,0.1486,0.1479,0.1479,0.1511,0.1497,0.1519,0.1497,0.1489,0.1508
Rprec,0.2866,0.2915,0.2793,0.2978,0.3000,0.2981,0.3034,0.3024,0.3032,0.3017,...,0.2758,0.2964,0.2960,0.2968,0.3034,0.3007,0.3027,0.3013,0.3012,0.3027
bpref,0.2530,0.2559,0.2454,0.2638,0.2638,0.2628,0.2666,0.2656,0.2664,0.2664,...,0.2447,0.2618,0.2610,0.2620,0.2661,0.2647,0.2667,0.2658,0.2650,0.2669
recip_rank,0.6962,0.6916,0.6435,0.7155,0.7130,0.7022,0.7154,0.7038,0.7106,0.7071,...,0.6469,0.7101,0.7039,0.6813,0.7121,0.7009,0.7047,0.7107,0.7065,0.7020
iprec_at_recall_0.00,0.7318,0.7324,0.6923,0.7587,0.7531,0.7491,0.7538,0.7484,0.7500,0.7538,...,0.6921,0.7525,0.7456,0.7266,0.7540,0.7442,0.7481,0.7517,0.7472,0.7446


Best Parameter on Each Cross Validation Training Fold

In [30]:
resultQL.loc['ndcg_cut_10'].idxmax(axis=1)

'QL_[1000_0.45_200]'

{1: array([  1,  18,  20,  25,  29,  34,  42,  45,  59,  67,  70,  75,  82,
         89,  92, 100, 106, 117, 121, 122, 124, 126, 129, 132, 136, 139,
        146, 152, 153, 161, 162, 179, 186, 188, 189, 191, 192, 204, 208,
        209, 212, 213, 214, 218, 221, 222, 238, 242, 247, 249]),
 2: array([  2,   6,  11,  13,  16,  22,  31,  35,  39,  41,  49,  50,  53,
         58,  62,  69,  77,  78,  84,  87,  97,  99, 104, 114, 123, 125,
        128, 131, 147, 148, 155, 156, 157, 159, 165, 171, 172, 173, 178,
        190, 197, 198, 210, 227, 228, 231, 235, 237, 239, 248]),
 3: array([  5,   7,  12,  19,  21,  23,  24,  28,  32,  33,  48,  55,  56,
         57,  60,  85,  96, 101, 107, 109, 111, 113, 115, 116, 119, 133,
        134, 135, 143, 150, 151, 158, 169, 170, 177, 183, 184, 185, 187,
        200, 206, 207, 215, 216, 217, 223, 225, 226, 233, 236]),
 4: array([  9,  10,  27,  36,  40,  46,  47,  54,  63,  64,  66,  71,  72,
         73,  80,  83,  88,  93,  94,  95, 102, 103, 105, 108, 

## Testing on each cross validation test fold with the respective best parameter from the training fold

In [31]:
# queryTestindex[cv]= test_index
cv =5
X_test = queryDF.loc[queryTestindex[cv]].reset_index()
for expansion_terms in [ 200]:
    for lamda in [ 0.45]: 
        startTime = time.time()
#             queryTrainFile = originalGlobal(expansion_terms, lamda, X_train, 'train', cv)

        queryTestFile = originalGlobal(expansion_terms, lamda, X_test, 'test', cv)
        endTime = time.time()
        t_sec = round(endTime - startTime)
        (t_min, t_sec) = divmod(t_sec,60)
        print('Query File for Lamda {} and expansion terms {} created, time taken {} minutes and {} seconds'.format(lamda, expansion_terms, t_min, t_sec))
        for mu in [ 1000]:
            startTime = time.time()
            outputFile = 'QL_['+str(mu)+'_'+str(lamda)+'_'+str(expansion_terms)+'_'+str(cv) + '].indri'
            command_input= r'"C:\Users\nroy0\Downloads\Thesis\Lemur\bin\IndriRunQuery.exe" '+str(queryTestFile)+' -index=Vol45/index -count=1000 -trecFormat=true -queryOffset=-1 -rule=method:dirichlet,mu:' +str(mu)+' -stemmer.name=krovetz > C:/Users/nroy0/Downloads/Thesis/trec_eval.9.0/original/global100/test/'+str(outputFile) 
            args = shlex.split(command_input)
            p = subprocess.run(args, stderr=subprocess.STDOUT,shell=True)
            endTime = time.time()
            t_sec = round(endTime - startTime)
            (t_min, t_sec) = divmod(t_sec,60)
            print("Mu {} complete, time take {} min and {} sec".format(mu, t_min, t_sec ))

Query File for Lamda 0.45 and expansion terms 200 created, time taken 2 minutes and 10 seconds
Mu 1000 complete, time take 0 min and 22 sec


In [40]:

path = r'C:\Users\nroy0\Downloads\Thesis\trec_eval.9.0\original\global100\test'
filenames = glob.glob(path + "/*.csv")
dfs = []
count = 0
for filename in filenames:
    match = re.search(r'\w\w\_\[\d\.?\d*\_\d\.?\d*\_\d+\_\d+\]', filename)
    count = count + 1
#     print(count)
    colname = match.group()
    df = pd.read_table(filename,sep='\t', index_col=None, header=None, usecols=[2], skiprows =1, names =  [colname] , quoting=csv.QUOTE_NONE)
    dfs.append(df)
resultQLTest = pd.concat(dfs, axis=1, ignore_index = False)

resultQLTest['index']=[ 'num_q',
'num_ret',
'num_rel',
'num_rel_ret',
'map',
'gm_map',
'Rprec',
'bpref',
'recip_rank',
'iprec_at_recall_0.00',
'iprec_at_recall_0.10',
'iprec_at_recall_0.20',
'iprec_at_recall_0.30',
'iprec_at_recall_0.40',
'iprec_at_recall_0.50',
'iprec_at_recall_0.60',
'iprec_at_recall_0.70',
'iprec_at_recall_0.80',
'iprec_at_recall_0.90',
'iprec_at_recall_1.00',
'P_5',
'P_10',
'P_15',
'P_20',
'P_30',
'P_100',
'P_200',
'P_500',
'P_1000',
'recall_5',
'recall_10',
'recall_15',
'recall_20',
'recall_30',
'recall_100',
'recall_200',
'recall_500',
'recall_1000',
'infAP',
'gm_bpref',
'Rprec_mult_0.20',
'Rprec_mult_0.40',
'Rprec_mult_0.60',
'Rprec_mult_0.80',
'Rprec_mult_1.00',
'Rprec_mult_1.20',
'Rprec_mult_1.40',
'Rprec_mult_1.60',
'Rprec_mult_1.80',
'Rprec_mult_2.00',
'utility',
'11pt_avg',
'binG',
'G',
'ndcg',
'ndcg_rel',
'Rndcg',
'ndcg_cut_5',
'ndcg_cut_10',
'ndcg_cut_15',
'ndcg_cut_20',
'ndcg_cut_30',
'ndcg_cut_100',
'ndcg_cut_200',
'ndcg_cut_500',
'ndcg_cut_1000',
'map_cut_5',
'map_cut_10',
'map_cut_15',
'map_cut_20',
'map_cut_30',
'map_cut_100',
'map_cut_200',
'map_cut_500',
'map_cut_1000',
'relative_P_5',
'relative_P_10',
'relative_P_15',
'relative_P_20',
'relative_P_30',
'relative_P_100',
'relative_P_200',
'relative_P_500',
'relative_P_1000',
'success_1',
'success_5',
'success_10',
'set_P',
'set_relative_P',
'set_recall',
'set_map',
'set_F',
'num_nonrel_judged_ret']
resultQLTest = resultQLTest.set_index('index')
# resultQLTest['ndcg_cut_10']

## Average of the 5-fold cross Validation Test Metric

In [42]:
np.mean((resultQLTest.loc['ndcg_cut_10']))

0.45021999999999995