In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import warnings
import math
import time
import pickle
import codecs

In [2]:
# import os
# for file in os.listdir('../champion_folder'):
#     c_list = np.load(os.path.join('../champion_folder', file), allow_pickle=True)
#     print(c_list)
#     print(len(c_list[0][1]))

In [3]:
a=np.arange(10)
a[-5:]

array([5, 6, 7, 8, 9])

## Loading preprocessed document wise vocab

In [4]:
# vocab_doc_wise_tokenization = np.load('vocab_doc_wise_tokenization.npy', allow_pickle='TRUE').item()
vocab_doc_wise_stemming = np.load('vocab_doc_wise_stemming.npy', allow_pickle='TRUE').item()
# print(vocab_doc_wise_tokenization)
# print(vocab_doc_wise_stemming)

In [5]:
print(len(vocab_doc_wise_stemming))

5


In [6]:
def get_stemmer(stemmer_type):
    if(stemmer_type=='porter_stemmer'): stemmer = nltk.PorterStemmer()
    elif(stemmer_type=='snowball_stemmer'): stemmer = nltk.SnowballStemmer(language = 'english')
    return stemmer

In [7]:
#Choosing Snowball stemmer (advanced version of porter_stemmer)
stemmer = get_stemmer('snowball_stemmer')

## Boolean Retrieval System

In [8]:
#Creating Node which has three sub-nodes containing document ID, freq of word in that docID 
#and next to link with next docID
class Node:
    def __init__(self, docID, freq=None):
        self.docID = docID
        self.freq = freq
        self.next = None

In [9]:
#Creating word freq for each doc
def get_word_freq(vocab):
    word_freq={}
    for word in vocab:
        if word in word_freq.keys():
            word_freq[word]+=1
        else: word_freq[word]=1
    return word_freq

### Creating Postings list

In [10]:
postings_list = {}
doc_index = {}
ind=0
doc_lengths={}
for doc_id, vocab in vocab_doc_wise_stemming.items():
    word_freq = get_word_freq(vocab)
    for word, freq in word_freq.items():
        if word in postings_list.keys():
            firstNode = postings_list[word]
            while firstNode.next is not None:
                firstNode = firstNode.next
            firstNode.next = Node(ind, freq)
        else:
            postings_list[word] = Node(ind, freq)
    doc_index[ind] = doc_id
    doc_lengths[ind] = len(vocab)
    ind+=1

In [11]:
# np.save("postings_list.npy", postings_list)
# filehandler = open(b"../postings_list.pkl","wb")
# pickle.dump(postings_list,filehandler)

In [12]:
# file = open("postings_list.pkl",'rb')
# postings_list = pickle.load(file)
# file.close()
# postings_list

In [13]:
print(doc_lengths)

{0: 623, 1: 4948, 2: 13006, 3: 4788, 4: 2307}


In [40]:
# for word, node in postings_list.items():
#     print(word, end='->')
#     while node is not None:
#         print(node.docID, end='->')
#         print(node.freq, end=' ')
#         node=node.next
#     print('\n')

### Query preprocessing

Steps followed
1. Tokenize the query
2. Convert infix query expression to postfix query expression using stack approach
        a. Check if the given expression is balanced or not
        b. Check is there any extra parenthesis in the expression
3. Processing two operator only in the query **\&**(and) , **\|** (or) and **\~**(negation) and giving higeher precedence to the former
4. Using **snowball_stemmer** as a stemmer algorithm to find the stem word in the given query
5. Generate binary vector based on document size and consider negation sign as well while processing
6. Find document which contains the query word using **find_matched_doc** function and return a binary vector that shows which document contains that word
7. Remove stop words from query

In [49]:
def isnonASCII(token):
    if token in ['.','+','*','?','[','/', '//','\\','^','%',']', '$','(',')','{','}','=', '!', '|',':','-', ',', ';']:
        return True
    return False

In [15]:
def is_operator(token):
    if token in ['&' , '|']:
        return True
    return False

#Precedence of operators
def precedence_oper(token):
    if token=='&': return 2
    elif token=='|': return 1
    else: return -1

def get_postfix_list(tokens):
    stack = []
    postfix_list = []
    for token in tokens:
        #If token is left small bracket '('
        if token == '(': stack.append(token)
        elif token == ')':
            while(len(stack)>0 and stack[-1]!='('):
                postfix_list.append(stack.pop())
            if len(stack)==0 and token==')':
                raise ValueError('Either unnecessary parenthesis or Not a balanced query')
            stack.pop()
            if len(stack)>0 and stack[-1] == '(':
                raise ValueError('Either unnecessary parenthesis or Not a balanced query')
        elif is_operator(token):
            while(len(stack)>0 and precedence_oper(token) <= precedence_oper(stack[-1])):
                postfix_list.append(stack.pop())
            stack.append(token)
        else: 
            postfix_list.append(token)
    while len(stack)>0:
        postfix_list.append(stack.pop())
    return postfix_list

In [51]:
def query_preprocessing(q):
    #Remove stop words from query
    stop_words = set(stopwords.words('english'))
    #Tokenize query first
    q_tokens = word_tokenize(q)
    updated_q_tokens=[]
    connecting_words = {'and':'&','AND':'&', 'or':'|','OR':'|', 'not':'~','NOT':'~'}
    for t, token in enumerate(q_tokens):
        if token in list(connecting_words.keys()):
            if token=='not' or token=='NOT':
                if t+1>=len(q_tokens):
                    #raise ValueError("Invalid query!")
                    print("Invalid Query!!")
                    continue
                else:
                    updated_q_tokens.append('~'+q_tokens[t+1])
                    q_tokens.remove(q_tokens[t+1])
            else: updated_q_tokens.append(connecting_words[token])
        else:
            updated_q_tokens.append(token)
#     print(updated_q_tokens)
    new_q_tokens = [stemmer.stem(word.lower()) for word in updated_q_tokens if word not in stop_words and not isnonASCII(word) and len(word)>1]
#     print(new_q_tokens)
    #Convert this infix list into postfix list to process operator in right way
    q_tokens = get_postfix_list(new_q_tokens)
    return q_tokens

In [53]:
def get_binary_vec(token, postings_list, doc_size):
    word_embedd = np.zeros(doc_size, dtype=int)
    vocab = postings_list.keys()
    negation = False
    token_not_found=0
    if token[0]=='~':
        negation=True
        token=token[1:]
    if token not in vocab:
#         print("'"+token + "' was not found in the corpus")
        token_not_found=1
        return word_embedd, token_not_found
    node = postings_list[token]
    while node is not None:
        word_embedd[node.docID] = 1
        node=node.next
    if negation:
        word_embedd = np.invert(word_embedd)
    return word_embedd, token_not_found

In [55]:
def find_matched_doc(query_tokens, postings_list, doc_index, top_k):
    
    word_embedd_stack = []
    doc_size = len(doc_index)
    token_not_found=[]
    for token in query_tokens:
        if is_operator(token):
            if(len(word_embedd_stack)<2): 
                raise ValueError("Query is not correct or use more stopping words")
            first_operand = word_embedd_stack.pop()
            second_operand = word_embedd_stack.pop()
            
            if token=='&': word_embedd_stack.append(first_operand & second_operand)
            elif token=='|': word_embedd_stack.append(first_operand | second_operand)
            else:
                raise ValueError('Can\'t process this operator: ', token)
        else:
            st = stemmer.stem(token)
            
            token_embedd, flag = get_binary_vec(token, postings_list, doc_size)
            if(flag): token_not_found.append(token)
            word_embedd_stack.append(token_embedd)
    matched_doc = [doc_index[docID] for docID in np.where(word_embedd_stack[-1])[0]]
    return matched_doc, token_not_found

In [28]:
def get_score(mapped_doc, ground_truth):
    if ground_truth in mapped_doc:
        return 1
    else: return 0

In [44]:
b_query_file='boolean_query.txt'
b_query_ground_truth='boolean query answer.txt'
with codecs.open(b_query_file, mode='r', encoding='utf-8') as input_file:
    next(input_file)
    b_queries={}
    for line in input_file:
        query = line.strip().split('\t')
        b_queries[query[0]] = query[1]
print(b_queries)
with codecs.open(b_query_ground_truth, mode='r', encoding='utf-8') as input_file:
    next(input_file)
    b_queries_o_answer={}
    for line in input_file:
        query = line.strip().split('\t')
        b_queries_o_answer[query[0]] = query[1]
print(b_queries_o_answer)

{'Q02': 'Maze generation algorithm or conditional programming', 'Q03': 'Doubly linked list or stack not algorithms', 'Q04': 'Tasuku Honjo and his contriution to the society', 'Q05': 'Yelizaveta Pantueva or other Ukrainian list of young mothers', 'Q06': 'Project MKUltra and humans', 'Q07': 'Bachelor of Arts degree and opportunities available across the world', 'Q08': 'Confine itself to a single revolution of the sun, or but slightly to exceed this limit.', 'Q09': 'A rich widow lived with her daughter and her stepdaughter.', 'Q10': 'What is Overcompleteness in mathematics?', 'Q11': "The two descendants of Raghu then took hold of her feet; but remembering Gautama's words, she on her part took hold of theirs. And with a collected mind she gave them water for their feet as well as Arghya, and extended to them the rites of hospitality.", 'Q12': 'The sweet-speeched Subhadra also, saluting him in return and worshipping him repeatedly with bent head, told him all that she wished to be conveyed 

In [57]:
queries_list = b_queries ## ['person but or technology NOT', 'man or indiashow', 'daughter']
top_k=5
end_time=0
score=0
print("Top {} documents retrieved".format(top_k))
for query_idx, query in queries_list.items():
    st_time = time.time()
    query_tokens = query_preprocessing(query)
    matched_doc, token_not_found = find_matched_doc(query_tokens, postings_list, doc_index, top_k)
    end_time+=time.time()-st_time
    print("Ground truth doc: ", b_queries_o_answer[query_idx])
    score+=get_score(matched_doc[:top_k], b_queries_o_answer[query_idx])
    print(matched_doc[:top_k])
#     print("These Tokens not found in the corpus: ", token_not_found)
print("Final score: ", score)
print("Avg time takes to run Boolean Retrieval system for one query: {:.5f} sec".format(end_time/len(queries_list)))

Top 5 documents retrieved
Ground truth doc:  C00505
['D00585']
Ground truth doc:  C00515
[]
Ground truth doc:  D00003
['D00585', 'T00755']
Ground truth doc:  D00263
['P_386', 'D00585', 'L00119']
Ground truth doc:  D00022
['D00585']
Ground truth doc:  L00003
['T00921', 'D00585', 'L00119']
Ground truth doc:  L00091
['T00921', 'D00585', 'L00119', 'T00755']
Ground truth doc:  L00289
[]
Ground truth doc:  M00256
['T00921']
Ground truth doc:  P01049
['D00585']
Ground truth doc:  P_238
['T00921', 'D00585', 'L00119', 'T00755']
Ground truth doc:  R00135
[]
Ground truth doc:  R00147
[]
Ground truth doc:  R00285
['T00921']
Ground truth doc:  R00423
[]
Ground truth doc:  S00166
['D00585']
Ground truth doc:  S00267
[]
Ground truth doc:  T00169
['D00585']
Ground truth doc:  T00510
[]
Final score:  0
Avg time takes to run Boolean Retrieval system for one query: 0.00237 sec


In [20]:
vocab_doc_wise_stemming.keys()

dict_keys(['P_386', 'T00921', 'D00585', 'L00119', 'T00755'])

In [21]:
list(postings_list.keys()).index('return')

11

## Tf-Idf Retrieval System

Steps to consider
1. Tokenize the query first and remove the stopwords from the query
2. Find the query vector where each dimension represents freq. of token present in the query
3. For fast query processing, find the tf-idf for those token which are present in the query only
4. Return top-k documents only.

In [22]:
query_file='query.txt'
query_ground_truth='query answer.txt'
with codecs.open(query_file, mode='r', encoding='utf-8') as input_file:
    next(input_file)
    queries={}
    for line in input_file:
        query = line.strip().split('\t')
        queries[query[0]] = query[1]
print(queries)
with codecs.open(query_ground_truth, mode='r', encoding='utf-8') as input_file:
    next(input_file)
    queries_o_answer={}
    for line in input_file:
        query = line.strip().split('\t')
        queries_o_answer[query[0]] = query[1]
print(queries_o_answer)

{'Q02': 'Define Static and Dynamic polymorphism in c++?', 'Q03': 'Computer software has to be "loaded" into the computer\'s storage (such as the hard drive or memory). Once the software has loaded, the computer is able to execute the software. This involves passing instructions from the application software, through the system software, to the hardware which ultimately receives the instruction as machine code.', 'Q04': 'Who was Tasuku Honjo? What was his contriution to the society?', 'Q05': 'How harmful is coronavirus for animals? What are the symptoms of this virus on animals? How can we protect animals from this?', 'Q06': 'What is Project MKUltra? How is it related with humans?', 'Q07': 'What is the scope of Bachelor of Arts degree? What are the opportunities available across the world?', 'Q08': 'Epic poetry agrees with Tragedy in so far as it is an imitation in verse of characters of a higher type. They differ in that Epic poetry admits but one kind of meter and is narrative in form

In [24]:
def get_doc_tf_idf(doc, postings_list, doc_doc_index, doc_size):
    doc_vector = np.zeros(len(postings_list.keys()), dtype=int)
    for token in doc:
        if token not in postings_list.keys():
            print("'"+token + "' was not found in corpus")
        else:
            node = postings_list[token]
            get_docID_freq = 0
            dft=0
            while node is not None:
                if node.docID == doc_doc_index:
                    get_docID_freq = node.freq
                node=node.next
                dft+=1
            idx = list(postings_list.keys()).index(token)
            doc_vector[idx]=get_docID_freq * math.log(doc_size / dft)
    doc_len = np.linalg.norm(doc_vector)
    if doc_len!=0:
        doc_vector = doc_vector/doc_len
    return doc_vector

def get_query_vector(query_tokens, postings_list):
    query_vector = np.zeros(len(postings_list.keys()), dtype=int)
    for token in query_tokens:
        if token not in postings_list.keys():
            #print("'"+token + "' was not found in corpus")
            continue
        else:
            idx = list(postings_list.keys()).index(token)
            query_vector[idx] += 1
    return query_vector

def get_scoring_vec(tf_idf_matrix, doc_size):
    score_vector = np.zeros(doc_size, dtype=float)
    for _, vec in tf_idf_matrix.items():
        score_vector += vec
    return score_vector

def get_mapped_doc(score_vec, doc_index, top_k):
    doc_idx_mapping = np.arange(len(doc_index))
    get_matched_doc = [doc_index[docID] for score, docID in sorted(zip(score_vec, doc_idx_mapping), reverse=True)]
    return get_matched_doc[:top_k]

In [25]:
def get_champion_lists(vocab_doc_wise_stemming, postings_list, doc_index):
    champion_lists={token:[] for token in list(postings_list.keys())}
    doc_name=[]
    full_doc_tf_idf=[]
    for doc, doc_vocab in vocab_doc_wise_stemming.items():
        idx = list(doc_index.values()).index(doc)
        doc_tf_idf = get_doc_tf_idf(doc_vocab, postings_list, idx, len(doc_index))
        full_doc_tf_idf.append(doc_tf_idf)
        doc_name.append(doc)
    full_doc_tf_idf = np.array(full_doc_tf_idf)
#     print(doc_name)
    token_idx=0
    for token in list(champion_lists.keys()):
#         print(full_doc_tf_idf[:, token_idx])
#         print(doc_name)
        get_doc_cos_sim_vec = [(score, d_name) for score, d_name in sorted(zip(full_doc_tf_idf[:, token_idx], doc_name), reverse=True)]
        champion_lists[token] = get_doc_cos_sim_vec
#         print(get_doc_cos_sim_vec)
        token_idx+=1
    return champion_lists

In [26]:
def top_R_ranked_doc(query_tokens, champion_lists, top_r):
    top_R_doc_vec = []
    token_not_found=[]
    for token in query_tokens:
        if token not in champion_lists.keys():
            ##print("'"+token + "' was not found in corpus")
            token_not_found.append(token)
        else:
            doc_vec = champion_lists[token][:top_r]
            top_R_doc_vec = sum([], top_R_doc_vec+[score_doc_tuple for score_doc_tuple in doc_vec])
#     print(top_R_doc_vec)
    top_R_doc_vec = sorted(top_R_doc_vec, key=lambda x: x[0], reverse=True)
#     print(top_R_doc_vec)
    return top_R_doc_vec, token_not_found

In [27]:
def get_sim_score(query_tokens, V_q, vocab_doc_wise_stemming, postings_list, doc_index):
    doc_tf_idf_vector={}
    q_sim_score=[]
    q_sim_score_map_doc_name=[]
    for doc, doc_vocab in vocab_doc_wise_stemming.items():
        idx = list(doc_index.values()).index(doc)
        doc_tf_idf = get_doc_tf_idf(doc_vocab, query_tokens, postings_list, idx, len(doc_index))
#         doc_tf_idf_vector[doc] = doc_tf_idf
        
        v_d = doc_tf_idf
        if np.linalg.norm(V_q)==0 or np.linalg.norm(v_d)==0:
            cos_sim=0
        else: cos_sim = np.dot(V_q, v_d)/(np.linalg.norm(V_q) * np.linalg.norm(v_d))
        q_sim_score.append(cos_sim)
        q_sim_score_map_doc_name.append(doc)
#     print(q_sim_score)
#     print(q_sim_score_map_doc_name)
    return q_sim_score, q_sim_score_map_doc_name

In [29]:
champion_lists = get_champion_lists(vocab_doc_wise_stemming, postings_list, doc_index)
# champion_lists

In [30]:
# queries_list = ['person and technology and brahmana but not movie', 'man or indiashow', 
#                 'Treatment of otherwise healthy people is usually not needed',
#                'Soon after being discharged from the Army Laurents met ballerina Nora Kaye\
#                and the two became involved in an on again off again romantic relationship.']
queries_list = queries #list(queries.values())
top_k=5
top_r=10
end_time=0
print("Top {} documents retrieved".format(top_k))
score=0
for query_idx, query in queries_list.items():
    st_time = time.time()
    #Tokenize query first
    q_tokens = word_tokenize(query)
    #Remove stop words from query
    stop_words = set(stopwords.words('english'))
    new_q_tokens = [stemmer.stem(word.lower()) for word in q_tokens if word not in stop_words and not isnonASCII(word)]
#     print(new_q_tokens)
    V_q = get_query_vector(new_q_tokens, postings_list)
    top_r_doc_union, token_not_found = top_R_ranked_doc(new_q_tokens, champion_lists, top_r)
    mapped_doc=[]
    for score_vec_t in top_r_doc_union:
        if score_vec_t[1] not in mapped_doc:
            mapped_doc.append(score_vec_t[1])
    end_time+=time.time()-st_time
    print("Ground truth doc: ", queries_o_answer[query_idx])
    score+=get_score(mapped_doc[:top_k], queries_o_answer[query_idx])
    print("Mapped doc: ", mapped_doc[:top_k])
#     print("These Tokens not found in the corpus: ", token_not_found)
print("Final score: ", score)
print("Avg time takes to run TF-IDF Retrieval system for one query: {:.5f} sec".format(end_time/len(queries_list)))

Top 5 documents retrieved
Ground truth doc:  C00002
Mapped doc:  ['D00585', 'T00921', 'T00755', 'P_386', 'L00119']
Ground truth doc:  C00009
Mapped doc:  ['D00585', 'P_386', 'T00755', 'L00119', 'T00921']
Ground truth doc:  D00003
Mapped doc:  ['T00755', 'T00921', 'D00585', 'P_386', 'L00119']
Ground truth doc:  D00019
Mapped doc:  ['D00585', 'T00921', 'L00119', 'T00755', 'P_386']
Ground truth doc:  D00022
Mapped doc:  ['D00585', 'L00119', 'T00921', 'T00755', 'P_386']
Ground truth doc:  L00003
Mapped doc:  ['L00119', 'T00755', 'T00921', 'D00585', 'P_386']
Ground truth doc:  L00091
Mapped doc:  ['T00755', 'D00585', 'L00119', 'T00921', 'P_386']
Ground truth doc:  L00289
Mapped doc:  ['L00119', 'T00755', 'P_386', 'D00585', 'T00921']
Ground truth doc:  M00256
Mapped doc:  ['T00921', 'D00585', 'T00755', 'P_386', 'L00119']
Ground truth doc:  P01049
Mapped doc:  ['T00755', 'T00921', 'P_386', 'L00119', 'D00585']
Ground truth doc:  P_238
Mapped doc:  ['P_386', 'L00119', 'T00755', 'T00921', 'D0058

In [31]:
len(doc_index)

5

## BM25

These are the steps considered
1. Tokenize the query into tokens and remove the stop words and also remove if there's any non-ascii characters
2. Get local weight by modified term frequency formula $$\frac{(k_1+1)tf_d}{k_1(1-b+b\frac{L_d}{L_avg}) + tf_d}$$
3. Get global weight by inverse doc frequency as the priors aren't given by given formula $$\log \frac{n}{df_t}$$
4. Get RSVd score using below formula and based on this score, select top k documents $$RSVd = \sum_{\forall t \in q} \left(\log \frac{n}{df_t}\right) . \frac{(k_1+1)tf_d}{k_1(1-b+b\frac{L_d}{L_avg}) + tf_d}$$

In [32]:
def get_BM25(query_tokens, postings_list, doc_lengths, k1=1.2, b=0.75):
    #Each dimension corresponding to one document
    RSVd_vec = np.zeros(len(doc_lengths), dtype=float)
    token_not_found=[]
    for token in query_tokens:
        dft=0
        doc_vector = np.zeros(len(doc_lengths), dtype=int)
        if (token not in postings_list.keys()):
            #print("'"+token + "' was not found in corpus")
            token_not_found.append(token)
        else:
            node = postings_list[token]
            while node is not None:
                dft+=1
                doc_vector[node.docID]=node.freq
                node=node.next
        doc_vector = np.array(doc_vector, dtype=float)
#         print(doc_vector)
        if dft==0: 
            RSVd_vec += doc_vector
        else:
            #Get Local weight
            L_d = np.array(list(doc_lengths.values()), dtype=float)
            L_avg = np.mean(list(doc_lengths.values()))
            local_wt_num = (k1+1)*doc_vector
            local_wt_den = k1*(1 - b + (b/L_avg)*L_d) + doc_vector
            local_wt = np.divide(local_wt_num, local_wt_den)
#             print("local_wt: ", local_wt)
            #Get Global weight
            doc_size = len(doc_lengths)
            global_wt = math.log(doc_size / dft)
#             print("global_wt: ", global_wt)
#         print("RSVd: ", RSVd_vec)
        #Multiply local weigth with global weight
            RSVd_vec += local_wt*global_wt
    return RSVd_vec, token_not_found

In [33]:
queries_list = ['person and technology and brahmana but not movie', 'man or indiashow',
                'Treatment of otherwise healthy people is usually not needed',
               'Soon after being discharged from the Army Laurents met ballerina Nora Kaye\
               and the two became involved in an on again off again romantic relationship.']

queries_list = queries
top_k=5
end_time=0
print("Top {} documents retrieved".format(top_k))
k1 = np.random.uniform(1.2, 2.0)
b=0.75
score=0
for query_idx, query in queries_list.items():
    st_time=time.time()
    #Tokenize query first
    q_tokens = word_tokenize(query)
    #Remove stop words from query
    stop_words = set(stopwords.words('english'))
    new_q_tokens = [stemmer.stem(word.lower()) for word in q_tokens if word not in stop_words and not isnonASCII(word)]
    #print(new_q_tokens)
    RSVd_vec, token_not_found = get_BM25(new_q_tokens, postings_list, doc_lengths, k1, b)
#     print("Final RSVd_vec: ", RSVd_vec)
#     print(list(doc_index.values()))
    mapped_doc = [docName for score, docName in sorted(zip(RSVd_vec, list(doc_index.values())), reverse=True)]
    print("Ground truth doc: ", queries_o_answer[query_idx])
    score+=get_score(mapped_doc[:top_k], queries_o_answer[query_idx])
    end_time+=time.time()-st_time
    print("Mapped doc: ", mapped_doc[:top_k])
#     print("These Tokens not found in the corpus: ", token_not_found)
print("Final score: ", score)
print("Avg time takes to run BM25 Retrieval system for one query: {:.5f} sec".format(end_time/len(queries_list)))

Top 5 documents retrieved
Ground truth doc:  C00002
Mapped doc:  ['D00585', 'T00921', 'T00755', 'P_386', 'L00119']
Ground truth doc:  C00009
Mapped doc:  ['D00585', 'L00119', 'T00921', 'P_386', 'T00755']
Ground truth doc:  D00003
Mapped doc:  ['D00585', 'T00755', 'P_386', 'T00921', 'L00119']
Ground truth doc:  D00019
Mapped doc:  ['D00585', 'L00119', 'P_386', 'T00921', 'T00755']
Ground truth doc:  D00022
Mapped doc:  ['D00585', 'L00119', 'T00921', 'P_386', 'T00755']
Ground truth doc:  L00003
Mapped doc:  ['D00585', 'P_386', 'T00921', 'T00755', 'L00119']
Ground truth doc:  L00091
Mapped doc:  ['D00585', 'L00119', 'T00921', 'P_386', 'T00755']
Ground truth doc:  L00289
Mapped doc:  ['P_386', 'D00585', 'L00119', 'T00755', 'T00921']
Ground truth doc:  M00256
Mapped doc:  ['T00921', 'P_386', 'D00585', 'T00755', 'L00119']
Ground truth doc:  P01049
Mapped doc:  ['T00921', 'P_386', 'T00755', 'D00585', 'L00119']
Ground truth doc:  P_238
Mapped doc:  ['L00119', 'P_386', 'T00921', 'D00585', 'T0075