In [None]:
import xml.etree.ElementTree as ET
import matplotlib as plt
import pandas as pd
import numpy as np
import trec
import pprint as pp

# https://wiki.python.org/moin/UsingPickle
import pickle

# Input files: the TREC topic definitions and the relevance-judgement file
# that is handed to the evaluator below.
Queries = "topics-2014_2015-summary.topics"
Qrels = "qrels-clinical_trials.txt"

# Read the whole topics file into memory and parse it from that string.
with open(Queries, 'r') as topics_file:
    txt = topics_file.read()
root = ET.fromstring(txt)

# Map the text of each topic's <NUM> element to the text of its <TITLE>
# element; the later cells use the title text as the query string.
cases = {
    topic.find('NUM').text: topic.find('TITLE').text
    for topic in root.iter('TOP')
}

# NOTE: this name shadows the builtin `eval`. It is kept because the later
# cells call `eval.eval(...)`, `eval.evalPR(...)` and read `eval.judged_docs`.
eval = trec.TrecEvaluation(cases, Qrels)

In [None]:
import xml.etree.ElementTree as ET
import tarfile

# Build the document collection from the ClinicalTrials.gov dump and cache it
# on disk so the retrieval cells can reload it without re-reading the tarball.
#
# Only trials whose <nct_id> appears in `eval.judged_docs` are kept. For each
# kept trial, the text of every child element of <brief_summary> becomes one
# entry in `docs`, with the trial id stored at the same position in `ids`.
ids = []
docs = []

# The context manager closes the archive even if parsing a member fails.
with tarfile.open("clinicaltrials.gov-16_dec_2015.tgz", "r:gz") as tar:
    for tarinfo in tar:
        # extractfile() returns None for non-regular members, so skip them
        # explicitly; entries of 500 bytes or less are ignored as before.
        if not tarinfo.isfile() or tarinfo.size <= 500:
            continue

        txt = tar.extractfile(tarinfo).read().decode("utf-8", "strict")
        root = ET.fromstring(txt)

        # Remember WHICH id was judged. The previous code appended whatever
        # `doc_id` happened to be left over from the scan loop, i.e. the last
        # <nct_id> in the file, not necessarily the one that matched.
        judged_id = next(
            (node.text for node in root.iter('nct_id')
             if node.text in eval.judged_docs),
            None,
        )
        if judged_id is None:
            continue

        for brief_summary in root.iter('brief_summary'):
            for child in brief_summary:
                # `child.text` is None for an element without text; such
                # entries (and whitespace-only ones) carry no terms to index.
                summary_text = (child.text or "").strip()
                if not summary_text:
                    continue
                docs.append(summary_text)
                ids.append(judged_id)

# Write both caches through context managers so the files are flushed and
# closed before the next cell reads them back.
with open("documents.bin", "wb") as docs_file:
    pickle.dump(docs, docs_file)
with open("doc_ids.bin", "wb") as ids_file:
    pickle.dump(ids, ids_file)


Cosine similarity over TF-IDF vectors

Unigram

In [None]:

# Cosine-similarity retrieval over unigram TF-IDF vectors.

# Reload the cached collection written by the indexing cell; the context
# managers make sure the file handles are closed.
with open("documents.bin", "rb") as docs_file:
    docs = pickle.load(docs_file)
with open("doc_ids.bin", "rb") as ids_file:
    ids = pickle.load(ids_file)

from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF index fitted on, and then applied to, the document texts.
index = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
index.fit(docs)

X = index.transform(docs)

# Accumulators for this model: the sum of the 11-point precision vectors and
# one value per query for each scalar metric.
avg_precision_11point = np.zeros(11)

p10_list = []
recall_list = []
ap_list = []
ndcg5_list = []
mrr_list = []

for caseid, query in cases.items():
    query_tfidf = index.transform([query])

    # pairwise_distances returns an (n_docs, 1) array. Flatten it so that the
    # 'score' column holds plain floats instead of 1-element arrays (which
    # would give an object-dtype column).
    doc_scores = (1 - pairwise_distances(X, query_tfidf, metric='cosine')).ravel()

    results = pd.DataFrame({'_id': ids, 'score': doc_scores})
    results_ord = results.sort_values(by=['score'], ascending = False)

    [p10, recall, ap, ndcg5, mrr] = eval.eval(results_ord, caseid)
    [precision_11point, recall_11point, total_relv_ret] = eval.evalPR(results_ord, caseid)

    # Queries for which evalPR returned an empty recall vector add nothing to
    # the summed precision curve.
    if (np.shape(recall_11point) != (0,)):
        avg_precision_11point = avg_precision_11point + precision_11point

    p10_list.append(p10)
    recall_list.append(recall)
    ap_list.append(ap)
    ndcg5_list.append(ndcg5)
    mrr_list.append(mrr)

# Mean of each metric over all queries (left disabled, as in the original cell):
# print(np.mean(p10_list))
# print(np.mean(recall_list))
# print(np.mean(ap_list))
# print(np.mean(ndcg5_list))
# print(np.mean(mrr_list))

Bigram

In [None]:

# Cosine-similarity retrieval over bigram TF-IDF vectors.

# Reload the cached collection written by the indexing cell; the context
# managers make sure the file handles are closed.
with open("documents.bin", "rb") as docs_file:
    docs = pickle.load(docs_file)
with open("doc_ids.bin", "rb") as ids_file:
    ids = pickle.load(ids_file)

from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram TF-IDF index fitted on, and then applied to, the document texts.
index = TfidfVectorizer(ngram_range=(2,2), analyzer='word', stop_words = None)
index.fit(docs)

X = index.transform(docs)

# Accumulators for this model: the sum of the 11-point precision vectors and
# one value per query for each scalar metric.
avg_precision_11point = np.zeros(11)

p10_list = []
recall_list = []
ap_list = []
ndcg5_list = []
mrr_list = []

for caseid, query in cases.items():
    query_tfidf = index.transform([query])

    # pairwise_distances returns an (n_docs, 1) array. Flatten it so that the
    # 'score' column holds plain floats instead of 1-element arrays (which
    # would give an object-dtype column).
    doc_scores = (1 - pairwise_distances(X, query_tfidf, metric='cosine')).ravel()

    results = pd.DataFrame({'_id': ids, 'score': doc_scores})
    results_ord = results.sort_values(by=['score'], ascending = False)

    [p10, recall, ap, ndcg5, mrr] = eval.eval(results_ord, caseid)
    [precision_11point, recall_11point, total_relv_ret] = eval.evalPR(results_ord, caseid)

    # Queries for which evalPR returned an empty recall vector add nothing to
    # the summed precision curve.
    if (np.shape(recall_11point) != (0,)):
        avg_precision_11point = avg_precision_11point + precision_11point

    p10_list.append(p10)
    recall_list.append(recall)
    ap_list.append(ap)
    ndcg5_list.append(ndcg5)
    mrr_list.append(mrr)

# Mean of each metric over all queries (left disabled, as in the original cell):
# print(np.mean(p10_list))
# print(np.mean(recall_list))
# print(np.mean(ap_list))
# print(np.mean(ndcg5_list))
# print(np.mean(mrr_list))



LMJM (language model with Jelinek-Mercer smoothing)

Unigram

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter


# Query-likelihood retrieval with a unigram language model that mixes the
# document model p(t|md) with the collection model p(t|mc):
#     score(d, q) = sum over query terms t of  c(t, q) * log(lmbd*p(t|md) + (1-lmbd)*p(t|mc))

# Reload the cached collection; the context managers close the file handles.
with open("documents.bin", "rb") as docs_file:
    docs = pickle.load(docs_file)
with open("doc_ids.bin", "rb") as ids_file:
    ids = pickle.load(ids_file)


index = CountVectorizer(ngram_range=(1, 1), analyzer='word')
index.fit(docs)

# Raw term counts, one row per document.
corpus_cv = index.transform(docs)


# Weight of the document model in the mixture.
# NOTE(review): with lmbd = 1 the collection model gets zero weight, so a query
# term that is absent from a document contributes log(0) = -inf to its score.
# Confirm this value is intended.
lmbd = 1

# Documents with no indexed terms have a row sum of 0; dividing by 1 instead
# gives them p(t|md) = 0 for every term rather than 0/0 = NaN.
doc_lengths = np.maximum(np.sum(corpus_cv, axis=1), 1)

prob_word_docs = corpus_cv/doc_lengths  # p(t|md)
prob_word_corpus = np.sum(corpus_cv, axis=0)/np.sum(corpus_cv)  # p(t|mc)
log_mixture = np.log(lmbd*prob_word_docs + (1-lmbd)*prob_word_corpus)

# Start this model's metrics from scratch. Without this the values below were
# added on top of whatever the previously executed retrieval cell had left in
# these names, so the averages and the final plot mixed several models.
avg_precision_11point = np.zeros(11)

p10_list = []
recall_list = []
ap_list = []
ndcg5_list = []
mrr_list = []

for caseid, query in cases.items():
    query_cv = index.transform([query])

    # (n_docs x vocab) times (vocab x 1): one log-likelihood per document.
    total = log_mixture*query_cv.T

    # Flatten to a 1-D float array so the 'score' column is numeric instead of
    # holding one 1x1 matrix per row.
    scores = np.asarray(total).ravel()

    results = pd.DataFrame({'_id': ids, 'score': scores})
    results_ord = results.sort_values(by=['score'], ascending=False)

    [p10, recall, ap, ndcg5, mrr] = eval.eval(results_ord, caseid)
    [precision_11point, recall_11point,
        total_relv_ret] = eval.evalPR(results_ord, caseid)

    # Queries for which evalPR returned an empty recall vector add nothing to
    # the summed precision curve.
    if (np.shape(recall_11point) != (0,)):
        avg_precision_11point = avg_precision_11point + precision_11point

    p10_list.append(p10)
    recall_list.append(recall)
    ap_list.append(ap)
    ndcg5_list.append(ndcg5)
    mrr_list.append(mrr)

# Mean of each metric over all queries (left disabled, as in the original cell):
# print(np.mean(p10_list))
# print(np.mean(recall_list))
# print(np.mean(ap_list))
# print(np.mean(ndcg5_list))
# print(np.mean(mrr_list))



Bigram

In [None]:

import matplotlib as plt
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter
# Query-likelihood retrieval with a bigram language model that mixes the
# document model p(t|md) with the collection model p(t|mc):
#     score(d, q) = sum over query bigrams t of  c(t, q) * log(lmbd*p(t|md) + (1-lmbd)*p(t|mc))

# Reload the cached collection; the context managers close the file handles.
with open("documents.bin", "rb") as docs_file:
    docs = pickle.load(docs_file)
with open("doc_ids.bin", "rb") as ids_file:
    ids = pickle.load(ids_file)


index = CountVectorizer(ngram_range=(2, 2), analyzer='word')
index.fit(docs)

# Raw bigram counts, one row per document.
corpus_cv = index.transform(docs)

# Weight of the document model in the mixture.
# NOTE(review): with lmbd = 1 the collection model gets zero weight, so a query
# bigram that is absent from a document contributes log(0) = -inf to its score.
# Confirm this value is intended.
lmbd = 1

# A document with fewer than two tokens has no bigrams and a row sum of 0;
# dividing by 1 instead gives it p(t|md) = 0 rather than 0/0 = NaN.
doc_lengths = np.maximum(np.sum(corpus_cv, axis=1), 1)

prob_word_docs = corpus_cv/doc_lengths  # p(t|md)
prob_word_corpus = np.sum(corpus_cv, axis=0)/np.sum(corpus_cv)  # p(t|mc)
log_mixture = np.log(lmbd*prob_word_docs + (1-lmbd)*prob_word_corpus)

# Start this model's metrics from scratch. Without this the values below were
# added on top of whatever the previously executed retrieval cell had left in
# these names, so the averages and the final plot mixed several models.
avg_precision_11point = np.zeros(11)

p10_list = []
recall_list = []
ap_list = []
ndcg5_list = []
mrr_list = []

for caseid, query in cases.items():
    query_cv = index.transform([query])

    # (n_docs x vocab) times (vocab x 1): one log-likelihood per document.
    total = log_mixture*query_cv.T

    # Flatten to a 1-D float array so the 'score' column is numeric instead of
    # holding one 1x1 matrix per row.
    scores = np.asarray(total).ravel()

    results = pd.DataFrame({'_id': ids, 'score': scores})
    results_ord = results.sort_values(by=['score'], ascending=False)

    [p10, recall, ap, ndcg5, mrr] = eval.eval(results_ord, caseid)
    [precision_11point, recall_11point,
        total_relv_ret] = eval.evalPR(results_ord, caseid)

    # Queries for which evalPR returned an empty recall vector add nothing to
    # the summed precision curve.
    if (np.shape(recall_11point) != (0,)):
        avg_precision_11point = avg_precision_11point + precision_11point

    p10_list.append(p10)
    recall_list.append(recall)
    ap_list.append(ap)
    ndcg5_list.append(ndcg5)
    mrr_list.append(mrr)

# Mean of each metric over all queries (left disabled, as in the original cell):
# print(np.mean(p10_list))
# print(np.mean(recall_list))
# print(np.mean(ap_list))
# print(np.mean(ndcg5_list))
# print(np.mean(mrr_list))

In [None]:
import matplotlib.pyplot as plt
# Plot the precision curve of the most recently executed retrieval cell:
# the summed 11-point precision vector divided by the number of queries.
#
# `recall_11point` is whatever the LAST loop iteration left behind, and the
# retrieval loops themselves expect that it can be empty. Use it only when it
# lines up with the precision vector; otherwise fall back to evenly spaced
# levels so that plt.plot does not fail on mismatched lengths.
recall_levels = np.asarray(recall_11point)
if recall_levels.shape != avg_precision_11point.shape:
    # presumably evalPR reports the standard levels 0.0, 0.1, ..., 1.0 —
    # TODO confirm against trec.TrecEvaluation.evalPR
    recall_levels = np.linspace(0, 1, len(avg_precision_11point))

plt.plot(recall_levels, avg_precision_11point/len(cases))
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("11-point precision-recall curve (averaged over queries)")
plt.show()
