In [67]:
import xml.etree.ElementTree as ET
import matplotlib as plt
import pandas as pd
import numpy as np
import trec
import pprint as pp
import pickle

Queries = "topics-2014_2015-summary.topics"
Qrels = "qrels-clinical_trials.txt"
with open(Queries, 'r') as queries_reader:
    txt = queries_reader.read()

root = ET.fromstring(txt)

cases = {}
for query in root.iter('TOP'):
    q_num = query.find('NUM').text
    q_title = query.find('TITLE').text
    cases[q_num] = q_title

eval = trec.TrecEvaluation(cases, Qrels)
pickle.dump(cases, open("cases.bin", "wb" ))


In [87]:
import xml.etree.ElementTree as ET
import tarfile

tar = tarfile.open("clinicaltrials.gov-16_dec_2015.tgz", "r:gz")
i = 0

doc_ids = []
brief_titles = []
detailed_descriptions = []
brief_summaries = []
criterias = []
genders = []
minimum_ages = []
maximum_ages = []


iterations = 1000
count = 0

for tarinfo in tar:
    if tarinfo.size > 500:
        txt = tar.extractfile(tarinfo).read().decode("utf-8", "strict")
        root = ET.fromstring(txt)

        judged = False
        for doc_id in root.iter('nct_id'):
            if doc_id.text in eval.judged_docs:
                judged = True
                doc_ids.append(doc_id.text.strip())
        
        if judged is False:
            continue
        i = i + 1
      
        for brief_title in root.iter('brief_title'):
            brief_titles.append(brief_title.text.strip())

        for detailed_description in root.iter('detailed_description'):
            for child in detailed_description:
                detailed_descriptions.append(child.text.strip())

        for brief_summary in root.iter('brief_summary'):
            for child in brief_summary:
                brief_summaries.append(child.text.strip())

        for criteria in root.iter('criteria'):
            for child in criteria:
                criterias.append(child.text.strip())

        for gender in root.iter('gender'):
            genders.append(gender.text.strip())

        for minimum_age in root.iter('minimum_age'):
            minimum_ages.append(minimum_age.text.strip())

        for maximum_age in root.iter('maximum_age'):
            maximum_ages.append(maximum_age.text.strip())

        if(i>500):
            break
tar.close()


if(len(doc_ids) == 0):
    print("doc_ids")
if(len(brief_titles) == 0):
    print("brief_titles")
if(len(detailed_descriptions) == 0):
    print("detailed_descriptions")
if(len(brief_summaries) == 0):
    print("brief_summaries")
if(len(criterias) == 0):
    print("criterias")
if(len(genders) == 0):
    print("genders")
if(len(minimum_ages) == 0):
    print("minimum_ages")
if(len(maximum_ages) == 0):
    print("maximum_ages")
#Aqui criamos os docs pickle para cada uma das partes dos documentos
pickle.dump(doc_ids, open("doc_ids.bin", "wb" ))
pickle.dump(brief_titles, open("brief_title.bin", "wb" ))
pickle.dump(detailed_descriptions, open("detailed_description.bin", "wb" ))
pickle.dump(brief_summaries, open("brief_summary.bin", "wb" ))
pickle.dump(criterias, open("criteria.bin", "wb" ))
pickle.dump(genders, open("gender.bin", "wb" ))
pickle.dump(minimum_ages, open("minimum_age.bin", "wb" ))
pickle.dump(maximum_ages, open("maximum_age.bin", "wb" ))

Classe RetrievalModel: definimos a classe abstrata 

In [69]:
import abc

class RetrievalModel:

    def __init__(self, ids, docs):
        self.ids = ids
        self.docs = docs
    
    @abc.abstractmethod #para sabermos que RetrievalModel é uma classe abstrata e que, portanto, não pode ser instanciada, ie, "concretizada"
    def search(self, cases):
        pass #nao se pode por nada aqui na abstrata, apenas em cada classe "filho" é que se define a função 


VSM Unigram

In [85]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer


class VSM(RetrievalModel): #aqui pomos o RetrievalModel para dizer q esta classe é uma subclasse da classe abstrata

    def __init__(self, ids, docs):
        super().__init__(ids, docs) #aqui dizemos que ela recebe os ids e docs que a superclasse recebe. sao os mesmos!


    def search(self, cases): #aqui definimos a funcao que faz tudo o q o nosso modelo fazia 
        index = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
        index.fit(self.docs)
        X = index.transform(self.docs)

        avg_precision_11point = np.zeros(11)
        p10_list=[]
        recall_list=[]
        ap_list=[]
        ndcg5_list=[]
        mrr_list=[]
            
        for caseid in cases:
            print("case id is")
            print(caseid)
            query = cases[caseid]
            query_tfidf = index.transform([query])
            doc_scores = 1 - pairwise_distances(X, query_tfidf, metric='cosine')
            
            results = pd.DataFrame(list(zip(ids, doc_scores)), columns = ['_id', 'score'])
            print("dengue22 dengue")
            print(len(results))
            results_ord = results.sort_values(by=['score'], ascending = False)
            print("dengue dengue")
            print(len(results_ord))
            [p10, recall, ap, ndcg5, mrr] = eval.eval(results_ord, caseid)
            [precision_11point, recall_11point, total_relv_ret] = eval.evalPR(results_ord, caseid)

            if (np.shape(recall_11point) != (0,)):
                avg_precision_11point = avg_precision_11point + precision_11point

            p10_list+=[p10]
            recall_list+=[recall]
            ap_list+=[ap]
            ndcg5_list+=[ndcg5]
            mrr_list+=[mrr]

            return [np.mean(p10_list), np.mean(recall_list), np.mean(ap_list), np.mean(ndcg5_list), np.mean(mrr_list)]     


LMJM Unigram

In [78]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter



class LMJM(RetrievalModel):

    def __init__(self, ids, docs):
        super().__init__(ids, docs)
    
    def search(self, cases):
        index = CountVectorizer(ngram_range=(1, 1), analyzer='word')
        X = index.fit(self.docs)
        corpus_cv = index.transform(self.docs)

        lmbd = 1

        prob_word_docs = corpus_cv/np.sum(corpus_cv, axis=1)  # p(t|md)
        prob_word_corpus = np.sum(corpus_cv, axis=0)/np.sum(corpus_cv)  # p(t|mc)
        log_mixture = np.log(lmbd*prob_word_docs + (1-lmbd)*prob_word_corpus)

        for caseid in cases:
            query = cases[caseid]
            # print(query)
            query_cv = index.transform([query])
            # print(query_cv)

            total = log_mixture*query_cv.T

            results = pd.DataFrame(list(zip(ids, total)), columns=['_id', 'score'])
            results_ord = results.sort_values(by=['score'], ascending=False)
        

            [p10, recall, ap, ndcg5, mrr] = eval.eval(results_ord, caseid)
            [precision_11point, recall_11point,
                total_relv_ret] = eval.evalPR(results_ord, caseid)

            if (np.shape(recall_11point) != (0,)):
                avg_precision_11point = avg_precision_11point + precision_11point
                
            #print(p10)
            p10_list += [p10]
            recall_list += [recall]
            ap_list += [ap]
            ndcg5_list += [ndcg5]
            mrr_list += [mrr]

        return [np.mean(p10_list), np.mean(recall_list), np.mean(ap_list), np.mean(ndcg5_list), np.mean(mrr_list)]     


Chamar as classes para obtermos os valores

In [86]:
#Aqui abrimos cada documento pickle 
doc_id= pickle.load(open("doc_ids.bin", "rb" ))
brief_title = pickle.load(open("brief_title.bin", "rb" ))
detailed_description = pickle.load(open("detailed_description.bin", "rb" ))
brief_summary = pickle.load(open("brief_summary.bin", "rb" ))
criteria = pickle.load(open("criteria.bin", "rb" ))
gender = pickle.load(open("gender.bin", "rb" ))
minimum_age = pickle.load(open("minimum_age.bin", "rb" ))
maximum_age = pickle.load(open("maximum_age.bin", "rb" ))
cases =  pickle.load(open("cases.bin", "rb" ))
print("cases is")
print(len(cases))
corpus_parts = [brief_title, detailed_description, brief_summary, criteria]


models = ["VSM_brief_title", "VSM_detailed_description", "VSM_brief_summary", "VSM_criteria", 
"LMJM_brief_title", "LMJM_detailed_description", "LMJM_brief_summary", "LMJM_criteria"]

results = []

for part in corpus_parts:
    #print(part)
    models2 = [VSM(doc_id, part ), LMJM(doc_id, part )]
    for model in models2:
        results += model.search(cases)
    
for i in range(len(models)):
    print(models[i] + " : " + results[i])
    


cases is
60
case id is
20141
dengue22 dengue
0
dengue dengue
0


IndexError: index 0 is out of bounds for axis 0 with size 0

Plot: average-prevision

In [73]:
import matplotlib.pyplot as plt
plt.plot(recall_11point,avg_precision_11point/len(cases))

NameError: name 'recall_11point' is not defined