In [1]:
import xml.etree.ElementTree as ET
import matplotlib as plt
import pandas as pd
import numpy as np
import trec
import pprint as pp
import pickle

Queries = "topics-2014_2015-summary.topics"
Qrels = "qrels-clinical_trials.txt"
with open(Queries, 'r') as queries_reader:
    txt = queries_reader.read()

root = ET.fromstring(txt)

cases = {}
for query in root.iter('TOP'):
    q_num = query.find('NUM').text
    q_title = query.find('TITLE').text
    cases[q_num] = q_title

eval = trec.TrecEvaluation(cases, Qrels)
pickle.dump(cases, open("cases.bin", "wb" ))

In [21]:
import xml.etree.ElementTree as ET
import tarfile

tar = tarfile.open("clinicaltrials.gov-16_dec_2015.tgz", "r:gz")
i = 0

doc_ids = []
brief_titles = []
detailed_descriptions = []
brief_summaries = []
criterias = []
genders = []
minimum_ages = []
maximum_ages = []


iterations = 1000
count = 0

for tarinfo in tar:
    if tarinfo.size > 500:
        txt = tar.extractfile(tarinfo).read().decode("utf-8", "strict")
        root = ET.fromstring(txt)

        judged = False
        for doc_id in root.iter('nct_id'):
            if doc_id.text in eval.judged_docs:
                judged = True
                doc_ids.append(doc_id.text.strip())
        
        if judged is False:
            continue
        i = i + 1
      
        for brief_title in root.iter('brief_title'):
            brief_titles.append(brief_title.text.strip()) #para os brief titles nao se usa o child, o texto está direto apos <brief_title>

        for detailed_description in root.iter('detailed_description'):
            for child in detailed_description:
                detailed_descriptions.append(child.text.strip()) #aqui, dentro do append temos que usar o child pq, se virem no documento dos clinical tirals, o texto detailed description esta dentro de um novo separadorzinho

        for brief_summary in root.iter('brief_summary'):
            for child in brief_summary:
                brief_summaries.append(child.text.strip())

        for criteria in root.iter('criteria'):
            for child in criteria:
                criterias.append(child.text.strip())

        for gender in root.iter('gender'):
            genders.append(gender.text.strip())

        for minimum_age in root.iter('minimum_age'):
            minimum_ages.append(minimum_age.text.strip())

        for maximum_age in root.iter('maximum_age'):
            maximum_ages.append(maximum_age.text.strip())

        if(i>1000):
            break
tar.close()


#Aqui criamos os docs pickle para cada uma das partes dos documentos
pickle.dump(doc_ids, open("doc_ids.bin", "wb" ))
pickle.dump(brief_titles, open("brief_title.bin", "wb" ))
pickle.dump(detailed_descriptions, open("detailed_description.bin", "wb" ))
pickle.dump(brief_summaries, open("brief_summary.bin", "wb" ))
pickle.dump(criterias, open("criteria.bin", "wb" ))
pickle.dump(genders, open("gender.bin", "wb" ))
pickle.dump(minimum_ages, open("minimum_age.bin", "wb" ))
pickle.dump(maximum_ages, open("maximum_age.bin", "wb" ))

Classe RetrievalModel: definimos a classe abstrata 

In [28]:
import abc #é preciso importar isto quando queremos definir uma classe abstrata

class RetrievalModel: #vamos criar uma classe abstrata que é o molde para todas as nossas classes, cada uma um modelo

    def __init__(self, ids, docs): #aqui definimos as variaveis que entram na classe, sempre q as queremos usar temos que chamar por self.nome_da_variavel, exceto dentro do init das subclasses
        self.ids = ids
        self.docs = docs
    
    @abc.abstractmethod #para sabermos que RetrievalModel é uma classe abstrata e que, portanto, não pode ser instanciada, ie, "concretizada"
    def search(self, cases): #aqui nomeia-se uma das funcoes desta classe, neste caso, aquela onde vamos por o codigo q ordenava os docs e ainda classificava a performance do nosso modelo (junto para nao termos q mudar tanto o codigo)
        pass #nao se pode por nada aqui na abstrata, apenas em cada classe "filho" é que se define a função, aqui apenas se nomeia 


VSM Unigram

In [29]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

class VSM(RetrievalModel): #definimos a classe de um dos modelos e pomos o RetrievalModel para dizer q esta classe é uma subclasse da classe abstrata

    def __init__(self, ids, docs):
        super().__init__(ids, docs) #aqui dizemos que ela recebe os ids e docs que a superclasse recebe. sao os mesmos!


    def search(self, cases): #aqui definimos a funcao que faz tudo o q o nosso modelo fazia, pus o codigo ca dentro, pus self.doc em vez de docs 
        index = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
        index.fit(self.docs)
        X = index.transform(self.docs)

        avg_precision_11point = np.zeros(11)
        p10_list=[]
        recall_list=[]
        ap_list=[]
        ndcg5_list=[]
        mrr_list=[]
            
        for caseid in cases:
            query = cases[caseid]
            query_tfidf = index.transform([query])
            doc_scores = 1 - pairwise_distances(X, query_tfidf, metric='cosine')
            
            results = pd.DataFrame(list(zip(ids, doc_scores)), columns = ['_id', 'score'])
            results_ord = results.sort_values(by=['score'], ascending = False)

            [p10, recall, ap, ndcg5, mrr] = eval.eval(results_ord, caseid)
            [precision_11point, recall_11point, total_relv_ret] = eval.evalPR(results_ord, caseid)

            if (np.shape(recall_11point) != (0,)):
                avg_precision_11point = avg_precision_11point + precision_11point

            p10_list+=[p10]
            recall_list+=[recall]
            ap_list+=[ap]
            ndcg5_list+=[ndcg5]
            mrr_list+=[mrr]

            return [[np.mean(p10_list), np.mean(recall_list), np.mean(ap_list), np.mean(ndcg5_list), np.mean(mrr_list), recall_11point, avg_precision_11point]]     

LMJM Unigram

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter



class LMJM(RetrievalModel):

    def __init__(self, ids, docs):
        super().__init__(ids, docs)
    
    def search(self, cases):
        index = CountVectorizer(ngram_range=(1, 1), analyzer='word')
        X = index.fit(self.docs)
        corpus_cv = index.transform(self.docs)

        avg_precision_11point = np.zeros(11)
        p10_list=[]
        recall_list=[]
        ap_list=[]
        ndcg5_list=[]
        mrr_list=[]
        
        lmbd = 1

        prob_word_docs = corpus_cv/np.sum(corpus_cv, axis=1)  # p(t|md)
        prob_word_corpus = np.sum(corpus_cv, axis=0)/np.sum(corpus_cv)  # p(t|mc)
        log_mixture = np.log(lmbd*prob_word_docs + (1-lmbd)*prob_word_corpus)

        for caseid in cases:
            query = cases[caseid]
            query_cv = index.transform([query])

            total = log_mixture*query_cv.T

            results = pd.DataFrame(list(zip(ids, total)), columns=['_id', 'score'])
            results_ord = results.sort_values(by=['score'], ascending=False)
        
            [p10, recall, ap, ndcg5, mrr] = eval.eval(results_ord, caseid)
            [precision_11point, recall_11point,
                total_relv_ret] = eval.evalPR(results_ord, caseid)

            if (np.shape(recall_11point) != (0,)):
                avg_precision_11point = avg_precision_11point + precision_11point
                
            #print(p10)
            p10_list += [p10]
            recall_list += [recall]
            ap_list += [ap]
            ndcg5_list += [ndcg5]
            mrr_list += [mrr]

        return [[np.mean(p10_list), np.mean(recall_list), np.mean(ap_list), np.mean(ndcg5_list), np.mean(mrr_list), recall_11point, avg_precision_11point]]     

Chamar as classes para obtermos os valores

In [31]:
#Aqui abrimos cada documento pickle e damos-lhes os nomes para usar nas funcoes seguintes
ids = pickle.load(open("doc_ids.bin", "rb" ))
brief_title = pickle.load(open("brief_title.bin", "rb" ))
detailed_description = pickle.load(open("detailed_description.bin", "rb" ))
brief_summary = pickle.load(open("brief_summary.bin", "rb" ))
criteria = pickle.load(open("criteria.bin", "rb" ))
gender = pickle.load(open("gender.bin", "rb" ))
minimum_age = pickle.load(open("minimum_age.bin", "rb" ))
maximum_age = pickle.load(open("maximum_age.bin", "rb" ))
cases =  pickle.load(open("cases.bin", "rb" ))

#Aqui definimos a lista de nomes que queremos dar a cada aplicacao do modelo. Pus nome do modelo + parte do corpus que usamos. S quiserem adicionar mais Classes que correspondam a modelos, nao se esquecam de as colocar aqui tambem!
models = ["VSM_brief_title", "VSM_detailed_description", "VSM_brief_summary", "VSM_criteria", 
"LMJM_brief_title", "LMJM_detailed_description", "LMJM_brief_summary", "LMJM_criteria"]

#Aqui fazemos uma lista com os nomes dos docs pickle que vamos usar como antes usavamos os docs
corpus_parts = [brief_title, detailed_description, brief_summary, criteria]

results = [] #Aqui criamos uma lista onde vao entrar listas que resultam do return de cada modelo (ver ultima linha da funcao search); portanto, sera uma lista A de listas B, em que cada lista B é o conjunto de resultados para a aplicacao do modelo correspondente 

for part in corpus_parts: #aqui dizemos que parte dos documentos dos clinical trials queremos usar 
    models2 = [VSM(doc_id, part ), LMJM(doc_id, part )] #definimos uma lista que inclui as duas classes (VSM e LMJM), em que ambas recebem o mesmo doc_id, mas part diferente dos clinical trials por cada vez que o "for" as chama. S quiserem adicionar mais Classes que correspondam a modelos, nao se esquecam de as colocar aqui tambem!
    for model in models2: #aqui estamos a fazer os resultados para uma part do corpus do for anterior para cada modelo na lista models2.
        results += model.search(cases)
    
for i in range(len(models)):
    print(models[i] + " : {}".format(results[i])) #aqui imprimo o nome do modelo (que esta ordenado na lista models) seguido a lista de resultados; para ver qual numero corresponde a cada resultado, na classe que define o modelo, ver o return (ultima linha) da funcao search

  log_mixture = np.log(lmbd*prob_word_docs + (1-lmbd)*prob_word_corpus)


VSM_brief_title : [0.1, 0.34146341463414637, 0.011199555432409754, 0.0, 0.013986013986013986, [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], array([0.16666667, 0.02287906, 0.0146081 , 0.01637321, 0.01409869,
       0.01409869, 0.01409869, 0.01409869, 0.01409869, 0.01409869,
       0.01409869])]
VSM_detailed_description : [0.005000000000000001, 0.30075378943556946, 0.004364644387236058, 0.009469501675629268, 0.005827505827505828, [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], array([2.25644084, 0.89071108, 0.54736405, 0.47969578, 0.46602912,
       0.44271938, 0.41984128, 0.41992078, 0.42028755, 0.42088784,
       0.4214063 ])]
VSM_brief_summary : [0.0, 0.3170731707317073, 0.006911533491474804, 0.0, 0.016927083333333332, [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], array([0.04      , 0.02180582, 0.0191582 , 0.01970144, 0.01706037,
       0.01706037, 0.01706037, 0.01706037, 0.01706037, 0.01706037,
       0.01706037])]
VSM_criteria : [0.008333333333333333, 

Plot: average-prevision

In [27]:
import matplotlib.pyplot as plt
plt.plot(recall_11point,avg_precision_11point/len(cases))

NameError: name 'recall_11point' is not defined