In [20]:
import xml.etree.ElementTree as ET
import matplotlib as plt
import pandas as pd
import numpy as np
import trec
import pprint as pp
import pickle

Queries = "topics-2014_2015-summary.topics"
Qrels = "qrels-clinical_trials.txt"
with open(Queries, 'r') as queries_reader:
    txt = queries_reader.read()

root = ET.fromstring(txt)

cases = {}
for query in root.iter('TOP'):
    q_num = query.find('NUM').text
    q_title = query.find('TITLE').text
    cases[q_num] = q_title

eval = trec.TrecEvaluation(cases, Qrels)
pickle.dump(cases, open("cases.bin", "wb" ))

In [21]:
import xml.etree.ElementTree as ET
import tarfile

tar = tarfile.open("clinicaltrials.gov-16_dec_2015.tgz", "r:gz")
i = 0

doc_ids = []
brief_titles = []
detailed_descriptions = []
brief_summaries = []
criterias = []
genders = []
minimum_ages = []
maximum_ages = []


iterations = 1000
count = 0

for tarinfo in tar:
    if tarinfo.size > 500:
        txt = tar.extractfile(tarinfo).read().decode("utf-8", "strict")
        root = ET.fromstring(txt)

        judged = False
        for doc_id in root.iter('nct_id'):
            if doc_id.text in eval.judged_docs:
                judged = True
                doc_ids.append(doc_id.text.strip())
        
        if judged is False:
            continue
        i = i + 1
      
        for brief_title in root.iter('brief_title'):
            brief_titles.append(brief_title.text.strip()) #para os brief titles nao se usa o child, o texto está direto apos <brief_title>

        for detailed_description in root.iter('detailed_description'):
            for child in detailed_description:
                detailed_descriptions.append(child.text.strip()) #aqui, dentro do append temos que usar o child pq, se virem no documento dos clinical tirals, o texto detailed description esta dentro de um novo separadorzinho

        for brief_summary in root.iter('brief_summary'):
            for child in brief_summary:
                brief_summaries.append(child.text.strip())

        for criteria in root.iter('criteria'):
            for child in criteria:
                criterias.append(child.text.strip())

        for gender in root.iter('gender'):
            genders.append(gender.text.strip())

        for minimum_age in root.iter('minimum_age'):
            minimum_ages.append(minimum_age.text.strip())

        for maximum_age in root.iter('maximum_age'):
            maximum_ages.append(maximum_age.text.strip())

        if(i>1000):
            break
tar.close()


#Aqui criamos os docs pickle para cada uma das partes dos documentos
pickle.dump(doc_ids, open("doc_ids.bin", "wb" ))
pickle.dump(brief_titles, open("brief_title.bin", "wb" ))
pickle.dump(detailed_descriptions, open("detailed_description.bin", "wb" ))
pickle.dump(brief_summaries, open("brief_summary.bin", "wb" ))
pickle.dump(criterias, open("criteria.bin", "wb" ))
pickle.dump(genders, open("gender.bin", "wb" ))
pickle.dump(minimum_ages, open("minimum_age.bin", "wb" ))
pickle.dump(maximum_ages, open("maximum_age.bin", "wb" ))

FileNotFoundError: [Errno 2] No such file or directory: 'clinicaltrials.gov-16_dec_2015.tgz'

Classe RetrievalModel: definimos a classe abstrata 

In [None]:
import abc #é preciso importar isto quando queremos definir uma classe abstrata

class RetrievalModel: #vamos criar uma classe abstrata que é o molde para todas as nossas classes, cada uma um modelo

    def __init__(self, ids, docs): #aqui definimos as variaveis que entram na classe, sempre q as queremos usar temos que chamar por self.nome_da_variavel, exceto dentro do init das subclasses
        self.ids = ids
        self.docs = docs
    
    @abc.abstractmethod #para sabermos que RetrievalModel é uma classe abstrata e que, portanto, não pode ser instanciada, ie, "concretizada"
    def search(self, cases): #aqui nomeia-se uma das funcoes desta classe, neste caso, aquela onde vamos por o codigo q ordenava os docs e ainda classificava a performance do nosso modelo (junto para nao termos q mudar tanto o codigo)
        pass #nao se pode por nada aqui na abstrata, apenas em cada classe "filho" é que se define a função, aqui apenas se nomeia 


VSM Unigram

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

class VSM(RetrievalModel): #definimos a classe de um dos modelos e pomos o RetrievalModel para dizer q esta classe é uma subclasse da classe abstrata

    def __init__(self, ids, docs):
        super().__init__(ids, docs) #aqui dizemos que ela recebe os ids e docs que a superclasse recebe. sao os mesmos!


    def search(self, cases): #aqui definimos a funcao que faz tudo o q o nosso modelo fazia, pus o codigo ca dentro, pus self.doc em vez de docs 
        index = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
        index.fit(self.docs)
        X = index.transform(self.docs)


        all_docscores=[]
  
        for caseid in cases:
            query = cases[caseid]
            query_tfidf = index.transform([query])
            doc_scores = 1 - pairwise_distances(X, query_tfidf, metric='cosine')
            all_docscores.append(doc_scores)

        return all_docscores
            
        
       
                 

LMJM Unigram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter



class LMJM(RetrievalModel):

    def __init__(self, ids, docs):
        super().__init__(ids, docs)
    
    def search(self, cases):
        index = CountVectorizer(ngram_range=(1, 1), analyzer='word')
        X = index.fit(self.docs)
        corpus_cv = index.transform(self.docs)
        all_docscores=[]
        lmbd = 1
        
        prob_word_docs = corpus_cv/np.sum(corpus_cv, axis=1)  # p(t|md)
        prob_word_corpus = np.sum(corpus_cv, axis=0)/np.sum(corpus_cv)  # p(t|mc)
        log_mixture = np.log(lmbd*prob_word_docs + (1-lmbd)*prob_word_corpus)
        for caseid in cases:
            query = cases[caseid]
            query_cv = index.transform([query])
            total = log_mixture*query_cv.T
            all_docscores.append(total)
        return all_docscores

Chamar as classes para obtermos os valores

In [24]:
import pickle
import numpy as np

# Aqui abrimos cada documento pickle e damos-lhes os nomes para usar nas funcoes seguintes
ids = pickle.load(open("doc_ids.bin", "rb"))
brief_title = pickle.load(open("brief_title.bin", "rb"))
detailed_description = pickle.load(open("detailed_description.bin", "rb"))
brief_summary = pickle.load(open("brief_summary.bin", "rb"))
criteria = pickle.load(open("criteria.bin", "rb"))
gender = pickle.load(open("gender.bin", "rb"))
minimum_age = pickle.load(open("minimum_age.bin", "rb"))
maximum_age = pickle.load(open("maximum_age.bin", "rb"))
cases = pickle.load(open("cases.bin", "rb"))

# definimos uma lista que inclui as duas classes (VSM e LMJM), em que ambas recebem o mesmo doc_id, mas part diferente dos clinical trials por cada vez que o "for" as chama. S quiserem adicionar mais Classes que correspondam a modelos, nao se esquecam de as colocar aqui tambem!
models = [VSM(ids, brief_title), VSM(ids, detailed_description), VSM(ids, brief_summary), VSM(ids, criteria), LMJM(
    ids, brief_title), LMJM(ids, detailed_description), LMJM(ids, brief_summary), LMJM(ids, criteria)]
"""  
# doc_scores para todas as queries em cada lista dos modelos
VSM_bt_scores_raw = models[0].search(cases)
VSM_dd_scores_raw = models[1].search(cases)
VSM_bs_scores_raw = models[2].search(cases)
VSM_cr_scores_raw = models[3].search(cases)
LMJM_bt_scores_raw = models[4].search(cases)
LMJM_dd_scores_raw = models[5].search(cases)
LMJM_bs_scores_raw = models[6].search(cases)
LMJM_cr_scores_raw = models[7].search(cases)
""" 

scores_all=[]
query_doc_y_all=[]

scores_train=[]
query_doc_y_train=[]

scores_test = []
query_doc_y_test=[]


# for case in cases:
    #models.search(case) para todos os modelos
    #aux = eval.relevance_judgments.loc[eval.relevance_judgments['query_id'] == case_id] #ver quais docs sao relevantes (qrels)
    # for doc in aux
        # buscar y dessa relaçao e adaptar os q sao 2 e os q sao 1
        #pos=[i for (i,idx) in enumerate(ids) if aux.doc_id in idx] #buscar pos na lista 
        # buscar os scores em cada uma das lista, caso nao exista dar score de brief title, para pos 
        #scores.train.append([]) # append da lista de scores desse scores na lista para todas relaçoes query-doc


# confirmar que todas as listas teem o mesmo tamanho
    # juntar listas em matriz
    # retirar 20% das listar para teste

# dar matriz a log reg com y à parte (fit)







[array([[0.04577104],
       [0.0647722 ],
       [0.21058424],
       ...,
       [0.03330244],
       [0.03179586],
       [0.01358327]]), array([[0.03588125],
       [0.03039194],
       [0.06922334],
       ...,
       [0.0968189 ],
       [0.02091599],
       [0.02954132]]), array([[0.02241338],
       [0.10932929],
       [0.02043375],
       ...,
       [0.01570072],
       [0.00668279],
       [0.09152065]]), array([[0.03723858],
       [0.07523192],
       [0.10917461],
       ...,
       [0.07504446],
       [0.02528861],
       [0.03532141]]), array([[0.03743874],
       [0.04711888],
       [0.03563235],
       ...,
       [0.0291974 ],
       [0.01936208],
       [0.02174173]]), array([[0.03456992],
       [0.05302806],
       [0.03548456],
       ...,
       [0.01778185],
       [0.0151372 ],
       [0.00723278]]), array([[0.08727342],
       [0.05609767],
       [0.04650407],
       ...,
       [0.02038329],
       [0.02105149],
       [0.02042566]]), array([[0.02357598]

  res_values = method(rvalues)


Plot: average-prevision

In [23]:
import trec
Qrels = "qrels-clinical_trials.txt"

eval = trec.TrecEvaluation(cases, Qrels)
aux=eval.relevance_judgments.loc[eval.relevance_judgments['query_id']=='20141']
print('aux'+aux)

Empty DataFrame
Columns: [query_id, dummy, docid, rel]
Index: []


  res_values = method(rvalues)


In [None]:

import matplotlib.pyplot as plt
plt.plot(recall_11point,avg_precision_11point/len(cases))

NameError: name 'recall_11point' is not defined