In [7]:
import xml.etree.ElementTree as ET
import matplotlib as plt
import pandas as pd
import numpy as np
import trec
import pprint as pp
import pickle

Qrels = "qrels-clinical_trials.txt"

Queries = "topics-2014_2015-summary.topics"



with open(Queries, 'r') as queries_reader:
    txt = queries_reader.read()

root = ET.fromstring(txt)

cases = {}
for query in root.iter('TOP'):
    q_num = query.find('NUM').text
    q_title = query.find('TITLE').text
    cases[q_num] = q_title


eval = trec.TrecEvaluation(cases, Qrels)


pickle.dump(cases, open("cases.bin", "wb" ))

In [6]:
import xml.etree.ElementTree as ET
import tarfile

tar = tarfile.open("clinicaltrials.gov-16_dec_2015.tgz", "r:gz")
doc_ids = []
brief_titles = []
detailed_descriptions = []
brief_summaries = []
criterias = []
genders = []
minimum_ages = []
maximum_ages = []
#iterations = 1000
#count = 0


for tarinfo in tar:
    if tarinfo.size > 500:
        txt = tar.extractfile(tarinfo).read().decode("utf-8", "strict")
        root = ET.fromstring(txt)

        judged = False
        for doc_id in root.iter('nct_id'):
            if doc_id.text in eval.judged_docs:
                judged = True
                doc_ids.append(doc_id.text.strip())
        
        if judged is False:
            continue
      
        for brief_title in root.iter('brief_title'):
            brief_titles.append(brief_title.text.strip()) #para os brief titles nao se usa o child, o texto está direto apos <brief_title>

        for detailed_description in root.iter('detailed_description'):
            for child in detailed_description:
                detailed_descriptions.append(child.text.strip()) #aqui, dentro do append temos que usar o child pq, se virem no documento dos clinical tirals, o texto detailed description esta dentro de um novo separadorzinho

        for brief_summary in root.iter('brief_summary'):
            for child in brief_summary:
                brief_summaries.append(child.text.strip())

        for criteria in root.iter('criteria'):
            for child in criteria:
                criterias.append(child.text.strip())

        for gender in root.iter('gender'):
            genders.append(gender.text.strip())

        for minimum_age in root.iter('minimum_age'):
            minimum_ages.append(minimum_age.text.strip())

        for maximum_age in root.iter('maximum_age'):
            maximum_ages.append(maximum_age.text.strip())

        #if(i>1000):
            #break
tar.close()


#Aqui criamos os docs pickle para cada uma das partes dos documentos
pickle.dump(doc_ids, open("doc_ids.bin", "wb" ))
pickle.dump(brief_titles, open("brief_title.bin", "wb" ))
pickle.dump(detailed_descriptions, open("detailed_description.bin", "wb" ))
pickle.dump(brief_summaries, open("brief_summary.bin", "wb" ))
pickle.dump(criterias, open("criteria.bin", "wb" ))
pickle.dump(genders, open("gender.bin", "wb" ))
pickle.dump(minimum_ages, open("minimum_age.bin", "wb" ))
pickle.dump(maximum_ages, open("maximum_age.bin", "wb" ))

Classe RetrievalModel: definimos a classe abstrata 

In [1]:
import abc #é preciso importar isto quando queremos definir uma classe abstrata

class RetrievalModel: #vamos criar uma classe abstrata que é o molde para todas as nossas classes, cada uma um modelo
    @abc.abstractmethod #para sabermos que RetrievalModel é uma classe abstrata e que, portanto, não pode ser instanciada, ie, "concretizada"
    def search(self): #aqui nomeia-se uma das funcoes desta classe, neste caso, aquela onde vamos por o codigo q ordenava os docs e ainda classificava a performance do nosso modelo (junto para nao termos q mudar tanto o codigo)
        pass #nao se pode por nada aqui na abstrata, apenas em cada classe "filho" é que se define a função, aqui apenas se nomeia 


VSM Unigram

In [2]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

class VSM(RetrievalModel): #definimos a classe de um dos modelos e pomos o RetrievalModel para dizer q esta classe é uma subclasse da classe abstrata

    def search(self, caseid, docs, case_rel): #aqui definimos a funcao que faz tudo o q o nosso modelo fazia, pus o codigo ca dentro, pus self.doc em vez de docs 
        index = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
        index.fit(docs)
        X = index.transform(docs)
        all_docscores=[]

        query = cases[caseid]
        query_tfidf = index.transform([query])
        doc_scores = 1 - pairwise_distances(X, query_tfidf, metric='cosine')
        scores=doc_scores.tolist()
        return scores

               

LMJM Unigram

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter

class LMJM(RetrievalModel):
    def search(self, caseid, docs, case_rel):
        index = CountVectorizer(ngram_range=(1, 1), analyzer='word')
        X = index.fit(docs)
        corpus_cv = index.transform(docs)
        all_scores=[]
        lmbd = 1
        prob_word_docs = corpus_cv/np.sum(corpus_cv, axis=1)  # p(t|md)
        prob_word_corpus = np.sum(corpus_cv, axis=0)/np.sum(corpus_cv)  # p(t|mc)
        log_mixture = np.log(lmbd*prob_word_docs + (1-lmbd)*prob_word_corpus)
        for caseid in cases:
            query = cases[caseid]
            query_cv = index.transform([query])
            total = log_mixture*query_cv.T
            all_scores.append(total)
        return all_scores




Chamar as classes para obtermos os valores

In [4]:
import pickle
import trec
import numpy as np

# Aqui abrimos cada documento pickle e damos-lhes os nomes para usar nas funcoes seguintes
ids = pickle.load(open("doc_ids.bin", "rb"))
brief_title = pickle.load(open("brief_title.bin", "rb"))
detailed_description = pickle.load(open("detailed_description.bin", "rb"))
brief_summary = pickle.load(open("brief_summary.bin", "rb"))
criteria = pickle.load(open("criteria.bin", "rb"))
gender = pickle.load(open("gender.bin", "rb"))
minimum_age = pickle.load(open("minimum_age.bin", "rb"))
maximum_age = pickle.load(open("maximum_age.bin", "rb"))
cases = pickle.load(open("cases.bin", "rb"))


In [5]:
models = [VSM()]
#LMJM()
fields=[brief_title, detailed_description, brief_summary, criteria]

#campos sem nada-> usar brief_title
for field in fields:
    i=0
    for value in field:  
        if len(value.split()) < 2:
            print(value)
            field[i]=brief_title[i]
        i+=1        

MSD


In [15]:
scores_all=[]
query_doc_y_all=[]

scores_train=[]
query_doc_y_train=[]

scores_test = []
query_doc_y_test=[]

VSM_bt=[]
VSM_dd=[]
VSM_bs=[]
VSM_cr=[]
y_rel=[]


for caseid in cases:
    aux=eval.relevance_judgments.loc[eval.relevance_judgments['query_id'] == int(caseid)]
    docs= aux['docid'].tolist()
    print(len(docs))
    relevances= aux['rel'].tolist()
    print(len(relevances))
    case_rel=[]
    field_ind=0
    for docid in docs:
        case_rel.append(ids.index(docid))
    print(len(case_rel))
    for model in models:
        for field in fields:
            scores = model.search(caseid, field, case_rel)
            if(field_ind==0):
                aux=scores
            for rel in case_rel:
                #print(scores[rel])
                if field_ind==0:
                    VSM_bt.append(scores[rel]) 
                elif field_ind==1:
                    #if out of bonds, usar brief title VSM_cr.append(aux[rel])
                    #VSM_dd.append(scores[rel])
                elif field_ind==2:
                    #if out of bonds, usar brief title VSM_cr.append(aux[rel])
                    VSM_bs.append(scores[rel])
                elif field_ind==3:
                    #if out of bonds, usar brief title VSM_cr.append(aux[rel])
                    VSM_cr.append(scores[rel])
                    
            print(len(VSM_bt))
            print(len(VSM_bs))
            print(len(VSM_cr))
            field_ind+=1

            #filtrar aqui scores com case_rel
            print(len(scores))
            
    y_rel.append(relevances)
        
    

143
143
143
143
0
0
3626
143
0
0
2548
143
143
0
3625
143
143
143
3625
62
62
62
205
143
143
3626
205
143
143
2548
205
205
143
3625
205
205
205
3625
99
99
99
304
205
205
3626
304
205
205
2548
304
304
205
3625
304
304
304
3625
33
33
33
337
304
304
3626
337
304
304
2548
337
337
304
3625
337
337
337
3625
80
80
80
417
337
337
3626
417
337
337
2548
417
417
337
3625
417
417
417
3625
108
108
108
525
417
417
3626
525
417
417
2548
525
525
417
3625
525
525
525
3625
153
153
153
678
525
525
3626
678
525
525
2548
678
678
525
3625
678
678
678
3625
69
69
69
747
678
678
3626
747
678
678
2548
747
747
678
3625
747
747
747
3625
48
48
48
795
747
747
3626
795
747
747
2548
795
795
747
3625
795
795
795
3625
60
60
60
855
795
795
3626
855
795
795
2548
855
855
795
3625
855
855
855
3625
42
42
42
897
855
855
3626
897
855
855
2548
897
897
855
3625
897
897
897
3625
53
53
53
950
897
897
3626
950
897
897
2548
950
950
897
3625
950
950
950
3625
63
63
63
1013
950
950
3626
1013
950
950
2548
1013
1013
950
3625
1013
1013
101

IndexError: list index out of range

In [None]:
for model in models:
    for field in fields:
        scores = model.search(cases, field)
        print(scores)
        scores_all.append(scores)
#print(scores_all)

[array([[0.        ],
       [0.019918  ],
       [0.3180676 ],
       ...,
       [0.05807098],
       [0.        ],
       [0.02616656]]), array([[0.        ],
       [0.01729926],
       [0.        ],
       ...,
       [0.14931008],
       [0.        ],
       [0.02520359]]), array([[0.        ],
       [0.29532805],
       [0.        ],
       ...,
       [0.        ],
       [0.        ],
       [0.01837654]]), array([[0.        ],
       [0.06081134],
       [0.06835363],
       ...,
       [0.11313172],
       [0.        ],
       [0.07421762]]), array([[0.        ],
       [0.01015664],
       [0.        ],
       ...,
       [0.01214177],
       [0.        ],
       [0.02244221]]), array([[0.        ],
       [0.        ],
       [0.        ],
       ...,
       [0.        ],
       [0.        ],
       [0.02464161]]), array([[0.10718197],
       [0.04167117],
       [0.        ],
       ...,
       [0.        ],
       [0.        ],
       [0.0528262 ]]), array([[0.        ]

  


[matrix([[-inf],
        [-inf],
        [-inf],
        ...,
        [-inf],
        [-inf],
        [-inf]])]


  


[matrix([[-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
        [-inf],
       

  


[matrix([[-inf],
        [-inf],
        [-inf],
        ...,
        [-inf],
        [-inf],
        [-inf]])]


  


[matrix([[-inf],
        [-inf],
        [-inf],
        ...,
        [-inf],
        [-inf],
        [-inf]])]


In [None]:
#print(aux['docid'].size)
    print(docs)
    if aux['docid'].size > 10:
        #pos=[i for (i,idx) in enumerate(ids) if aux['docid'] in idx]
        #print(pos)
        for line in aux['query_id']:
            print(line)
            #print(aux['query_id'][i])
            print(line['query_id'],line['docid'],line['rel'])
            query_doc_y_all.append([])
        #print(aux['query_id'][i],aux['docid'][i],aux['relevance'][i])

In [None]:
 #ver quais docs sao relevantes (qrels)               
           
    # for doc in aux
        #buscar y dessa relaçao e adaptar os q sao 2 e os q sao 1
        #pos=[i for (i,idx) in enumerate(ids) if aux.doc_id in idx] #buscar pos na lista 
        #buscar os scores em cada uma das lista, caso nao exista dar score de brief title, para pos 
        #scores.train.append([]) # append da lista de scores desse scores na lista para todas relaçoes query-doc
# confirmar que todas as listas teem o mesmo tamanho
    # juntar listas em matriz
    # retirar 20% das listar para teste
# dar matriz a log reg com y à parte (fit)


In [None]:

import matplotlib.pyplot as plt
plt.plot(recall_11point,avg_precision_11point/len(cases))

NameError: name 'recall_11point' is not defined