In [10]:
import xml.etree.ElementTree as ET
import matplotlib as plt
import pandas as pd
import numpy as np
import trec
import pprint as pp
import pickle

Qrels = "qrels-clinical_trials.txt"

Queries = "topics-2014_2015-summary.topics"



with open(Queries, 'r') as queries_reader:
    txt = queries_reader.read()

root = ET.fromstring(txt)

cases = {}
for query in root.iter('TOP'):
    q_num = query.find('NUM').text
    q_title = query.find('TITLE').text
    cases[q_num] = q_title


eval = trec.TrecEvaluation(cases, Qrels)


pickle.dump(cases, open("cases.bin", "wb" ))

In [6]:
import xml.etree.ElementTree as ET
import tarfile

tar = tarfile.open("clinicaltrials.gov-16_dec_2015.tgz", "r:gz")
doc_ids = []
brief_titles = []
detailed_descriptions = []
brief_summaries = []
criterias = []
genders = []
minimum_ages = []
maximum_ages = []
#iterations = 1000
#count = 0


for tarinfo in tar:
    if tarinfo.size > 500:
        txt = tar.extractfile(tarinfo).read().decode("utf-8", "strict")
        root = ET.fromstring(txt)

        judged = False
        for doc_id in root.iter('nct_id'):
            if doc_id.text in eval.judged_docs:
                judged = True
                doc_ids.append(doc_id.text.strip())
        
        if judged is False:
            continue
      
        for brief_title in root.iter('brief_title'):
            brief_titles.append(brief_title.text.strip()) #para os brief titles nao se usa o child, o texto está direto apos <brief_title>

        for detailed_description in root.iter('detailed_description'):
            for child in detailed_description:
                detailed_descriptions.append(child.text.strip()) #aqui, dentro do append temos que usar o child pq, se virem no documento dos clinical tirals, o texto detailed description esta dentro de um novo separadorzinho

        for brief_summary in root.iter('brief_summary'):
            for child in brief_summary:
                brief_summaries.append(child.text.strip())

        for criteria in root.iter('criteria'):
            for child in criteria:
                criterias.append(child.text.strip())

        for gender in root.iter('gender'):
            genders.append(gender.text.strip())

        for minimum_age in root.iter('minimum_age'):
            minimum_ages.append(minimum_age.text.strip())

        for maximum_age in root.iter('maximum_age'):
            maximum_ages.append(maximum_age.text.strip())

        #if(i>1000):
            #break
tar.close()


#Aqui criamos os docs pickle para cada uma das partes dos documentos
pickle.dump(doc_ids, open("doc_ids.bin", "wb" ))
pickle.dump(brief_titles, open("brief_title.bin", "wb" ))
pickle.dump(detailed_descriptions, open("detailed_description.bin", "wb" ))
pickle.dump(brief_summaries, open("brief_summary.bin", "wb" ))
pickle.dump(criterias, open("criteria.bin", "wb" ))
pickle.dump(genders, open("gender.bin", "wb" ))
pickle.dump(minimum_ages, open("minimum_age.bin", "wb" ))
pickle.dump(maximum_ages, open("maximum_age.bin", "wb" ))

Classe RetrievalModel: definimos a classe abstrata 

In [51]:
import abc #é preciso importar isto quando queremos definir uma classe abstrata

class RetrievalModel: #vamos criar uma classe abstrata que é o molde para todas as nossas classes, cada uma um modelo
    @abc.abstractmethod #para sabermos que RetrievalModel é uma classe abstrata e que, portanto, não pode ser instanciada, ie, "concretizada"
    def search(self): #aqui nomeia-se uma das funcoes desta classe, neste caso, aquela onde vamos por o codigo q ordenava os docs e ainda classificava a performance do nosso modelo (junto para nao termos q mudar tanto o codigo)
        pass #nao se pode por nada aqui na abstrata, apenas em cada classe "filho" é que se define a função, aqui apenas se nomeia 


VSM Unigram

In [52]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

class VSM(RetrievalModel): #definimos a classe de um dos modelos e pomos o RetrievalModel para dizer q esta classe é uma subclasse da classe abstrata

    def search(self, caseid, docs): #aqui definimos a funcao que faz tudo o q o nosso modelo fazia, pus o codigo ca dentro, pus self.doc em vez de docs 
        index = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
        index.fit(docs)
        X = index.transform(docs)
        query = cases[caseid]
        query_tfidf = index.transform([query])
        doc_scores = 1 - pairwise_distances(X, query_tfidf, metric='cosine')
        scores=doc_scores.tolist()
        return scores              

LMJM Unigram

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter

class LMJM(RetrievalModel):
    def search(self, caseid, docs):
        index = CountVectorizer(ngram_range=(1, 1), analyzer='word')
        X = index.fit(docs)
        corpus_cv = index.transform(docs)
        all_scores=[]
        lmbd = 1
        prob_word_docs = corpus_cv/np.sum(corpus_cv, axis=1)  # p(t|md)
        prob_word_corpus = np.sum(corpus_cv, axis=0)/np.sum(corpus_cv)  # p(t|mc)
        log_mixture = np.log(lmbd*prob_word_docs + (1-lmbd)*prob_word_corpus)
        for caseid in cases:
            query = cases[caseid]
            query_cv = index.transform([query])
            total = log_mixture*query_cv.T
            all_scores.append(total)
        return all_scores

Chamar as classes para obtermos os valores

In [65]:
import pickle
import trec
import numpy as np

# Aqui abrimos cada documento pickle e damos-lhes os nomes para usar nas funcoes seguintes
ids = pickle.load(open("doc_ids.bin", "rb"))
brief_title = pickle.load(open("brief_title.bin", "rb"))
detailed_description = pickle.load(open("detailed_description.bin", "rb"))
brief_summary = pickle.load(open("brief_summary.bin", "rb"))
criteria = pickle.load(open("criteria.bin", "rb"))
gender = pickle.load(open("gender.bin", "rb"))
minimum_age = pickle.load(open("minimum_age.bin", "rb"))
maximum_age = pickle.load(open("maximum_age.bin", "rb"))
cases = pickle.load(open("cases.bin", "rb"))


In [66]:
models = [VSM()]
#LMJM()
fields=[brief_title, detailed_description, brief_summary, criteria]

#campos sem nada-> usar brief_title
for field in fields:
    i=0
    for value in field:  
        if len(value.split()) < 2:
            print(value)
            field[i]=brief_title[i]
        i+=1        

MSD


In [67]:
#separate training and test queries
print(len(cases))
cases_training=[]
cases_test=[]
i=0
k=12
for caseid in cases:
    if i <= 11:
        cases_test.append(caseid)
    else:
        cases_training.append(caseid)
    i+=1
print(len(cases_training))
print(len(cases_test))

60
48
12


In [68]:
#buscar listas
VSM_bt=[]
VSM_dd=[]
VSM_bs=[]
VSM_cr=[]
y_rel=[]

for caseid in cases_training:
    case_rel=[]
    field_ind=0
    aux=eval.relevance_judgments.loc[eval.relevance_judgments['query_id'] == int(caseid)]
    docs= aux['docid'].tolist()
    #print(len(docs))
    relevances= aux['rel'].tolist()
    for rel in relevances:
        if rel==0:
            y_rel.append(rel)
        elif rel==1 or rel==2:
            y_rel.append(1)
    #print(len(relevances))
    for docid in docs:
        case_rel.append(ids.index(docid))
    #print(len(case_rel))
    for model in models:
        for field in fields:
            scores = model.search(caseid, field)
            if(field_ind==0):
                aux=scores
            for rel in case_rel:
                value=scores[rel] if rel < len(scores) else aux[rel]
                if field_ind==0: VSM_bt.append(value) 
                elif field_ind==1: VSM_dd.append(value)
                elif field_ind==2: VSM_bs.append(value) 
                elif field_ind==3: VSM_cr.append(value)
            field_ind+=1
print(len(VSM_bt))
print(len(VSM_dd))
print(len(VSM_bs))
print(len(VSM_cr))
print(len(y_rel))        

2920
2920
2920
2920
2920


In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

fields = [VSM_bt,
          VSM_dd,
          VSM_bs,
          VSM_cr]

VSM_bt_t = []
VSM_dd_t = []
VSM_bs_t = []
VSM_cr_t = []

field_ind = 0
for model in models:
    for field in fields:
        for val in field:
            if field_ind == 0:
                VSM_bt_t.append(val[0])
            elif field_ind == 1:
                VSM_dd_t.append(val[0])
            elif field_ind == 2:
                VSM_bs_t.append(val[0])
            elif field_ind == 3:
                VSM_cr_t.append(val[0])
        field_ind += 1

candidates = {'VSM_bt': VSM_bt_t,
              'VSM_dd': VSM_dd_t,
              'VSM_bs': VSM_bs_t,
              'VSM_cr': VSM_cr_t,
              'Y': y_rel
              }
df = pd.DataFrame(candidates, columns=[
                  'VSM_bt', 'VSM_dd', 'VSM_bs', 'VSM_cr', 'Y'])

x = df[['VSM_bt', 'VSM_dd', 'VSM_bs', 'VSM_cr']]
print(x)
y = df['Y']
print(y)

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0)

clf = LogisticRegression(random_state=0).fit(x, y)
print(clf.coef_)

coefs=clf.coef_


        VSM_bt    VSM_dd    VSM_bs    VSM_cr
0     0.282372  0.012251  0.205578  0.224041
1     0.011587  0.038428  0.192411  0.147885
2     0.022157  0.009642  0.018708  0.003581
3     0.008460  0.024823  0.008676  0.035941
4     0.000000  0.011476  0.005058  0.028100
...        ...       ...       ...       ...
2915  0.000000  0.000000  0.017454  0.032015
2916  0.000000  0.000000  0.013238  0.039843
2917  0.025353  0.025353  0.033201  0.035362
2918  0.121361  0.121361  0.017133  0.029767
2919  0.024516  0.024516  0.035154  0.024513

[2920 rows x 4 columns]
0       0
1       1
2       1
3       0
4       0
       ..
2915    0
2916    0
2917    0
2918    0
2919    0
Name: Y, Length: 2920, dtype: int64
[[-0.24695187 -0.5681461  -0.92322777 -1.34038771]]


In [85]:
fields=[brief_title, detailed_description, brief_summary, criteria]

VSM_bt = []
VSM_dd = []
VSM_bs = []
VSM_cr = []

for caseid in cases_test:
    case_rel=[]
    field_ind=0
    zs=[]
    aux=eval.relevance_judgments.loc[eval.relevance_judgments['query_id'] == int(caseid)]
    docs= aux['docid'].tolist()
    #print(len(docs))
    relevances= aux['rel'].tolist()
    for rel in relevances:
        if rel==0:
            y_rel.append(rel)
        elif rel==1 or rel==2:
            y_rel.append(1)
    #print(len(relevances))
    for docid in docs:
        case_rel.append(ids.index(docid))
    #print(len(case_rel))
    for model in models:
        for field in fields:
            scores = model.search(caseid, field)
            if(field_ind==0):
                aux=scores
            for rel in case_rel:
                value=scores[rel] if rel < len(scores) else aux[rel]
                if field_ind==0: VSM_bt.append(value) 
                elif field_ind==1: VSM_dd.append(value)
                elif field_ind==2: VSM_bs.append(value) 
                elif field_ind==3: VSM_cr.append(value)
            field_ind+=1
    #print(len(VSM_bt))
    #print(len(VSM_dd))
    #print(len(VSM_bs))
    #print(len(VSM_cr))
    #print(len(y_rel))
    for line in range(0,len(VSM_bt)):
        #print(line)
        z= coefs[0][0]*VSM_bt[line][0]+coefs[0][1]*VSM_dd[line][0]+coefs[0][2]*VSM_bs[line][0]+coefs[0][3]*VSM_cr[line][0]
        #print(z)
        zs.append(z)
    #print(zs)
    print(sorted(zs, reverse=False))
    break




[-0.6160150891965648, -0.054810332020369654, -0.2022081604002405, -0.14538324061905952, -0.09382904109690827, -0.07162572773696209, -0.01773447263145267, -0.028038483809670608, -0.09883389065907375, -0.3365501823187646, -0.09371326896009602, -0.19037995038397024, -0.07789469120158314, -0.12079616256006183, -0.0696416412763389, -0.15505877861614525, -0.31231827608675405, -0.2522068838720687, -0.08758021628784501, -0.14266872441861236, -0.03963897918713066, -0.1592839669008993, -0.0961767861724576, -0.09149336615007912, -0.21540432376778185, -0.09312331584651698, -0.1407138993399146, -0.14063634995080135, -0.40818498949171755, -0.13789537466632162, -0.05793845671556214, -0.16793563896890717, -0.1277480780744013, -0.06419925542568843, -0.3816899989964909, -0.13357512586455558, -0.10046616002044281, -0.26996843407878524, -0.02530791434137071, -0.2563556841747082, -0.05888621336553595, -0.14267840309837657, -0.15846237519216722, -0.08261479001930289, -0.07381313480917237, -0.100318805284711

In [None]:
# confirmar que todas as listas teem o mesmo tamanho
    # juntar listas em matriz
    # retirar 20% das listar para teste
# dar matriz a log reg com y à parte (fit)

In [None]:

import matplotlib.pyplot as plt
plt.plot(recall_11point,avg_precision_11point/len(cases))

NameError: name 'recall_11point' is not defined