# Setup

### Acquire Data and Queries

In [1]:
# pip install ir_datasets
import ir_datasets
# Handle for the "clinicaltrials/2021" collection; later cells read its documents via dataset.docs_iter().
dataset = ir_datasets.load("clinicaltrials/2021")

In [142]:
# for doc in dataset.docs_iter():
#     print(doc) # namedtuple<doc_id, title, condition, summary, detailed_description, eligibility>
#     break

In [58]:
# !pip install pandas
import pandas as pd

# The queries file is tab-separated with no header row; label its columns
# 'id' and 'query' while reading rather than patching them on afterwards.
queries = pd.read_csv(
    'queries_2021.tsv',
    sep='\t',
    header=None,
    names=['id', 'query'],
)

### Install ES package, connect to ES service

In [5]:
# !pip install elasticsearch==7.9.1
# !pip install elasticsearch==7.17.3

In [2]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch node running on this machine.
# The node address has to be supplied through `hosts`; `HOST=` / `PORT=` are
# not client parameters, so the earlier call only reached the server because
# the 7.x client falls back to localhost:9200 when no hosts are given.
es = Elasticsearch(hosts=["http://localhost:9200"])

# Load Data into Elasticsearch

In [3]:
# Index definition (settings + field mappings) passed to es.indices.create.
# NOTE(review): the only explicitly mapped field is "content", while the
# documents indexed later carry doc_id / title / condition / summary /
# detailed_description / eligibility -- confirm whether this field mapping is
# ever exercised or whether those fields rely on dynamic mapping.
mapping = {
    "settings": {
        "index": {"number_of_shards": 1, "number_of_replicas": 1},
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "fielddata": True,
                "term_vector": "with_positions_offsets_payloads",
                "store": True,
                "analyzer": "whitespace",
            },
        },
    },
}


In [19]:
#Delete any previously created index named 'ir3'
# ignore=[404, 400] asks the client not to raise on those HTTP statuses, so the cell can be re-run whether or not the index exists.
es.indices.delete(index='ir3', ignore=[404, 400])

#Create index with the mapping defined
# `mapping` is the settings/mappings dict built in the previous cell; the server's response is printed as confirmation.
response_createIndex = es.indices.create(index='ir3', body=mapping)
print("Create Index Response:", response_createIndex)

Create Index Response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'ir3'}


In [24]:
#Load data into the index
from elasticsearch import helpers

# Stream every document into 'ir3' through the bulk API instead of issuing one
# HTTP request per document. Each document keeps its 0-based position in
# dataset.docs_iter() as its _id, exactly as before.
# The docs are namedtuples, so they are converted with _asdict() explicitly:
# passing the tuple itself only yields a JSON object when the client happens to
# serialise with a namedtuple-aware JSON backend.
actions = (
    {"_index": "ir3", "_id": doc_num, "_source": doc._asdict()}
    for doc_num, doc in enumerate(dataset.docs_iter())
)
helpers.bulk(es, actions)

In [171]:
#Test if the load worked
# Fetch the document stored under _id 0 from 'ir3' and display the raw response.
resp = es.get(index='ir3', id=0)
resp

{'_index': 'ir3',
 '_type': '_doc',
 '_id': '0',
 '_version': 1,
 '_seq_no': 1,
 '_primary_term': 1,
 'found': True,
 '_source': {'doc_id': 'NCT00000102',
  'title': 'Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets',
  'condition': '',
  'summary': '\n    \n      This study will test the ability of extended release nifedipine (Procardia XL), a blood\r\n      pressure medication, to permit a decrease in the dose of glucocorticoid medication children\r\n      take to treat congenital adrenal hyperplasia (CAH).\r\n    \n  ',
  'detailed_description': '\n    \n      This protocol is designed to assess both acute and chronic effects of the calcium channel\r\n      antagonist, nifedipine, on the hypothalamic-pituitary-adrenal axis in patients with\r\n      congenital adrenal hyperplasia. The multicenter trial is composed of two phases and will\r\n      involve a double-blind, placebo-controlled parallel design. The goal of Phase I is to examine\r\n      the ability of

# Run Queries

In [189]:
#Check queries df
# Display the queries DataFrame (columns 'id' and 'query') as a sanity check.
queries

Unnamed: 0,id,query
0,1,Patient is a 45-year-old man with a history of...
1,2,"48 M with a h/o HTN hyperlipidemia, bicuspid a..."
2,3,A 32 yo woman who presents following a severe ...
3,4,"This is a 44 year old female with PMH of PCOS,..."
4,5,"74M hx of CAD s/p CABG, EF 60% prior CVA (no r..."
...,...,...
70,71,The patient is a 34-year-old obese woman who c...
71,72,The patient is a 16-year-old girl recently dia...
72,73,The patient is a 3-day-old female infant with ...
73,74,The patient is a 53-year-old man complaining o...


### Check example query on ES

In [185]:
# Take the text of the query at row label 0 and display it in full.
qtext = queries['query'][0]
qtext

'Patient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle.'

In [221]:
#For the query above, this prints the document ID and the score of all the 10000 hits

qtext = queries['query'][0]

# A bool query with a single "match" clause on detailed_description; because
# minimum_should_match is 1, that clause has to match. "size" requests up to
# 10000 hits in one response.
ex_query = {
    "size": 10000,
    "query": {
        "bool": {
            "should": [
                {"match": {"detailed_description": qtext}},
            ],
            "minimum_should_match": 1,
            "boost": 1.0,
        },
    },
}

resp = es.search(index='ir3', body=ex_query)

# One output line per returned hit: "<_id> <_score>".
r = resp['hits']['hits']
for hit in r:
    print(hit['_id'], hit['_score'])

2747 121.18297
281211 118.12887
77341 115.93323
301409 109.383545
228162 109.16138
115278 108.085304
3264 106.902214
3617 106.7193
273159 106.28603
11251 106.13031
3460 103.85905
3052 103.07001
50472 102.102455
69117 102.01482
190709 101.20835
31514 100.072945
8370 99.95274
248942 98.61068
249793 98.26966
9505 97.83431
7781 95.78285
77953 95.622665
27994 95.19537
265919 94.982925
7865 94.96411
156380 94.44123
234327 94.392845
5138 93.94244
42789 93.493195
67779 92.44464
117996 92.28985
5650 92.06358
2754 91.930374
104632 91.642136
3512 90.83241
237853 90.79851
99130 89.77252
99257 89.77252
102279 89.631355
301846 89.5385
192753 89.53044
144311 89.4078
5307 89.274025
272010 88.50763
2800 88.47651
44007 88.44545
257836 88.43932
59673 88.058044
201873 88.058044
8707 87.74103
5891 87.640656
247458 87.33175
289672 87.23899
69858 86.94662
242363 86.8019
30207 86.77617
283668 86.75169
69080 86.43513
2996 86.35016
278336 85.48928
39985 85.446655
278899 85.30604
2989 85.27442
63891 85.248375
27

295981 54.863087
96551 54.86171
98120 54.859787
256199 54.85954
14213 54.858967
282372 54.857716
237812 54.85747
153024 54.856575
339877 54.852924
181223 54.851097
274724 54.848774
16282 54.845924
65053 54.845352
125429 54.844467
332270 54.835835
252769 54.829197
189311 54.82702
242458 54.82497
104891 54.824646
360119 54.824646
14084 54.82122
338050 54.812824
3540 54.81104
49697 54.80696
322314 54.80689
191248 54.806232
373803 54.80494
351587 54.802532
145805 54.797497
8279 54.795643
244357 54.784134
34684 54.783028
286751 54.779877
24057 54.7774
236793 54.772606
140021 54.762768
180401 54.761482
342025 54.757236
163756 54.756573
148231 54.755722
15155 54.755474
281923 54.75123
329075 54.74999
299423 54.74781
348859 54.745792
373202 54.74337
239678 54.735847
302417 54.7267
52732 54.724987
161156 54.723278
276287 54.722946
189128 54.71996
365706 54.71489
75416 54.714153
111618 54.714073
371453 54.713264
148925 54.70594
153340 54.704487
270630 54.699356
364158 54.694897
56828 54.691578
6

364706 50.054096
101840 50.052776
124209 50.050735
4044 50.049423
264782 50.04914
285451 50.04496
248943 50.044586
103613 50.04422
346106 50.042824
65242 50.042297
134096 50.041874
177212 50.04076
181034 50.038857
192231 50.036434
195424 50.035954
343011 50.03486
216510 50.034786
144839 50.032352
190389 50.03054
361177 50.023483
237833 50.021137
201699 50.020798
7882 50.016613
257330 50.01396
103285 50.01369
323064 50.012215
205656 50.011868
311310 50.010345
303714 50.00884
342768 50.007755
124690 50.00584
228026 50.004616
73995 50.003838
183841 50.003418
2990 50.003082
326555 50.00141
39169 49.999573
124932 49.99862
220565 49.99791
313402 49.997585
59732 49.995514
38266 49.99513
307082 49.994644
175854 49.994328
8460 49.992626
211948 49.991917
59303 49.98922
273076 49.987904
375062 49.98578
169992 49.983536
260088 49.982853
36092 49.981747
115666 49.979557
183633 49.9793
62381 49.976696
249183 49.974236
272379 49.97408
261144 49.973186
163190 49.970066
266409 49.96987
241965 49.967846

194181 47.216312
44202 47.21624
268973 47.215923
292754 47.214386
209692 47.213997
13909 47.21169
36704 47.211212
175318 47.210735
168156 47.209557
99766 47.209232
104383 47.208836
157027 47.20863
106529 47.208122
363127 47.206997
71979 47.20469
168179 47.202553
217001 47.200176
337357 47.199818
140968 47.19955
130232 47.199448
188387 47.199223
221458 47.19819
214156 47.197742
254230 47.19695
167019 47.19528
159708 47.19351
164437 47.191525
166448 47.19147
111704 47.19144
259274 47.190884
243638 47.190273
340278 47.189865
83195 47.189484
180907 47.187088
181975 47.18643
195882 47.185234
26239 47.184303
357072 47.184223
35111 47.184216
355337 47.184143
283049 47.183823
340943 47.17758
339473 47.17756
105744 47.17659
326940 47.176544
229526 47.174915
41741 47.17473
23872 47.17177
57302 47.171425
17908 47.170963
23345 47.169815
201138 47.16683
63686 47.16621
89313 47.165726
153875 47.1644
167346 47.16387
56933 47.16351
117650 47.162247
28880 47.159294
278027 47.158794
156628 47.156944
265

266063 45.244232
358225 45.24385
126760 45.243576
207277 45.242725
232338 45.241955
30766 45.240814
68172 45.239883
189251 45.239235
215788 45.238533
294177 45.237537
298700 45.23732
153976 45.237007
282668 45.23686
136029 45.236248
42335 45.23493
284202 45.234146
53643 45.233685
26470 45.232647
237639 45.232327
312047 45.231644
128560 45.231014
103301 45.228443
240538 45.22823
54842 45.22789
320579 45.22733
193270 45.22598
236132 45.224674
49791 45.224022
113226 45.219543
330716 45.218803
338369 45.21752
229858 45.216736
331714 45.21573
337830 45.21546
35070 45.21442
240405 45.21409
14735 45.213886
2750 45.213036
192575 45.212288
233185 45.211727
304843 45.21125
267853 45.210415
2091 45.207813
249732 45.205006
5593 45.20455
113661 45.20455
254710 45.204254
355031 45.20413
363854 45.20405
232119 45.2039
34095 45.202797
47178 45.20169
170532 45.201527
208303 45.201374
27553 45.20055
229915 45.20018
28559 45.198627
267676 45.197655
28888 45.196144
247346 45.196087
335441 45.19509
348518 

In [222]:
# Number of hits actually returned in the example response.
len(resp['hits']['hits'])

10000

In [223]:
# Container type holding the returned hits.
type(resp['hits']['hits'])

list

### Run all queries


**TODO:** Decide how to persist the hits.

The cell below appends each query's complete list of hits to `hits_list`, which should probably change. Maybe we only need the docID and the score of each hit?

In [207]:
def build_description_query(query_text, size=10000):
    """Build the search body that matches free text against `detailed_description`.

    Args:
        query_text: The query text to match.
        size: Maximum number of hits to request (defaults to 10000).

    Returns:
        dict: A request body suitable for ``es.search(body=...)``.
    """
    return {
        "size": size,
        "query": {
            "bool": {
                "should": [
                    {"match": {"detailed_description": query_text}},
                ],
                "minimum_should_match": 1,
                "boost": 1.0,
            },
        },
    }


# hits_list[i] holds the raw list of hit dicts returned for the i-th query,
# in the row order of the `queries` DataFrame.
hits_list = []

# Iterate over the queries themselves instead of a hard-coded range(75), so the
# loop always matches the number of rows actually loaded, and avoid rebinding
# the builtin name `id`.
for query_num, query_text in enumerate(queries['query']):
    bool_query = build_description_query(query_text)

    resp = es.search(index='ir3', body=bool_query)
    # NOTE(review): the reported total may be a lower bound rather than an exact
    # count (see resp['hits']['total']['relation']) -- confirm before relying on it.
    print("Query ", query_num, " got %d Hits" % resp['hits']['total']['value'])

    hits_list.append(resp['hits']['hits'])

Query  0  got 10000 Hits:
Query  1  got 10000 Hits:
Query  2  got 10000 Hits:
