# Setup

### Acquire Data and Queries

In [1]:
# pip install ir_datasets
import ir_datasets
dataset = ir_datasets.load("clinicaltrials/2021")

In [142]:
# for doc in dataset.docs_iter():
#     print(doc) # namedtuple<doc_id, title, condition, summary, detailed_description, eligibility>
#     break

In [58]:
# !pip install pandas
import pandas as pd
queries = pd.read_csv('queries_2021.tsv', sep='\t', header=None)
queries.columns = ['id', 'query']

### Install ES package, connect to ES service

In [5]:
# !pip install elasticsearch==7.9.1
# !pip install elasticsearch==7.17.3

In [2]:
from elasticsearch import Elasticsearch
es = Elasticsearch(HOST='http://localhost', PORT='9200')

# Load Data into ElasticSearch

In [3]:
#define mapping to store data
mapping = {
    "settings":
    {
        "index":
        {
            "number_of_shards":1,
            "number_of_replicas":1
        }
    },
    "mappings":
    {
        "properties":
        {
            "content":
            {
                "type":"text",
                "fielddata":True,
                "term_vector":"with_positions_offsets_payloads",
                "store":True,
                "analyzer":"whitespace"
            }
        }
    }
}


In [19]:
#Delete any previously created index named 'ir3'
es.indices.delete(index='ir3', ignore=[404, 400])

#Create index with the mapping defined
response_createIndex = es.indices.create(index='ir3', body=mapping)
print("Create Index Response:", response_createIndex)

Create Index Response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'ir3'}


In [24]:
#Load data into the index

id = 0
for doc in dataset.docs_iter():
    es.index('ir3', id=id, body=doc)
    id+=1

In [171]:
#Test if the load worked
resp = es.get(index='ir3', id=0)
resp

{'_index': 'ir3',
 '_type': '_doc',
 '_id': '0',
 '_version': 1,
 '_seq_no': 1,
 '_primary_term': 1,
 'found': True,
 '_source': {'doc_id': 'NCT00000102',
  'title': 'Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets',
  'condition': '',
  'summary': '\n    \n      This study will test the ability of extended release nifedipine (Procardia XL), a blood\r\n      pressure medication, to permit a decrease in the dose of glucocorticoid medication children\r\n      take to treat congenital adrenal hyperplasia (CAH).\r\n    \n  ',
  'detailed_description': '\n    \n      This protocol is designed to assess both acute and chronic effects of the calcium channel\r\n      antagonist, nifedipine, on the hypothalamic-pituitary-adrenal axis in patients with\r\n      congenital adrenal hyperplasia. The multicenter trial is composed of two phases and will\r\n      involve a double-blind, placebo-controlled parallel design. The goal of Phase I is to examine\r\n      the ability of

# Run Queries

In [167]:
qtext = queries['query'][25]
qtext

"A 45-year-old woman was referred to the emergency department with abdominal pain lasting about 4 days accompanied by nausea and 2 episodes of vomiting. The pain is localized to the epigastric region and radiates to the right upper quadrant. The pain is worsening after eating fatty food. The patient experienced similar pain twice in the past year. Her past medical history is remarkable for hypercholesterolemia and two C/sections. She has 2 children, and she is menopausal. She doesn't smoke, drink alcohol, or use illicit drugs. She is mildly febrile. Her BP is 150/85, HR 115, RR 15, T 38.2, SpO2 98% on RA. On palpation, she experiences epigastric tenderness  and  tenderness in the right upper quadrant without rebound. Bowel sounds are normal. Laboratory analysis is remarkable for elevated ESR and leukocytosis with a left shift. The ultrasound revealed several gallstones and biliary sludge. The largest gallstone is 0.7cm. Surgery consultation recommends elective cholecystectomy."

In [172]:
qtext = queries['query'][0]

ex_query ={
"size": 10000,
"query": {
"bool": {
"should": [
{"match": {"Content": qtext}}
]
,"minimum_should_match": 1,
"boost": 1.0
}
}
}

resp = es.search(index='ir3', body = ex_query)
resp

{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [64]:
#Check queries df
queries

Unnamed: 0,id,query
0,1,Patient is a 45-year-old man with a history of...
1,2,"48 M with a h/o HTN hyperlipidemia, bicuspid a..."
2,3,A 32 yo woman who presents following a severe ...
3,4,"This is a 44 year old female with PMH of PCOS,..."
4,5,"74M hx of CAD s/p CABG, EF 60% prior CVA (no r..."
...,...,...
70,71,The patient is a 34-year-old obese woman who c...
71,72,The patient is a 16-year-old girl recently dia...
72,73,The patient is a 3-day-old female infant with ...
73,74,The patient is a 53-year-old man complaining o...


In [170]:
for id in range(75):
    
    
    query_text = queries['query'][id]
    
    bool_query = {
        "size": 10000,
        "query": 
        {
            "bool": 
            {
                "should": 
                [
                    {
                        "match": 
                        {
                            "Content": query_text
                        }
                    }
                ],
                "minimum_should_match": 1,
                "boost": 1.0
            }
        }
    }
    
    resp = es.search(index='ir3', body=bool_query)
    print("Query ",id," got %d Hits:" % resp['hits']['total']['value'])

Query  0  got 0 Hits:
Query  1  got 0 Hits:
Query  2  got 0 Hits:
Query  3  got 0 Hits:
Query  4  got 0 Hits:
Query  5  got 0 Hits:
Query  6  got 0 Hits:
Query  7  got 0 Hits:
Query  8  got 0 Hits:
Query  9  got 0 Hits:
Query  10  got 0 Hits:
Query  11  got 0 Hits:
Query  12  got 0 Hits:
Query  13  got 0 Hits:
Query  14  got 0 Hits:
Query  15  got 0 Hits:
Query  16  got 0 Hits:
Query  17  got 0 Hits:
Query  18  got 0 Hits:
Query  19  got 0 Hits:
Query  20  got 0 Hits:
Query  21  got 0 Hits:
Query  22  got 0 Hits:
Query  23  got 0 Hits:
Query  24  got 0 Hits:
Query  25  got 0 Hits:
Query  26  got 0 Hits:
Query  27  got 0 Hits:
Query  28  got 0 Hits:
Query  29  got 0 Hits:
Query  30  got 0 Hits:
Query  31  got 0 Hits:
Query  32  got 0 Hits:
Query  33  got 0 Hits:
Query  34  got 0 Hits:
Query  35  got 0 Hits:
Query  36  got 0 Hits:
Query  37  got 0 Hits:
Query  38  got 0 Hits:
Query  39  got 0 Hits:
Query  40  got 0 Hits:
Query  41  got 0 Hits:
Query  42  got 0 Hits:
Query  43  got 0 Hits