# Setup

### Acquire Data and Queries

In [1]:
# Load the "clinicaltrials/2021" collection through the ir_datasets package.
# If the package is missing, install it first: pip install ir_datasets
import ir_datasets
dataset = ir_datasets.load("clinicaltrials/2021")

In [142]:
#Test if dataset is loaded
# for doc in dataset.docs_iter():
#     print(doc) # namedtuple<doc_id, title, condition, summary, detailed_description, eligibility>
#     break

In [58]:
# Read the tab-separated query file (it has no header row) into a DataFrame
# and label its columns 'id' and 'query' in the same expression.
import pandas as pd
queries = (
    pd.read_csv('queries_2021.tsv', sep='\t', header=None)
    .set_axis(['id', 'query'], axis='columns')
)

### Install ES package, connect to ES service

In [5]:
# !pip install elasticsearch==7.9.1

In [2]:
# Connect to the local Elasticsearch node.
# The client selects its endpoint through the `hosts` argument. The previous
# HOST=/PORT= keywords are not parameters the client uses to pick a node, so the
# connection only worked because localhost:9200 is the client's default.
from elasticsearch import Elasticsearch
es = Elasticsearch(hosts=['http://localhost:9200'])

# Load Data into ElasticSearch

In [3]:
# Index definition passed to es.indices.create: one shard, one replica, and an
# explicit mapping for a whitespace-analysed text field named "content".
# NOTE(review): the documents retrieved later in this notebook show fields such as
# 'title', 'summary' and 'detailed_description' but no 'content' field, so this
# field mapping may never apply to the indexed data - confirm the intended field names.
index_settings = {
    "number_of_shards": 1,
    "number_of_replicas": 1,
}

content_field = {
    "type": "text",
    "fielddata": True,
    "term_vector": "with_positions_offsets_payloads",
    "store": True,
    "analyzer": "whitespace",
}

mapping = {
    "settings": {"index": index_settings},
    "mappings": {"properties": {"content": content_field}},
}


In [19]:
# Drop any existing 'ir3' index so the cell can be re-run from a clean state.
# HTTP 404 and 400 error responses from the delete call are ignored.
es.indices.delete(index='ir3', ignore=[404, 400])

# Create 'ir3' with the settings/mapping held in `mapping` and show the server reply.
response_createIndex = es.indices.create(index='ir3', body=mapping)
print("Create Index Response:", response_createIndex)

Create Index Response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'ir3'}


In [24]:
# Load data into the index, one document per request.
# Each document id is its position in the dataset iteration order (0, 1, 2, ...).
#
# docs_iter() yields namedtuples. Python's standard json module encodes a
# namedtuple as a JSON array rather than an object, so passing the raw tuple only
# produces field-keyed documents when the active serializer special-cases
# namedtuples. Converting with _asdict() makes the document shape explicit.
#
# The counter is named doc_num (not `id`) so the builtin id() is not shadowed.
for doc_num, doc in enumerate(dataset.docs_iter()):
    es.index(index='ir3', id=doc_num, body=doc._asdict())

In [171]:
# Sanity check that the load worked: fetch the document stored under id 0
# from 'ir3' and display the raw response.
resp = es.get(index='ir3', id=0)
resp

{'_index': 'ir3',
 '_type': '_doc',
 '_id': '0',
 '_version': 1,
 '_seq_no': 1,
 '_primary_term': 1,
 'found': True,
 '_source': {'doc_id': 'NCT00000102',
  'title': 'Congenital Adrenal Hyperplasia: Calcium Channels as Therapeutic Targets',
  'condition': '',
  'summary': '\n    \n      This study will test the ability of extended release nifedipine (Procardia XL), a blood\r\n      pressure medication, to permit a decrease in the dose of glucocorticoid medication children\r\n      take to treat congenital adrenal hyperplasia (CAH).\r\n    \n  ',
  'detailed_description': '\n    \n      This protocol is designed to assess both acute and chronic effects of the calcium channel\r\n      antagonist, nifedipine, on the hypothalamic-pituitary-adrenal axis in patients with\r\n      congenital adrenal hyperplasia. The multicenter trial is composed of two phases and will\r\n      involve a double-blind, placebo-controlled parallel design. The goal of Phase I is to examine\r\n      the ability of

# Run Queries

In [189]:
# Display the queries DataFrame (columns 'id' and 'query') to check it loaded
queries

Unnamed: 0,id,query
0,1,Patient is a 45-year-old man with a history of...
1,2,"48 M with a h/o HTN hyperlipidemia, bicuspid a..."
2,3,A 32 yo woman who presents following a severe ...
3,4,"This is a 44 year old female with PMH of PCOS,..."
4,5,"74M hx of CAD s/p CABG, EF 60% prior CVA (no r..."
...,...,...
70,71,The patient is a 34-year-old obese woman who c...
71,72,The patient is a 16-year-old girl recently dia...
72,73,The patient is a 3-day-old female infant with ...
73,74,The patient is a 53-year-old man complaining o...


### Check example query on ES

In [185]:
# Use the 'query' text at row label 0 as the worked example, and display it
qtext = queries['query'][0]
qtext

'Patient is a 45-year-old man with a history of anaplastic astrocytoma of the spine complicated by severe lower extremity weakness and urinary retention s/p Foley catheter, high-dose steroids, hypertension, and chronic pain. The tumor is located in the T-L spine, unresectable anaplastic astrocytoma s/p radiation. Complicated by progressive lower extremity weakness and urinary retention. Patient initially presented with RLE weakness where his right knee gave out with difficulty walking and right anterior thigh numbness. MRI showed a spinal cord conus mass which was biopsied and found to be anaplastic astrocytoma. Therapy included field radiation t10-l1 followed by 11 cycles of temozolomide 7 days on and 7 days off. This was followed by CPT-11 Weekly x4 with Avastin Q2 weeks/ 2 weeks rest and repeat cycle.'

In [225]:
# Run the example query and keep up to 10000 hits in `r`.
# Every hit carries the document ID and its score; the loop that prints them
# is left commented out at the bottom of the cell.

qtext = queries['query'][0]

# Single full-text clause: match the query text against 'detailed_description' only
description_clause = {"match": {"detailed_description": qtext}}

ex_query = {
    "size": 10000,
    "query": {
        "bool": {
            "should": [description_clause],
            "minimum_should_match": 1,
            "boost": 1.0,
        }
    },
}

resp = es.search(index='ir3', body=ex_query)

# List of hit dicts returned for the example query
r = resp['hits']['hits']
# for hit in r:
#     print(hit['_id'], hit['_score'])

##### Note: 
The query above searches only the 'detailed_description' field of each document in the dataset. TODO: confirm this is the intended behavior. Should we also query the 'summary' field?

In [230]:
# Display only the first entry of the hit list returned for the example query
r[0]

{'_index': 'ir3',
 '_type': '_doc',
 '_id': '2747',
 '_score': 121.18297,
 '_source': {'doc_id': 'NCT00003176',
  'title': 'Temozolomide and Carmustine in Treating Patients With Anaplastic Glioma',
  'condition': '',
  'summary': '\n    \n      RATIONALE: Drugs used in chemotherapy use different ways to stop tumor cells from dividing so\r\n      they stop growing or die. Combining more than one drug may kill more tumor cells.\r\n\r\n      PURPOSE: Phase II trial to study the effectiveness of temozolomide and carmustine in treating\r\n      patients with anaplastic glioma.\r\n    \n  ',
  'detailed_description': "\n    \n      OBJECTIVES: I. Evaluate the activity, measured in terms of progression free survival, of\r\n      carmustine plus temozolomide in recurrent glioblastoma. II. Estimate the response rate of\r\n      recurrent glioblastomas to this combination. III. Estimate the response rate of newly\r\n      diagnosed anaplastic astrocytomas and mixed anaplastic glioma to this comb

In [226]:
# Number of hits actually returned in the response for the example query
len(resp['hits']['hits'])

10000

In [227]:
# Check which Python container type holds the returned hits
type(resp['hits']['hits'])

list

### Run all queries


**TODO:** Decide how to persist the hits.

`hits_list` in the cell below currently stores the complete hit list of every query, which should probably change. Maybe we only need the docID and the score?

In [207]:
def build_description_query(query_text, size=10000):
    """Build the search body matching `query_text` against 'detailed_description'.

    Args:
        query_text: Free-text query string to match.
        size: Maximum number of hits requested from the search.

    Returns:
        dict: Request body suitable for ``es.search(body=...)``.
    """
    return {
        "size": size,
        "query": {
            "bool": {
                "should": [
                    {"match": {"detailed_description": query_text}}
                ],
                "minimum_should_match": 1,
                "boost": 1.0,
            }
        },
    }


# hits_list[i] holds the list of hit dicts returned for the i-th query.
hits_list = []

# Iterate over the queries that were actually loaded instead of a hard-coded
# range(75), so a query file with a different number of rows neither raises a
# KeyError nor gets silently truncated. The counter is not called `id`, which
# would shadow the builtin.
for query_idx, query_text in enumerate(queries['query']):

    bool_query = build_description_query(query_text)

    resp = es.search(index='ir3', body=bool_query)
    # NOTE: this is the total reported by Elasticsearch, which by default may be
    # capped rather than exact (see the track_total_hits search option).
    print("Query ", query_idx, " got %d Hits" % resp['hits']['total']['value'])

    hits_list.append(resp['hits']['hits'])

Query  0  got 10000 Hits:
Query  1  got 10000 Hits:
Query  2  got 10000 Hits:
