In [1]:
%%capture
!pip install whoosh
!wget https://tufts.box.com/shared/static/rm3jy760ci9tpvmqk4fmb4ybgbivyphj.zip -O ./tuskegee_index.zip
!unzip ./tuskegee_index.zip

# Tuskegee Syphilis Study Simple Search Engine

In the application below, you can search for anything in the [documents on the origin and development of the Tuskegee syphilis study](https://collections.nlm.nih.gov/catalog/nlm:nlmuid-2934097R-root), recently published by the National Institutes of Health (NIH). Though these data are available, they are not accessible. There is no native tool for indexing or searching through them. Using this tool, you can begin to explore this sad episode in American history.

#### To get started, please go to "Cell > Run All"

In [2]:
from whoosh.index import open_dir
# Open the Whoosh index stored in the "tuskegee_index" directory (presumably the
# directory unpacked from tuskegee_index.zip by the setup cell — confirm).
# `ix` is used by the search handler for its schema and its searchers.
ix = open_dir("tuskegee_index")

In [3]:
import ipywidgets as widgets
from whoosh import query
from whoosh.qparser import QueryParser
from IPython.display import display, HTML, clear_output
import re

def on_button_click(b):
    """Search the index for the text in the search bar and render the hits.

    Used as the click handler for the Search button. Reads the query string
    from ``search_bar``, parses it against the ``text`` field of ``ix``, and
    writes one heading per hit, followed by its highlighted excerpts, into
    the ``output`` widget. Any previous results are cleared first.

    Parameters
    ----------
    b
        The button instance passed in by ``Button.on_click``; not used.

    Returns
    -------
    None
    """
    html_template = """
    <p>{hit}</p>
    <hr/>
    """.strip()

    # Excerpts within a hit are joined by "..." directly between two word
    # characters. Lookbehind/lookahead split at that separator without
    # consuming the neighbouring characters, so no letters are trimmed from
    # the excerpts. The raw string avoids invalid-escape warnings.
    fragment_break = re.compile(r'(?<=\w)\.\.\.(?=\w)')

    # Everything runs inside the output context so that an error raised while
    # parsing or searching is rendered in the output area rather than lost.
    with output:
        clear_output()

        # Named `parsed_query` so the imported `whoosh.query` module is not shadowed.
        parsed_query = QueryParser("text", ix.schema).parse(search_bar.value)

        with ix.searcher() as searcher:
            results = searcher.search(parsed_query, limit=None)
            results.fragmenter.maxchars = 1500
            results.fragmenter.surround = 350

            # Computed once instead of on every loop iteration.
            total = len(results)
            if total == 0:
                # Give explicit feedback instead of leaving the area blank.
                display(HTML("<p>No results found.</p>"))
                return

            for position, hit in enumerate(results, start=1):
                display(HTML(f"<h3>Hit {position} of {total}</h3>"))
                display(HTML(f"<h4>{hit['title']}</h4>"))
                highlighted = hit.highlights("text").replace("\n\n", "")
                for excerpt in fragment_break.split(highlighted):
                    display(HTML(html_template.format(hit=excerpt)))
                
# Area that the click handler clears and fills with search hits.
output = widgets.Output()

# Free-text query box; the click handler reads its current value.
search_bar = widgets.Text(value="", placeholder="Search anything", disabled=False)

# Search button; clicking it runs on_button_click.
button = widgets.Button(description="Search")
button.on_click(on_button_click)

### Start searching

#### Searching tips:
* If you'd like to search for just a single term, you can enter it in the box below. 
* If you'd like to search for a phrase, you can enclose it in quotations, such as "serious complications".
* A query like "serious complications"~5 would return results where "serious" and "complications" are at most 5 words away from each other.
* AND can be used as a boolean operator and will return results where two terms are both in a passage. AND is automatically placed in a query of two words, so 'latent syphilis' is internally represented as latent AND syphilis.
* OR can be used as a boolean operator and will return results where either of two terms is in a passage.
* NOT can be used as a boolean operator and will return results which do not include the term following the NOT.
* From these boolean operators, one can construct complex queries like: syphilis AND hospitals NOT "serious complications". This query would return results that have both syphilis and hospitals in them, but do not have "serious complications".
* Parentheses can be used to group boolean statements. For example, the query syphilis AND ("serious complications" OR hospitals) would return results that have syphilis and either serious complications or hospitals in them. 
* If you'd like to search in a specific date range, you can specify it with the year: field, writing each date as YYYYMMDD. For example, year:[19500101 TO 19600101] syphilis would return results dated between January 1st, 1950 and January 1st, 1960 that have syphilis in them.

In [4]:
# Render the query box, the Search button and the results area, in that order.
display(search_bar, button, output)

Text(value='', placeholder='Search anything')

Button(description='Search', style=ButtonStyle())

Output()

In [5]:
# Emit a small HTML/JavaScript snippet as this cell's output:
#   * code_toggle() hides or shows every `div.input` element (selected via `$`)
#     and flips the `code_show` flag on each call;
#   * it is registered to run once when the document is ready, and since
#     `code_show` starts out true, that first call hides the inputs;
#   * the form's submit button calls code_toggle() again to switch back and forth.
# NOTE(review): relies on `$` (jQuery) being provided by the notebook front end —
# confirm it is available wherever this notebook is served.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')