# Histograms

## Related Symptoms Histogram

In [None]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import os

# Production DB credentials come from the environment so they never appear
# in the notebook itself.
pu, pp = os.environ['prod_u'], os.environ['prod_p']

print(f'sqlalchemy version: {sqlalchemy.__version__}')

# Local alternative, kept for reference:
# engine = create_engine('mysql+pymysql://root@localhost:3600/questionnaire_tagging')

# Engine for the questionnaire_tagging schema served at localhost:3307.
p_engine = create_engine(
    f'mysql+pymysql://{pu}:{pp}@localhost:3307/questionnaire_tagging'
)
# Session = sessionmaker(bind=p_engine)
# session = Session()

# Rambam engine
# ru = os.environ['ram_u']
# rp = os.environ['ram_p']
# r_engine = create_engine(f'mysql+pymysql://{ru}:{rp}@localhost:3310/rambam')

### Build the query string

In [None]:
# Ids used in the IN (...) filter below, with the label each one was noted
# under (presumably category names -- the filter is on qstn.category_id):
#   'additional symptoms'      11
#   'neurological symptoms'    19
#   'prodromal symptoms'       73
#   'recent related symptoms' 257
#   'following symptoms'      258
question_ids = [str(qid) for qid in (11, 19, 73, 257, 258)]
ids_list = ','.join(question_ids)

# Count how often each tagging_responses.value occurs among valid answers to
# valid questions in the selected categories, most frequent first.
query_str = f"""
SELECT tg_res.value AS symptoms, COUNT(*) AS number_of_times FROM
    (SELECT ansr.id AS a_id, ansr.valid AS a_val, qstn.valid AS q_valid, qstn.category_id AS q_cat, qstn.id as q_id
    FROM questionnaire_tagging.questions qstn INNER JOIN questionnaire_tagging.answers ansr
    ON qstn.id = ansr.question_id
    WHERE qstn.valid = 1 AND qstn.category_id IN ({ids_list}) AND ansr.valid = 1 AND clinical_entity_id = 194) q_a
INNER JOIN questionnaire_tagging.tagging_responses tg_res
ON tg_res.value = q_a.a_id
GROUP BY tg_res.value
ORDER BY number_of_times DESC;
"""

print(query_str)

### Run the query

In [None]:
import pandas as pd

# Run the histogram query against the production engine; the result has the
# two columns named in the SELECT: `symptoms` and `number_of_times`.
related_symtoms_as_answer_ids_df = pd.read_sql_query(query_str, p_engine)

In [None]:
# Preview the 20 most frequent values (the query already orders by count, descending).
display(related_symtoms_as_answer_ids_df.head(20))

### Answer/Symptom lookup table

In [None]:
# Lookup table answer id -> answer name, restricted to valid answers.
# NOTE: this rebinds the notebook-wide `query_str`.
query_str = """
SELECT id, name
FROM questionnaire_tagging.answers
WHERE valid=1
"""
# Index by `id` so rows can be fetched with ans_lookup_table.loc[<answer id>].
ans_lookup_table = pd.read_sql_query(query_str, p_engine, index_col='id')

In [None]:
# Spot-check a single answer id in the lookup table.
display(ans_lookup_table.loc[8527])

# display(ans_lookup_table['iz' in ans_lookup_table['name']])
# Replace missing names with '' so the string match below yields booleans only.
df = ans_lookup_table.fillna('')
# str.match anchors at the start of the string, so this keeps only answers
# whose name begins with "diz" or "Diz".
dizziness_df = df[df['name'].str.match('[dD]iz')]
display(dizziness_df)

### Fetch Dizziness Cases

In [None]:
# Fetch visits info from rambam - to query rambam uncomment r_engine init at the head of the file.

# query_str = "SELECT * FROM full_visit_info"
# rambam_visits_df = pd.read_sql(query_str, r_engine)
# rambam_visits_df.to_pickle('./rambam_visits')
# Load the cached copy written by the commented-out lines above.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# './rambam_visits' file that was produced locally by this notebook.
rambam_visits_df = pd.read_pickle('./rambam_visits')
# display(rambam_visits_df.head(5))


In [None]:
# Read task ids with dizziness

# Quoted, comma-separated ids of every answer in dizziness_df.
# NOTE(review): diz_ids is only referenced by the commented-out query below;
# the active query hard-codes the pattern '10120' instead -- confirm intent.
ids_strs = [f"'{i}'" for i in dizziness_df.index] 
diz_ids = ','.join(ids_strs)

# query_str = f"""
# SELECT DISTINCT tg_res.task_id AS ttask_id FROM
#    (SELECT ansr.id AS ansr_id, ansr.valid AS a_valid, qstn.valid AS q_valid, qstn.category_id AS q_cat, qstn.id as q_id
#     FROM questionnaire_tagging.questions qstn INNER JOIN questionnaire_tagging.answers ansr
#     ON qstn.id = ansr.question_id
#     WHERE qstn.valid = 1 AND qstn.category_id IN (11,19,73,257,258) AND ansr.valid = 1) q_a
# INNER JOIN questionnaire_tagging.tagging_responses tg_res
# ON tg_res.value = q_a.ansr_id
# WHERE tg_res.value in ({diz_ids});
# """

# Distinct tagging tasks whose response value contains '10120'.
# The percent signs are doubled in the SQL text (presumably to survive the DB
# driver's %-formatting -- verify against the driver in use).
query_str = f"""
SELECT DISTINCT tg_res.task_id FROM
    (SELECT ansr.id AS ansr_id, ansr.valid AS a_valid, qstn.valid AS q_valid, qstn.category_id AS q_cat, qstn.id as q_id
    FROM questionnaire_tagging.questions qstn INNER JOIN questionnaire_tagging.answers ansr
    ON qstn.id = ansr.question_id
    WHERE qstn.valid = 1 AND qstn.category_id IN (11,19,73,257,258) AND ansr.valid = 1) q_a
INNER JOIN questionnaire_tagging.tagging_responses tg_res
ON tg_res.value = q_a.ansr_id
WHERE tg_res.value LIKE '%%10120%%';
"""

display(query_str)
task_ids_df = pd.read_sql(query_str, p_engine)

display(task_ids_df)
# Persist the task ids; the DataFrame index is written as the first CSV column.
task_ids_df.to_csv('./tagging_tasks_with_diziness.csv')


### Arrange X, Y of the histogram

In [None]:
# Accumulate the total number of uses per individual answer id.
histo_dic = {}
for _, row in related_symtoms_as_answer_ids_df.iterrows():
    num_of_times = row['number_of_times']
    # A `symptoms` value may pack several answer ids separated by ';'.
    # Splitting a value without ';' simply yields that one id, so a single
    # loop covers both shapes.
    for item in row['symptoms'].split(';'):
        histo_dic[item] = histo_dic.get(item, 0) + num_of_times

# Sort by nums
dict(sorted(histo_dic.items(), key=lambda item: item[1], reverse=True))


In [None]:
# One row per answer id (the dict keys become the index), with its total
# count in the single column `num_of_uses`.
histo_df = pd.DataFrame.from_dict(histo_dic, orient='index', columns=['num_of_uses'])
# display(histo_df.head(3))
# display(histo_df.index[1:4])

def getSymName(symp_id):
    """Return the answer name stored in ``ans_lookup_table`` for ``symp_id``.

    ``symp_id`` is converted with ``int()`` before the lookup, so numeric
    strings are accepted. An id missing from the table raises ``KeyError``.
    """
    answer_id = int(symp_id)
    return ans_lookup_table.loc[answer_id, 'name']

# Resolve every answer id in the index to its readable name.
histo_df['name'] = histo_df.index.map(getSymName)
# display(histo_df.head(3))
    

In [None]:
from bokeh.io import output_notebook, show
# Configure Bokeh to render plots inline in the notebook.
output_notebook()

In [None]:
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
from bokeh.transform import factor_cmap

# Plotting frame: symptom name and its number of uses, both taken from
# histo_df (and therefore sharing its index).
df = pd.DataFrame({
    'syms': histo_df['name'].astype(str),
    'nums': histo_df['num_of_uses'].astype(int),
})

# Several answer ids can carry the same name; merge them into one total.
df_sum = df.groupby('syms').sum()
display(df_sum)

# Most used symptom first, with `syms` back as a regular column.
df_srt = df_sum.sort_values('nums', ascending=False).reset_index()
display(df_srt)


In [None]:
# Bar chart of symptom usage: one bar per symptom name, already sorted by count.
source = ColumnDataSource(df_srt)

# The categorical x axis takes its factors (and their order) from df_srt['syms'].
# NOTE(review): plot_width/plot_height were replaced by width/height in newer
# Bokeh releases -- confirm against the installed version.
p = figure(plot_width=800, plot_height=800, title="Number of headache related symptoms",
           x_range=df_srt['syms'], toolbar_location=None, tools="")

p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Symptoms"
# Rotate the tick labels (angle in radians) so long symptom names stay readable.
p.xaxis.major_label_orientation = 1.2

p.vbar(x='syms', top='nums', width=1, source=source,
       line_color="white", 
       hover_line_color="darkgrey")

# The toolbar is disabled above, so the hover tool is attached explicitly.
p.add_tools(HoverTool(tooltips=[("Symptom", "@syms"), ("Uses", "@nums")]))

show(p)

## Manual Fixes to CSV

### Map tagger diagnoses to Amer

In [None]:
# Maps a diagnosis name as entered by the taggers to the diagnosis label used
# in the mapped CSV. Target labels are reproduced exactly as they are
# (including their spelling), because they end up verbatim in the output.
tagger_diagnosis_mapping = {
    # --- general headache ---
    "headache, unspecified": "headache general",
    "unknown diagnosis": "headache general",
    "headache general": "headache general",
    "Unspecified viral infection": "headache general",
    "Head injury, unspecified": "headache general",
    "dental disorder": "headache general",
    "Contusion of face, scalp, and neck except eye(s)": "headache general",
    "Dehydration": "headache general",
    "headache secondary to hypertension": "headache general",
    "Bacterial infection, unspecified": "headache general",
    # --- migraine ---
    "migraine": "migraine",
    "other forms of migraine": "migraine",
    # --- tension type (folded into general headache) ---
    "tension type headache": "headache general",
    # --- cluster ---
    "cluster headache": "cluster headache",
    # --- drug induced ---
    "Drug induced headache, not elsewhere classified": "drug induced",
    # --- intracranial pressure ---
    "Benign intracranial hypertension": "benign intracranial pressure",
    "hydrocephalus": "hydrocephalus",
    "increased intracranial pressure (ICP)": "increased intracranial pressure (ICP)",
    # --- haemorrhage / stroke ---
    "Intracranial hemorrhage (ICH)": "SOL_CVA",
    "cva (ischemic)": "SOL_CVA",
    "Extradural hemorrhage": "SOL_CVA",
    "Subarachnoid hemorrhage": "SOL_CVA",
    "transient ischemic attack": "SOL_CVA",
    # --- miscellaneous ---
    "cervicalgia": "other",
    "uveitis": "other",
    "Unspecified disorder of the teeth and supporting structures": "other",
    "Intracranial space-occupying lesion": "SOL_CVA",
    "cerebral vasculitis": "cerebral vasculitis",
    # --- meningitis / encephalitis ---
    "meningitis": "meningitis",
    "Unspecified viral meningitis": "meningitis",
    "encephalitis": "meningitis",
    "Other causes of encephalitis and encephalomyelitis": "meningitis",
    # --- sinusitis ---
    "sinusitis": "sinusitis",
    "Other disease of nasal cavity and sinuses": "sinusitis",
    # --- tumours ---
    "brain tumor": "SOL_CVA",
    "Neoplasm of unspecified nature of brain": "SOL_CVA",
    # --- post-procedural ---
    "Other postprocedural status": "Other postprocedural status",
    "Reaction to spinal or lumbar puncture": "reaction to spinal or lumbar puncture",
    # --- neuralgias ---
    "Occipital neuralgia": "occipital nueralgia",
    "Postherpetic trigeminal neuralgia": "trigeminal nueralgia",
    "trigeminal neuralgia": "trigeminal nueralgia",
    # --- arteritis ---
    "giant cell arteritis (GCA, Temporal arteritis)": "GCA",
    # --- remaining entries ---
    "abscess in CNS": "SOL_CVA",
    "Atypical face pain": "trigeminal nueralgia",
    "post concussion syndrome": "post concussion syndrome",
}


In [None]:
# Sanity check: look up one key of the mapping (the notebook echoes the value).
tagger_diagnosis_mapping['uveitis']

In [None]:
import pandas as pd

# Load the exported task diagnoses from the home directory.
csv_df = pd.read_csv('~/tasks_diagnoses.csv')
display(csv_df)

In [None]:
# Clean the tagger diagnosis names: missing values become the placeholder
# 'Nan' and tab characters are removed.
raw_diagnoses = csv_df['dignosis_name'].fillna('Nan').str.replace('\t', '')
# Translate through tagger_diagnosis_mapping; names without a (non-empty)
# mapping are kept unchanged.
csv_df['amer_diagnosis'] = raw_diagnoses.apply(
    lambda name: tagger_diagnosis_mapping.get(name) or name
)


In [None]:
# Inspect the last 20 rows to eyeball the new amer_diagnosis column.
display(csv_df.tail(20))

In [None]:
# Write the mapped table next to the input; the DataFrame index is written as
# the first CSV column because `index=False` is not passed.
csv_df.to_csv('~/tasks_diagnoses_mapped.csv')

# Q/As to Concepts

## Find the concepts for list of Q/A

In [28]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd
import os

# Credentials for the concepts database, read from the environment.
# A missing variable raises KeyError here, before any connection is attempted.
cu = os.environ['con_u']
cp = os.environ['con_p']

In [29]:
# Local questionnaire_tagging database. Open one connection as a smoke test;
# the `with` block releases it even if something fails in between.
engine = create_engine('mysql+pymysql://root@localhost:3306/questionnaire_tagging')
with engine.connect() as connection:
    print('local db connected')

# Remote dr_concepts database.
conn_str = f'mysql+pymysql://{cu}:{cp}@35.226.187.69:3306/dr_concepts'
c_engine = create_engine(conn_str)
# Connect through c_engine, not engine: the previous check reused the local
# engine here, so the message was printed without ever reaching the concepts
# database.
with c_engine.connect() as connection:
    print('concepts db connected')

local db connected
concepts db connected


In [39]:
	# Feature names to resolve to concepts. Each entry is either
	# '<question text>: <answer text>' or a bare feature name without an
	# answer part (e.g. 'one_sided_pain').
	# NOTE(review): the last entry ends with a trailing space -- confirm it is
	# intentional, since the text is used verbatim in the lookups.
	features_list = [
		'do you currently suffer, or have you previously suffered from any of the following?: cluster headache',
		'are any of the following relevant to your lifestyle?: I drink alcohol on a daily basis',
		'did any of the following precede the headache?: drinking alcohol', 'one_sided_pain',
		'Are you currently experiencing any of the following?: lacrimation',
		'Are you currently experiencing any of the following?: nausea',
		'Are you currently experiencing any of the following?: vomiting',
		'Are you currently experiencing any of the following?: cold or stuffy nose',
		'Are you currently experiencing any of the following?: change in eye color or appearance ']

In [44]:
def _res_array_for_col(res_proxy, col_name):
    results = []
    for row in res_proxy:
        results.append(row[col_name])

    return results


def get_q_ids(q_str):
    """Return the ids of all questions whose translated title equals ``q_str``.

    The lookup runs against ``questionnaire_tagging.question_translation``
    through the notebook-level ``engine``.

    Args:
        q_str: Question title to look up.

    Returns:
        List of ``question_id`` values; empty when no title matches.
    """
    # The title is passed as a bound parameter rather than interpolated into
    # the SQL text, so quotes or percent signs in a title cannot break (or
    # alter) the statement.
    query = sqlalchemy.text("""
    SELECT question_id
    FROM questionnaire_tagging.question_translation
    WHERE title = :title;
    """)
    results_proxy = engine.execute(query, {'title': q_str})
    try:
        return _res_array_for_col(results_proxy, 'question_id')
    finally:
        # Release the cursor even if reading the rows fails.
        results_proxy.close()

def get_a_ids(a_str):
    """Return the ids of all answers whose text equals ``a_str``.

    ``questionnaire_tagging.answers.name`` is searched first; only when that
    yields nothing is ``questionnaire_tagging.answer_translation.title``
    searched. Both lookups use the notebook-level ``engine``.

    Args:
        a_str: Answer text to look up.

    Returns:
        List of answer ids; empty when neither table has a match.
    """
    # (statement, result column) pairs, tried in order until one returns rows.
    # The text is a bound parameter, so quotes or percent signs in an answer
    # cannot break (or alter) the statement.
    lookups = (
        ("""
    SELECT id FROM questionnaire_tagging.answers
    WHERE name = :answer
    """, 'id'),
        ("""
    SELECT answer_id FROM questionnaire_tagging.answer_translation
    WHERE title = :answer
    """, 'answer_id'),
    )

    results = []
    for sql, column in lookups:
        results_proxy = engine.execute(sqlalchemy.text(sql), {'answer': a_str})
        try:
            results = _res_array_for_col(results_proxy, column)
        finally:
            # Release the cursor even if reading the rows fails.
            results_proxy.close()
        if results:
            break
    return results
    

def get_concept(q_id, a_id):
    """Return the concept name(s) mapped to a question/answer pair.

    Reads ``dr_concepts.dr_concepts_questionnaire_mapping`` through the
    notebook-level ``c_engine`` and resolves the first mapped concept id with
    ``get_concept_name``.

    Args:
        q_id: Question id.
        a_id: Answer id.

    Returns:
        Whatever ``get_concept_name`` returns for the first mapped concept id
        (a list of names), or an empty list when the pair has no mapping.
    """
    # Bound parameters instead of values formatted into the SQL text.
    query = sqlalchemy.text("""
    SELECT dr_concept_id
    FROM dr_concepts.dr_concepts_questionnaire_mapping
    WHERE question_id = :q_id AND answer_id = :a_id
    """)
    results_proxy = c_engine.execute(query, {'q_id': q_id, 'a_id': a_id})
    try:
        concept_ids = _res_array_for_col(results_proxy, 'dr_concept_id')
    finally:
        # Release the cursor even if reading the rows fails.
        results_proxy.close()

    # No mapping: answer right away instead of issuing a second query for the
    # concept id None.
    if not concept_ids:
        return []

    return get_concept_name(concept_ids[0])

def get_concept_name(concept_id):
    """Return the names stored in ``dr_concepts.dr_concepts`` for ``concept_id``.

    Despite the singular name, the result is a list: callers in this notebook
    index into it (``result[0]``), so that shape is kept.

    Args:
        concept_id: Concept id to resolve; ``None`` means "no concept".

    Returns:
        List of matching ``name`` values; empty when ``concept_id`` is
        ``None`` or no row matches.
    """
    # Nothing to look up; avoids querying for the literal string 'None'.
    if concept_id is None:
        return []

    # The id is bound as text, matching the quoted literal this query has
    # always compared against, but as a bound parameter rather than formatted
    # into the SQL.
    query = sqlalchemy.text("""
    SELECT name
    FROM dr_concepts.dr_concepts
    WHERE id = :concept_id
    """)
    results_proxy = c_engine.execute(query, {'concept_id': str(concept_id)})
    try:
        return _res_array_for_col(results_proxy, 'name')
    finally:
        # Release the cursor even if reading the rows fails.
        results_proxy.close()

def find_the_concept(q_ids, a_ids):
    """Return the first concept found for any question/answer id combination.

    Pairs are tried answer by answer and, for each answer, question by
    question; the search stops at the first pair for which ``get_concept``
    returns a non-empty result, and that result's first element is returned.
    ``None`` is returned when no pair yields a concept.
    """
    # Lazy generator: get_concept is only called until the first hit.
    lookups = (get_concept(qid, aid) for aid in a_ids for qid in q_ids)
    first_hit = next((found for found in lookups if found), None)
    return first_hit[0] if first_hit else None

# def get_concept_name(concept_id):
    
    
# print(get_q_ids('does the pain radiate to any of the following?'))
# print(get_concept(766, 10125))

In [45]:
# Resolve every feature string to a concept name. Features are expected in
# the form '<question>: <answer>'; anything without an answer part is mapped
# to itself.
mapper = {}
for qastr in features_list:
    qa = qastr.split(': ')
    q = qa[0]
    a = qa[1] if len(qa) > 1 else None

    # No answer part: report it and keep the feature name as its own concept.
    if not a:
        print(f'ERROR - {q} has NO ANSWER')
        mapper[qastr] = q
        continue

    q_ids = get_q_ids(q)
    a_ids = get_a_ids(a)

    # Unknown answer text: nothing to look up for this feature.
    if not a_ids:
        print(f'ERROR: no answers with text: {a}')
        continue

    concept = find_the_concept(q_ids, a_ids)
    if concept:
        # print(f'q:{q}, a:{a} => c:{concept}')
        mapper[qastr] = concept
        continue

    # Ids were found but none of the combinations is mapped to a concept:
    # dump the candidates for manual inspection.
    print(f"{'<*>'*8} concept not found {'<*>'*8}")
    print(f"q: {q} - a: {a}")
    display(q_ids)
    display(a_ids)
    print('<*>'*20)

display(mapper)

<*><*><*><*><*><*><*><*> concept not found <*><*><*><*><*><*><*><*>
q: are any of the following relevant to your lifestyle? - a: I drink alcohol on a daily basis


[2750, 3339]

[9235, 9681]

<*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*>
ERROR - one_sided_pain has NO ANSWER
ERROR: no answers with text: lacrimation
<*><*><*><*><*><*><*><*> concept not found <*><*><*><*><*><*><*><*>
q: Are you currently experiencing any of the following? - a: nausea


[3238]

[34,
 54,
 106,
 592,
 651,
 918,
 948,
 1058,
 1327,
 1868,
 1910,
 1992,
 2042,
 2075,
 2098,
 2119,
 2177,
 2196,
 2236,
 2553,
 2581,
 2621,
 2829,
 2955,
 2985,
 3064,
 3167,
 3192,
 3212,
 3272,
 3673,
 3696,
 3714,
 3757,
 3780,
 3819,
 4174,
 4246,
 4372,
 4402,
 4660,
 4680,
 4749,
 4989,
 5179,
 5286,
 5302,
 5488,
 5700,
 5799,
 5820,
 5844,
 5929,
 5974,
 5993,
 6030,
 6072,
 6096,
 6159,
 6262,
 6282,
 6307,
 6335,
 6427,
 6542,
 6606,
 6638,
 6670,
 6935,
 7332,
 8253,
 8624,
 8710,
 8858,
 8891,
 8901,
 9099,
 9276,
 9401,
 9674,
 9984,
 10121,
 10208]

<*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*>
<*><*><*><*><*><*><*><*> concept not found <*><*><*><*><*><*><*><*>
q: Are you currently experiencing any of the following? - a: vomiting


[3238]

[153,
 250,
 347,
 550,
 594,
 631,
 650,
 701,
 1293,
 1328,
 1423,
 1484,
 1580,
 1677,
 1774,
 2043,
 2141,
 2310,
 2407,
 2513,
 2778,
 3211,
 3226,
 3274,
 3571,
 3670,
 3693,
 3736,
 3754,
 3777,
 3798,
 3816,
 3849,
 3946,
 4042,
 4459,
 4556,
 4679,
 4694,
 4779,
 4876,
 4991,
 5033,
 5164,
 5441,
 5485,
 5525,
 5544,
 5649,
 5800,
 5936,
 5990,
 6031,
 7301,
 7306,
 7311,
 7318,
 7346,
 8252,
 8623,
 8711,
 8859,
 8892,
 8902,
 9100,
 9157,
 9162,
 9277,
 9388,
 9402,
 9475,
 9956,
 9986,
 10025,
 10209]

<*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*>
<*><*><*><*><*><*><*><*> concept not found <*><*><*><*><*><*><*><*>
q: Are you currently experiencing any of the following? - a: cold or stuffy nose


[3238]

[5147, 6041]

<*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*>
<*><*><*><*><*><*><*><*> concept not found <*><*><*><*><*><*><*><*>
q: Are you currently experiencing any of the following? - a: change in eye color or appearance 


[3238]

[9139, 9234]

<*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*><*>


{'do you currently suffer, or have you previously suffered from any of the following?: cluster headache': 'episodic cluster headache',
 'did any of the following precede the headache?: drinking alcohol': 'headache following ingestible alcohol',
 'one_sided_pain': 'one_sided_pain'}