# Histograms

## Related Symptoms Histogram

In [None]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import os

# Database user / password are read from the environment; a missing variable
# raises KeyError here, before any connection is attempted.
pu = os.environ['prod_u']
pp = os.environ['prod_p']


print(f'sqlalchemy version: {sqlalchemy.__version__}' )
# engine = create_engine('mysql+pymysql://root@localhost:3600/questionnaire_tagging')
# Engine for the questionnaire_tagging schema reached through localhost:3307.
# NOTE(review): the credentials are interpolated into the URL unescaped; a
# password containing characters such as '@' or '/' would presumably need
# URL-encoding first - confirm.
p_engine = create_engine(f'mysql+pymysql://{pu}:{pp}@localhost:3307/questionnaire_tagging')
# Session = sessionmaker(bind=p_engine)
# session = Session()

# Rambam engine
# ru = os.environ['ram_u']
# rp = os.environ['ram_p']
# r_engine = create_engine(f'mysql+pymysql://{ru}:{rp}@localhost:3310/rambam')

### Build the query string

In [None]:
"""
'additional symptoms', 11
'neurological symptoms', 19
'prodromal symptoms', 73
'recent related symptoms', 257
'following symptoms', 258
"""
# The ids listed in the string above, kept as strings so they can be joined
# straight into the SQL text.
# NOTE(review): the list is called question_ids but it is interpolated into
# the `qstn.category_id IN (...)` filter below - confirm which kind of id
# these are.
question_ids = ['11','19','73','257','258']
ids_list = ','.join(question_ids)
# Counts how often each tagging response value occurs among the valid answers
# of the valid questions selected by the filter, most frequent first.
# A response value may hold several ';'-separated answer ids (the histogram
# cell further down splits them).
# NOTE(review): confirm how the `tg_res.value = q_a.a_id` join treats such
# multi-id values.
query_str = f"""
SELECT tg_res.value AS symptoms, COUNT(*) AS number_of_times FROM
    (SELECT ansr.id AS a_id, ansr.valid AS a_val, qstn.valid AS q_valid, qstn.category_id AS q_cat, qstn.id as q_id
    FROM questionnaire_tagging.questions qstn INNER JOIN questionnaire_tagging.answers ansr
    ON qstn.id = ansr.question_id
    WHERE qstn.valid = 1 AND qstn.category_id IN ({ids_list}) AND ansr.valid = 1 AND clinical_entity_id = 194) q_a
INNER JOIN questionnaire_tagging.tagging_responses tg_res
ON tg_res.value = q_a.a_id
GROUP BY tg_res.value
ORDER BY number_of_times DESC;
"""

print(query_str)

### Run the query

In [None]:
import pandas as pd

related_symtoms_as_answer_ids_df = pd.read_sql_query(query_str, p_engine)

In [None]:
display(related_symtoms_as_answer_ids_df.head(20))

### Answer/Symptom lookup table

In [None]:
# Lookup table answer id -> answer name, restricted to valid answers.
query_str = """
SELECT id, name
FROM questionnaire_tagging.answers
WHERE valid=1
"""
# The id column becomes the index so a row can be fetched with .loc[<id>].
ans_lookup_table = pd.read_sql_query(query_str, p_engine, index_col='id')

In [None]:
# Spot-check a single answer id in the lookup table.
display(ans_lookup_table.loc[8527])

# display(ans_lookup_table['iz' in ans_lookup_table['name']])
# Replace missing names with '' so the string match below yields only booleans.
df = ans_lookup_table.fillna('')
# Keep the answers whose name starts with 'diz' or 'Diz'.
# NOTE(review): str.match only matches at the beginning of the name, so a name
# such as 'feeling of dizziness' is not selected - confirm this is intended
# (str.contains would match anywhere).
dizziness_df = df[df['name'].str.match('[dD]iz')]
display(dizziness_df)

### Fetch Dizziness Cases

In [None]:
# Fetch visits info from rambam - to query rambam uncomment r_engine init at the head of the file.

# query_str = "SELECT * FROM full_visit_info"
# rambam_visits_df = pd.read_sql(query_str, r_engine)
# rambam_visits_df.to_pickle('./rambam_visits')
rambam_visits_df = pd.read_pickle('./rambam_visits')
# display(rambam_visits_df.head(5))


In [None]:
# Read the task ids whose tagging responses mention dizziness

# Quote every answer id found in dizziness_df and join them into a
# comma-separated list suitable for an SQL IN (...) clause.
# NOTE(review): diz_ids is only referenced by the commented-out query below;
# the active query filters on the hard-coded pattern '%%10120%%' instead -
# confirm this is intended.
ids_strs = [f"'{i}'" for i in dizziness_df.index] 
diz_ids = ','.join(ids_strs)

# query_str = f"""
# SELECT DISTINCT tg_res.task_id AS ttask_id FROM
#    (SELECT ansr.id AS ansr_id, ansr.valid AS a_valid, qstn.valid AS q_valid, qstn.category_id AS q_cat, qstn.id as q_id
#     FROM questionnaire_tagging.questions qstn INNER JOIN questionnaire_tagging.answers ansr
#     ON qstn.id = ansr.question_id
#     WHERE qstn.valid = 1 AND qstn.category_id IN (11,19,73,257,258) AND ansr.valid = 1) q_a
# INNER JOIN questionnaire_tagging.tagging_responses tg_res
# ON tg_res.value = q_a.ansr_id
# WHERE tg_res.value in ({diz_ids});
# """

# Distinct task ids whose response value contains '10120', limited to valid
# answers of valid questions in categories 11, 19, 73, 257 and 258.
query_str = f"""
SELECT DISTINCT tg_res.task_id FROM
    (SELECT ansr.id AS ansr_id, ansr.valid AS a_valid, qstn.valid AS q_valid, qstn.category_id AS q_cat, qstn.id as q_id
    FROM questionnaire_tagging.questions qstn INNER JOIN questionnaire_tagging.answers ansr
    ON qstn.id = ansr.question_id
    WHERE qstn.valid = 1 AND qstn.category_id IN (11,19,73,257,258) AND ansr.valid = 1) q_a
INNER JOIN questionnaire_tagging.tagging_responses tg_res
ON tg_res.value = q_a.ansr_id
WHERE tg_res.value LIKE '%%10120%%';
"""

display(query_str)
task_ids_df = pd.read_sql(query_str, p_engine)

display(task_ids_df)
# Persist the task ids next to the notebook (the DataFrame index is written too).
task_ids_df.to_csv('./tagging_tasks_with_diziness.csv')


### Arrange X, Y of the histogram

In [None]:
# Accumulate the number of uses per individual answer id. A 'symptoms' value
# may carry several ids separated by ';'; each of them is credited with the
# full count of its row.
histo_dic = {}
for _, record in related_symtoms_as_answer_ids_df.iterrows():
    ans_ids = record['symptoms']
    num_of_times = record['number_of_times']
    items = ans_ids.split(';') if ';' in ans_ids else [ans_ids]
    for item in items:
        histo_dic[item] = histo_dic.get(item, 0) + num_of_times

# Show the totals ordered by count, highest first (histo_dic itself is unchanged).
dict(sorted(histo_dic.items(), key=lambda pair: pair[1], reverse=True))


In [None]:
histo_df = pd.DataFrame.from_dict(histo_dic, orient='index', columns=['num_of_uses'])
# display(histo_df.head(3))
# display(histo_df.index[1:4])

def getSymName(symp_id):
    """Return the ``name`` stored in ``ans_lookup_table`` for ``symp_id``.

    ``symp_id`` is converted with ``int`` before being used as the index label.
    """
    lookup_key = int(symp_id)
    answer_row = ans_lookup_table.loc[lookup_key]
    return answer_row['name']

histo_df['name'] = histo_df.index.map(getSymName)
# display(histo_df.head(3))
    

In [None]:
from bokeh.io import output_notebook, show
output_notebook()

In [None]:
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
from bokeh.transform import factor_cmap

# Two-column frame (symptom name, number of uses) taken from histo_df, then
# summed per name and ordered from most to least used.
df = pd.DataFrame({
    'syms': histo_df['name'].astype(str),
    'nums': histo_df['num_of_uses'].astype(int),
})
df_sum = df.groupby('syms').sum()
display(df_sum)
df_srt = df_sum.sort_values(by='nums', ascending=False).reset_index()
display(df_srt)


In [None]:
# Bar chart of df_srt: one bar per symptom name ('syms'), bar height = 'nums'.
source = ColumnDataSource(df_srt)

# The categorical x axis takes its order from df_srt['syms'].
# NOTE(review): newer Bokeh releases name these arguments width / height
# instead of plot_width / plot_height - confirm against the installed version.
p = figure(plot_width=800, plot_height=800, title="Number of headache related symptoms",
           x_range=df_srt['syms'], toolbar_location=None, tools="")

p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Symptoms"
# Rotate the tick labels (value is in radians).
p.xaxis.major_label_orientation = 1.2

p.vbar(x='syms', top='nums', width=1, source=source,
       line_color="white", 
       hover_line_color="darkgrey")

# Tooltip showing the symptom name and its count for the hovered bar.
p.add_tools(HoverTool(tooltips=[("Symptom", "@syms"), ("Uses", "@nums")]))

show(p)

## Manual Fixes to CSV

### Map tagger diagnoses to Amer

In [None]:
# Maps a diagnosis name as it appears in the tasks CSV (key) to the coarser
# diagnosis label written to the 'amer_diagnosis' column (value).
# Lookups are exact and case-sensitive; names that are not listed here are
# left unchanged by the cell that applies the mapping.
# NOTE(review): several values are spelled 'nueralgia' - confirm whether the
# consumers of the mapped CSV expect this spelling before changing it.
tagger_diagnosis_mapping = {
	"headache, unspecified": "headache general",
	"unknown diagnosis": "headache general",
	"headache general": "headache general",
	"Unspecified viral infection": "headache general",
	"Head injury, unspecified": "headache general",
	"dental disorder": "headache general",
	"Contusion of face, scalp, and neck except eye(s)": "headache general",
	"Dehydration": "headache general",
	"headache secondary to hypertension": "headache general",
	"Bacterial infection, unspecified": "headache general",

	"migraine": "migraine",
	"other forms of migraine": "migraine",

	"tension type headache": "headache general",

	"cluster headache": "cluster headache",

	"Drug induced headache, not elsewhere classified": "drug induced",

	"Benign intracranial hypertension": "benign intracranial pressure",
	"hydrocephalus": "hydrocephalus",
	"increased intracranial pressure (ICP)": "increased intracranial pressure (ICP)",

	"Intracranial hemorrhage (ICH)": "SOL_CVA",
	"cva (ischemic)": "SOL_CVA",
	'Extradural hemorrhage': 'SOL_CVA',
	'Subarachnoid hemorrhage': 'SOL_CVA',
	'transient ischemic attack': 'SOL_CVA',

	'cervicalgia': 'other',
	'uveitis': 'other',
	'Unspecified disorder of the teeth and supporting structures': 'other',
	'Intracranial space-occupying lesion': 'SOL_CVA',
	"cerebral vasculitis": "cerebral vasculitis",

	"meningitis": "meningitis",
	"Unspecified viral meningitis": "meningitis",
	"encephalitis": "meningitis",
	'Other causes of encephalitis and encephalomyelitis': 'meningitis',

	"sinusitis": "sinusitis",
	"Other disease of nasal cavity and sinuses": "sinusitis",

	"brain tumor": "SOL_CVA",
	"Neoplasm of unspecified nature of brain": "SOL_CVA",

	"Other postprocedural status": "Other postprocedural status",
	"Reaction to spinal or lumbar puncture": "reaction to spinal or lumbar puncture",

	"Occipital neuralgia": "occipital nueralgia",

	"Postherpetic trigeminal neuralgia": "trigeminal nueralgia",
	"trigeminal neuralgia": "trigeminal nueralgia",

	"giant cell arteritis (GCA, Temporal arteritis)": "GCA",

	"abscess in CNS": "SOL_CVA",
	"Atypical face pain": "trigeminal nueralgia",

	"post concussion syndrome": "post concussion syndrome"}


In [None]:
tagger_diagnosis_mapping['uveitis']

In [None]:
import pandas as pd

csv_df = pd.read_csv('~/tasks_diagnoses.csv')
display(csv_df)

In [None]:
# Clean the raw diagnosis names (missing -> 'Nan', tab characters removed),
# then translate the ones known to tagger_diagnosis_mapping; any other name
# is carried over unchanged.
cleaned_names = csv_df['dignosis_name'].fillna('Nan').str.replace('\t', '')
csv_df['amer_diagnosis'] = cleaned_names.apply(
    lambda name: tagger_diagnosis_mapping.get(name) or name
)


In [None]:
display(csv_df.tail(20))

In [None]:
csv_df.to_csv('~/tasks_diagnoses_mapped.csv')

# Q/As to Concepts

## Find the concepts for list of Q/A

In [2]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd
from pandas import Series
import os

cu = os.environ['con_u']
cp = os.environ['con_p']

In [3]:
engine = create_engine('mysql+pymysql://root@localhost:3306/questionnaire_tagging')
# Open one connection and release it again to confirm the local DB is reachable.
with engine.connect() as connection:
    print('local db connected')

local db connected


In [4]:
# Engine for the dr_concepts database; the user and password come from the
# con_u / con_p environment variables.
conn_str = f'mysql+pymysql://{cu}:{cp}@35.226.187.69:3306/dr_concepts'
c_engine = create_engine(conn_str)
# Connect through c_engine (not the local `engine`) so the message below is
# only printed when the concepts DB itself is reachable. The context manager
# releases the connection even if something fails in between.
with c_engine.connect() as connection:
    print('concepts db connected')

concepts db connected


In [None]:
	# Feature labels to be resolved to concepts. Each entry has the form
	# '<question title>: <answer text>'; the processing cell splits it on ': '
	# and treats an entry without that separator as a question with no answer.
	features_list = [
		'did any of the following precede the headache?: a procedure in the head area',
		'did any of the following precede the headache?: dental treatment',
		'do you currently suffer, or have you previously suffered from any of the following?: inflammation of the brain',
		'general do you currently suffer, or have you previously suffered from any of the following?: diabetes',
		'general do you currently suffer, or have you previously suffered from any of the following?: kidney disease',
		'general do you currently suffer, or have you previously suffered from any of the following?: congestive heart failure',
		'did you have a fever since the headache started?',
		'have you recently suffered from one of the following?: confusion',
		'have you recently suffered from one of the following?: seizures',
		'do you have any of the following symptoms?: change in vision',
		'do you have any of the following symptoms?: weakness on one side of face',
		'do you have any of the following symptoms?: limb weakness',
		'do you have any of the following symptoms?: speech difficulty',
		'do you have any of the following symptoms?: decreased or changed sensation in face',
		'do you have any of the following symptoms?: change in hearing',
		'do you have any of the following symptoms?: decreased or changed sensation in limb',
		'please specify where the pain is located: back of the head or the neck area']

In [None]:
def _res_array_for_col(res_proxy, col_name):
    results = []
    for row in res_proxy:
        results.append(row[col_name])

    return results


def get_q_ids(q_str):
    """Return the question ids whose translated title equals ``q_str``.

    The title is passed as a bound parameter instead of being pasted into the
    SQL text, so titles containing quotes or other special characters are
    compared literally and cannot break the statement.

    Args:
        q_str: Title to look up in ``question_translation.title``.

    Returns:
        list: ``question_id`` of every matching row (empty when none match).
    """
    from sqlalchemy import text  # local import: only this helper needs it

    query = text("""
    SELECT question_id
    FROM questionnaire_tagging.question_translation
    WHERE title = :title
    """)
    results_proxy = engine.execute(query, {'title': q_str})
    try:
        return _res_array_for_col(results_proxy, 'question_id')
    finally:
        # Release the cursor even when reading the rows fails.
        results_proxy.close()

def get_a_ids(a_str):
    """Return the answer ids whose text equals ``a_str``.

    ``answers.name`` is searched first; only when that yields nothing is
    ``answer_translation.title`` searched. The text is passed as a bound
    parameter, so quotes or other special characters in it are compared
    literally and cannot break the statement.

    Args:
        a_str: Answer text to look up.

    Returns:
        list: Matching answer ids (empty when neither table has a match).
    """
    from sqlalchemy import text  # local import: only this helper needs it

    # (statement, column holding the answer id), tried in this order.
    lookups = (
        ("""
    SELECT id FROM questionnaire_tagging.answers
    WHERE name = :label
    """, 'id'),
        ("""
    SELECT answer_id FROM questionnaire_tagging.answer_translation
    WHERE title = :label
    """, 'answer_id'),
    )

    results = []
    for sql, column in lookups:
        results_proxy = engine.execute(text(sql), {'label': a_str})
        try:
            results = _res_array_for_col(results_proxy, column)
        finally:
            # Release the cursor even when reading the rows fails.
            results_proxy.close()
        if results:
            break
    return results
    

def get_concept(q_id, a_id):
    """Return the concept name(s) mapped to a question/answer pair.

    The first ``dr_concept_id`` found in ``dr_concepts_questionnaire_mapping``
    for the pair is resolved through ``get_concept_name``. When the pair has
    no mapping an empty list is returned directly, without issuing a second
    query for a non-existent id.

    Args:
        q_id: Question id.
        a_id: Answer id.

    Returns:
        list: Whatever ``get_concept_name`` returns for the mapped concept,
        or ``[]`` when the pair is not mapped.
    """
    from sqlalchemy import text  # local import: only this helper needs it

    query = text("""
    SELECT dr_concept_id
    FROM dr_concepts.dr_concepts_questionnaire_mapping
    WHERE question_id = :q_id AND answer_id = :a_id
    """)
    results_proxy = c_engine.execute(query, {'q_id': q_id, 'a_id': a_id})
    try:
        concept_ids = _res_array_for_col(results_proxy, 'dr_concept_id')
    finally:
        # Release the cursor even when reading the rows fails.
        results_proxy.close()

    if not concept_ids:
        return []
    return get_concept_name(concept_ids[0])

def get_concept_name(concept_id):
    """Return the names stored in ``dr_concepts`` for ``concept_id``.

    Despite the singular name, the result is a list because callers index
    into it (``result[0]``). The id is passed as a bound parameter, so a
    ``None`` id becomes SQL NULL rather than the text ``'None'``.

    Args:
        concept_id: Value compared with ``dr_concepts.id``.

    Returns:
        list: ``name`` of every matching row (empty when none match).
    """
    from sqlalchemy import text  # local import: only this helper needs it

    query = text("""
    SELECT name
    FROM dr_concepts.dr_concepts
    WHERE id = :concept_id
    """)
    results_proxy = c_engine.execute(query, {'concept_id': concept_id})
    try:
        return _res_array_for_col(results_proxy, 'name')
    finally:
        # Release the cursor even when reading the rows fails.
        results_proxy.close()

def find_the_concept(q_ids, a_ids):
    """Return the first concept found for any answer/question combination.

    Combinations are tried answer by answer, and for each answer question by
    question; the first non-empty ``get_concept`` result wins and its first
    element is returned. ``None`` is returned when no combination matches.
    """
    # Lazy generator: get_concept is only called until the first hit.
    lookups = (get_concept(qid, aid) for aid in a_ids for qid in q_ids)
    for found in lookups:
        if found:
            return found[0]
    return None

# def get_concept_name(concept_id):
    
    
# print(get_q_ids('does the pain radiate to any of the following?'))
# print(get_concept(766, 10125))

In [None]:
# Resolve every '<question>: <answer>' feature to a concept.
#   mapper       - feature string -> concept (or the bare question when the
#                  feature has no answer part)
#   concepts_arr - one entry per feature, '' when nothing could be resolved
mapper = {}
concepts_arr = []
for qastr in features_list:
    qa = qastr.split(': ')
    q = qa[0]
    a = qa[1] if len(qa) > 1 else None

    # Feature without an answer part: nothing to look up.
    if not a:
        print(f'ERROR - {q} has NO ANSWER')
        mapper[qastr] = q
        concepts_arr.append('')
        continue

    q_ids = get_q_ids(q)
    a_ids = get_a_ids(a)

    # The answer text is unknown to both answer tables.
    if not a_ids:
        print(f'ERROR: no answers with text: {a}')
        concepts_arr.append('')
        continue

    concept = find_the_concept(q_ids, a_ids)

    # Ids were found but no question/answer pair is mapped to a concept.
    if not concept:
        print(f"{'<*>'*8} concept not found {'<*>'*8}")
        print(f"q: {q} - a: {a}")
        display(q_ids)
        display(a_ids)
        print('<*>'*20)
        concepts_arr.append('')
        continue

    mapper[qastr] = concept
    concepts_arr.append(concept)


# display(mapper)
display(concepts_arr)

## Related Symptoms Negations

Get all questions / answers together with each question's symptom (clinical entity id and name) and the tagging task id.

In [5]:
# One distinct row per (task, symptom, question, answer, response).
# Every question/answer pair is joined with the question's clinical entity
# (LEFT JOIN, so pairs without an entity are kept) and with the tagging
# responses whose value matches the answer id.
# NOTE(review): a response value can hold several ';'-separated ids (they are
# split in a later cell) - confirm how the `tg_res.value = q_a.ansr_id`
# comparison treats such values.
query_str = """
SELECT DISTINCT tg_res.task_id, cli_ent.id as symptom_id, cli_ent.name as symptom_name, q_a.q_id as question, q_a.ansr_id as answer, tg_res.value as response FROM
    (SELECT ansr.id AS ansr_id, ansr.valid AS a_valid, qstn.valid AS q_valid, qstn.category_id AS q_cat, qstn.id as q_id, qstn.clinical_entity_id as cli_ent_id
    FROM questionnaire_tagging.questions qstn INNER JOIN questionnaire_tagging.answers ansr
    ON qstn.id = ansr.question_id) q_a
left join clinical_entities cli_ent on cli_ent.id = q_a.cli_ent_id
INNER JOIN questionnaire_tagging.tagging_responses tg_res
ON tg_res.value = q_a.ansr_id"""

# Runs against the local `engine`; the default integer index is kept.
df_taskid_vs_symptoms = pd.read_sql_query(query_str,engine) # , index_col='task_id')

  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result =

Now get mappings to the symptoms

In [6]:
display(df_taskid_vs_symptoms[df_taskid_vs_symptoms['response']=='5087;5088;5090'].head(3))

Unnamed: 0,task_id,symptom_id,symptom_name,question,answer,response
221,197.0,1451,general questions,1482,5087,5087;5088;5090
940,758.0,1451,general questions,1482,5087,5087;5088;5090
1446,465.0,1451,general questions,1482,5087,5087;5088;5090


In [7]:
import pandas as pd

df = pd.DataFrame( {"name" : ["John", "Eric"], 
               "days" : [[1, 3, 5, 7], [2,4]]})
display(df)
result = pd.DataFrame([(d, tup.name) for tup in df.itertuples() for d in tup.days])
display(result)


Unnamed: 0,name,days
0,John,"[1, 3, 5, 7]"
1,Eric,"[2, 4]"


Unnamed: 0,0,1
0,1,John
1,3,John
2,5,John
3,7,John
4,2,Eric
5,4,Eric


In [8]:
# Flatten the ';'-separated responses: every source row yields one output row
# per individual response id, with the other fields repeated.
# The columns are positional here (0..5); the next cell gives them names.
df_flat_tskids_vs_symps = pd.DataFrame([
    (tup.task_id, tup.symptom_id, tup.symptom_name, tup.question, tup.answer, resp)
    for tup in df_taskid_vs_symptoms.itertuples()
    for resp in tup.response.split(';')
])

In [9]:
df_flat_tskids_vs_symps.rename(mapper={0:'task_id', 1:'symptom_id', 2:'symptom_name', 3:'question', 4:'answer', 5:'response'}, inplace=True, axis=1)
display(df_flat_tskids_vs_symps.head(3))

Unnamed: 0,task_id,symptom_id,symptom_name,question,answer,response
0,,1485,limb injury,2726,6357,6357
1,,1485,limb injury,2723,6342,6342
2,,1485,limb injury,2724,6367,6367


In [10]:
display(df_flat_tskids_vs_symps[df_flat_tskids_vs_symps['response'] == '5088'])

Unnamed: 0,task_id,symptom_id,symptom_name,question,answer,response
103,103.0,1451,general questions,1482,5088,5088
219,177.0,1451,general questions,1482,5088,5088
252,197.0,1451,general questions,1482,5087,5088
441,337.0,1451,general questions,1482,5088,5088
455,346.0,1451,general questions,1482,5086,5088
...,...,...,...,...,...,...
132864,24552.0,1451,general questions,1482,5087,5088
132972,24564.0,1451,general questions,1482,5088,5088
133366,24608.0,1451,general questions,1482,5088,5088
133386,24609.0,1451,general questions,1482,5088,5088


In [11]:
df_flat_tskids_vs_symps.to_csv('/Users/ezequiellowi/Documents/TEMP/question_response_symptomid_task_id.csv')

Now we'll check which concepts are mapped to each (question id, answer id) pair.

In [12]:
# Unique (question, answer) pairs present in the flattened table.
q_a_tupples = set(zip(df_flat_tskids_vs_symps.question, df_flat_tskids_vs_symps.answer))

In [13]:
q_a_tups_str = ','.join([f'{tup}' for tup in q_a_tupples])
# print(q_a_tups_str)

SELECT drcqm.id, dr_concept_id, drc.uuid, question_id as question, answer_id as response, question_title, answer_title as response_text
  FROM dr_concepts_questionnaire_mapping drcqm left join dr_concepts drc on drcqm.dr_concept_id = drc.id
 WHERE (question_id, answer_id) IN (
                                 {q_a_tups_str}
                                 );


In [16]:
query_str = f"""
SELECT drcqm.id, dr_concept_id, drc.uuid, question_id as question, answer_id as response, question_title, answer_title as response_text
  FROM dr_concepts_questionnaire_mapping drcqm left join dr_concepts drc on drcqm.dr_concept_id = drc.id
 WHERE (question_id, answer_id) IN (
                                 {q_a_tups_str}
                                 );
"""

# query_str = f"""
# select * from dr_concepts_questionnaire_mapping
# where answer_id in ({unique_symps_str});
# """
# print(query_str)
df_qa_concepts = pd.read_sql_query(query_str,c_engine, index_col='id')
display(df_qa_concepts.head(5))

Unnamed: 0_level_0,dr_concept_id,uuid,question,response,question_title,response_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,70,288d55eafbf44c8e9bda129283fda590,760,2529,how has the headache changed since it first be...,the pain is constant
2,71,fd5d84ba66a2453899123300dcd5ad1a,760,2531,how has the headache changed since it first be...,the pain is getting worse
3,72,4c64ceb907e9421c9e28c7007aa4ce04,760,2532,how has the headache changed since it first be...,the pain is getting better
4,73,5f03a71801424198be52c93e552b1513,760,5141,how has the headache changed since it first be...,the pain comes and goes
5,74,fd47ea66c62743f98cb34763de7fda74,760,5143,how has the headache changed since it first be...,the pain has passed


In [None]:
df_qa_concepts.to_csv('/Users/ezequiellowi/Documents/TEMP/question__response__dr_concept_id.csv')

In [None]:
concept_ids = set(df_qa_concepts['dr_concept_id'].to_list())
# print(concept_ids)
concept_ids_str = ','.join([str(cid) for cid in concept_ids])
concept_ids_str

In [None]:
query_str = f"""
SELECT id, uuid, name
FROM dr_concepts
where id in ({concept_ids_str})
"""

dr_concepts_in_use = pd.read_sql_query(query_str,c_engine, index_col='id')
display(dr_concepts_in_use.tail(5))

In [None]:
dr_concepts_in_use.to_csv('/Users/ezequiellowi/Documents/TEMP/dr_concepts_id_to_uuid_with_names.csv')

Back to the concept records, restricted to the concepts with the relevant symptoms

In [None]:
# Build a quoted, comma-separated list of concept uuids for the IN (...) filter.
# NOTE(review): df_concepts_with_symptoms is not created by any cell visible
# in this notebook - confirm where it comes from before running this cell.
udids = df_concepts_with_symptoms['uuid'].to_list()
udids_str = ','.join([f"'{uuid}'" for uuid in udids])
# For every recorded concept value with one of those uuids, fetch the value,
# the record's external id (exposed as task_id) and the symptom id of the
# medical record linked through the tagging task (exposed as chief).
query_str = f"""
select recc.dr_concept_uuid, recc.value, rec.external_id as task_id, tmr.symptom_id as chief from record_concepts recc
left join records rec on recc.record_id = rec.id
left join tagging_tasks tts on rec.external_id = tts.id
left join tagging_medical_records tmr on tts.medical_record_id = tmr.id
where recc.dr_concept_uuid in ({udids_str});
"""
# print(query_str)
# Runs against the local `engine`; rows are indexed by concept uuid.
df_concepts_vs_values = pd.read_sql_query(query_str,engine, index_col='dr_concept_uuid')
display(df_concepts_vs_values.head(5))

In [None]:
df_concepts_vs_values.to_csv('/Users/ezequiellowi/Documents/TEMP/dr_concept_uuid_vs_value_vs_task_id.csv')

## Backpain Q/A mapping to concepts

### Handle Missing

In [None]:
sample = {"1482__-1": 14, "1482__10323": 9, "2319__-1": 15, "2319__10331": 1, "3238__-1": 2, "3238__1058": 48, "3238__1464": 118, "3238__205": 1, "3238__3191": 13, "3238__725": 239, "3238__828": 96, "3859__10138": 98, "3859__10139": 21, "3860__10140": 1, "3860__10141": 5, "3860__10142": 2, "3860__10143": 1, "3861__10144": 93, "3861__10145": 142, "3862__-1": 22, "3862__10146": 48, "3862__10147": 63, "3862__10148": 271, "3862__10149": 67, "3862__10150": 164, "3862__10151": 55, "3862__10152": 194, "3863__10153": 1, "3863__10154": 5, "3863__10158": 1, "3863__10159": 1, "3864__None": 40, "3865__None": 24, "3866__10160": 3, "3866__10161": 2, "3866__10162": 1, "3866__10163": 1, "3867__-1": 2, "3867__10165": 534, "3867__10166": 86, "3867__10167": 114, "3867__10168": 609, "3867__10169": 183, "3867__10170": 95, "3867__10171": 23, "3867__10172": 114, "3867__10224": 217, "3868__10173": 24, "3868__10174": 28, "3868__10175": 92, "3868__10176": 19, "3868__10177": 11, "3868__10178": 85, "3868__10235": 62, "3868__10236": 28, "3868__10311": 40, "3869__10179": 95, "3869__10180": 567, "3870__10181": 35, "3870__10182": 183, "3870__10183": 25, "3870__10324": 18, "3871__-1": 6, "3871__10184": 232, "3871__10185": 137, "3871__10186": 158, "3871__10187": 39, "3871__10188": 16, "3871__10189": 95, "3871__10190": 64, "3871__10191": 84, "3871__10192": 5, "3871__10193": 64, "3871__10230": 18, "3871__10298": 7, "3871__10310": 46, "3871__10321": 144, "3872__-1": 14, "3872__10194": 549, "3872__10195": 225, "3872__10196": 369, "3872__10197": 224, "3872__10198": 206, "3872__10199": 75, "3872__10200": 103, "3872__10201": 163, "3872__10202": 49, "3872__10203": 216, "3872__10275": 90, "3872__10276": 84, "3872__10290": 74, "3872__10291": 18, "3872__10293": 704, "3872__10294": 122, "3872__10296": 84, "3872__10304": 33, "3872__10320": 56, "3872__10325": 10, "3872__10326": 9, "3872__10328": 7, "3872__10330": 3, "3873__10204": 434, "3873__10205": 1345, "3873__10206": 558, "3873__10207": 88, "3873__10208": 
644, "3873__10209": 601, "3873__10268": 607, "3873__10303": 1, "3874__None": 236, "3887__None": 1471, "3888__10213": 687, "3888__10214": 310, "3888__10215": 125, "3888__10216": 8, "3888__10217": 14, "3888__10218": 8, "3888__10219": 50, "3889__-1": 5, "3889__10220": 115, "3889__10221": 490, "3889__10222": 270, "3889__10223": 75, "3889__10306": 124, "3890__-1": 1, "3890__10225": 171, "3890__10226": 262, "3890__10227": 142, "3890__10228": 50, "3890__10229": 116, "3891__10231": 206, "3891__10232": 108, "3891__10233": 41, "3891__10305": 111, "3892__None": 134, "3893__None": 91, "3907__-1": 1, "3907__10237": 1191, "3907__10238": 290, "3907__10239": 123, "3907__10240": 22, "3907__10241": 776, "3907__10295": 72, "3907__10297": 9, "3907__10322": 83, "3908__-1": 2, "3908__10242": 216, "3908__10243": 181, "3908__10244": 351, "3908__10245": 40, "3908__10246": 49, "3908__10247": 69, "3908__10248": 516, "3908__10249": 16, "3908__10250": 250, "3908__10277": 1002, "3908__10316": 129, "3908__10317": 67, "3908__10318": 72, "3909__-1": 3, "3909__10251": 293, "3909__10252": 730, "3909__10253": 649, "3909__10254": 313, "3909__10255": 100, "3909__10256": 13, "3909__10263": 141, "3909__10278": 43, "3909__10279": 10, "3909__10280": 11, "3909__10281": 6, "3909__10282": 18, "3909__10283": 2, "3909__10284": 2, "3909__10313": 12, "3909__10314": 25, "3909__10315": 41, "3909__10319": 145, "3910__-1": 11, "3910__10257": 407, "3910__10258": 176, "3910__10259": 443, "3910__10260": 251, "3910__10261": 540, "3910__10262": 434, "3911__None": 268, "3912__10264": 33, "3912__10265": 112, "3912__10266": 57, "3912__10267": 171, "3913__-1": 1, "3913__10269": 80, "3913__10270": 53, "3913__10273": 32, "3913__10274": 42}

In [None]:
# Same entries as `sample`, ordered by count with the highest first.
by_count_desc = sorted(sample.items(), key=lambda item: item[1], reverse=True)
sorted_missings = dict(by_count_desc)


In [None]:
# print(sorted_missings)
raw_missing_df = pd.DataFrame.from_dict(sorted_missings, orient='index') 
display(raw_missing_df)
raw_missing_df.to_csv('/Users/ezequiellowi/Documents/TEMP/missing_backk_pain.txt')

In [None]:
# Question id of every '<question>__<answer>' key, in sorted_missings order
# (duplicates included).
questions = []

# Write one 'question,answer,count' line per key. The file is only written,
# so plain 'w' mode is enough, and the `with` block closes it on exit.
# NOTE(review): the header column is spelled 'queston'; it is left unchanged
# in case something reads the file by that name - confirm before renaming.
with open('/Users/ezequiellowi/staam.txt', 'w') as f:
    f.write('queston,answer,number_of_times\n')
    for qacount, count in sorted_missings.items():
        qa = qacount.split('__')
        q = qa[0]
        a = qa[1]
        questions.append(q)
        f.write(f'{q},{a},{count}\n')


In [None]:
print(questions)

In [None]:
u_questions = list(set(questions)) 

questions_str = ','.join(u_questions)


query_str = f"""
SELECT question_id, title
FROM questionnaire_tagging.question_translation
WHERE question_id in ({questions_str}) AND language_tag = 'en';
"""

print(query_str)
questions_df = pd.read_sql(query_str, engine)

display(questions_df.head(3))
display(questions_df.tail(3))
# questions_df.to_csv('/Users/ezequiellowi/missing_questoins.csv')


In [None]:
query_str = f"""
SELECT a2.question_id, ast.answer_id, qts.title as question, ast.title as answer
FROM questionnaire_tagging.answer_translation ast
INNER JOIN questionnaire_tagging.answers a2 ON ast.answer_id = a2.id
INNER JOIN questionnaire_tagging.questions qs ON a2.question_id = qs.id
INNER JOIN questionnaire_tagging.question_translation qts ON qts.question_id = qs.id
WHERE a2.question_id in ({questions_str}) 
	AND ast.language_tag = 'en' 
	AND a2.valid = 1
	AND qts.language_tag = 'en';"""

answers_df = pd.read_sql(query_str, engine)

display(answers_df.head(3))
display(answers_df[answers_df['answer_id']==188])

In [None]:
# TODO: an `add` helper was started here but never written. The bare
# `def add` line is commented out because it is a syntax error that keeps
# the whole cell from running.
# def add
# Start the table of missing questions from the ids in questions_df.
missing_df = pd.DataFrame()
missing_df['question_id'] = questions_df['question_id']


# DB Experiments

In [None]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [None]:
engine = create_engine('mysql+pymysql://root@localhost:3306/questionnaire_tagging')

In [None]:
# Raw string literal: the text contains "\," which is not a valid escape
# sequence in a normal string literal. The resulting value (backslash
# followed by a comma) is the same as before.
heb_str = r'מרון\,  אני /11 -28/10\, חמישה ימים'
enc_str = heb_str.encode('utf-8')
# Take the repr of the bytes without its b'...' wrapper and turn every 'x'
# into 'u00', so that "\xd7" reads "\u00d7". Every 'x' character in the repr
# is replaced, not only the ones that belong to a "\x" escape.
str_enc_str = str(enc_str)[2:-1].replace('x','u00')
print(heb_str)
print(str_enc_str)

In [None]:
# Experiment: insert the escaped text as a synonym of symptom 198.
# The value is interpolated straight into the SQL text.
# NOTE(review): str_enc_str contains backslashes and could contain quotes;
# how MySQL interprets them inside the literal decides what is stored -
# confirm the stored value, and prefer a bound parameter outside of
# experiments.
query_str = f"""
INSERT INTO questionnaire_tagging.symptom_synonym
(symptom_id, code, synonym)
VALUES
(198, 'codecode', '{str_enc_str}');
"""

print(query_str)

# The connection is released when the block exits; res_proxy is not read.
with engine.connect() as conn:
    res_proxy = conn.execute(query_str)
