In [33]:
import pandas
from matplotlib import pyplot as plt
import numpy as np
import utils

# Selector for the disease configuration; the next cell branches on the values 1-4.
t = 4

In [34]:
# Per-disease settings keyed by the selector `t`. Each entry is
#   (short name used to build the data file names,
#    full disease name,
#    lowercase keywords handed to utils.approx_matches to recognise the disease).
_DISEASE_CONFIGS = {
    1: ('SystemicSclerosis', 'Systemic Sclerosis',
        ['sclerosis', 'scleroderma']),
    2: ('MCTD', 'Mixed Connective Tissue Disease',
        ['mctd', 'mixed connective tissue disease']),
    3: ('JDM', 'Juvenile Dermatomyositis',
        ['jdm', 'dermatomyositis']),
    4: ('SLE', 'Systemic Lupus Erythematosus',
        ['sle', 'lupus', 'systemic lupus erythematosus']),
}

# Fail loudly on an unknown selector: silently skipping every branch would leave
# the names below undefined, or stale from an earlier run of this cell.
if t not in _DISEASE_CONFIGS:
    raise ValueError(
        f'Unknown disease selector t={t!r}; expected one of {sorted(_DISEASE_CONFIGS)}'
    )

disease, disease_fullname, _keywords = _DISEASE_CONFIGS[t]
# Copy so that later in-place edits of keywords_list cannot corrupt the table.
keywords_list = list(_keywords)

# The same symptom phrasings and the same model are used for every disease.
symp_types = ['formal', 'informal']
model_names = ['chatgpt-4o']

In [35]:
# for each symptom, generate four counts (raw tallies, not fractions; each tally starts at 1):
#    number of times the correct diagnosis is in top 3 when symptom is present,
#    number of times the correct diagnosis is in top 3 when symptom is not present,
#    number of times the correct diagnosis is NOT in top 3 when symptom is present,
#    number of times the correct diagnosis is NOT in top 3 when symptom is NOT present

def generate_table_for_symptom(all_patients, s):
    """Tally a 2x2 contingency table relating symptom `s` to diagnostic success.

    For each patient, `utils.approx_matches(keywords_list, ...)` is applied to
    the result of `p.get_top_three_symptoms(3)` to decide whether the target
    diagnosis was found, and `s in p.symptom_list` decides whether the symptom
    is present.

    Args:
        all_patients: any iterable of patient objects exposing
            `get_top_three_symptoms(n)` and `symptom_list`. It is iterated
            exactly once, so a generator is acceptable.
        s: the symptom to look up in each patient's `symptom_list`.

    Returns:
        A tuple of counts `(s1, s2, s3, s4)`:
            s1 -- diagnosis matched,     symptom present
            s2 -- diagnosis matched,     symptom absent
            s3 -- diagnosis not matched, symptom present
            s4 -- diagnosis not matched, symptom absent
        These are tallies, not fractions, and each one starts at 1.
    """
    # NOTE(review): every cell starts at 1 rather than 0 -- presumably add-one
    # smoothing so that a phi coefficient computed from these cells never
    # divides by zero. Confirm before treating the values as raw counts.
    s1 = s2 = s3 = s4 = 1
    for p in all_patients:
        resp_list = p.get_top_three_symptoms(3)
        symptom_present = s in p.symptom_list
        diagnosis_found = utils.approx_matches(keywords_list, resp_list)
        if diagnosis_found and symptom_present:
            s1 += 1
        elif diagnosis_found:
            s2 += 1
        elif symptom_present:
            s3 += 1
        else:
            s4 += 1
    return (s1, s2, s3, s4)


In [36]:

def gather_all_symptoms(all_patients):
    """Return the distinct entries of every patient's `symptom_list`, sorted ascending."""
    unique_symptoms = set()
    for patient in all_patients:
        unique_symptoms.update(patient.symptom_list)
    return sorted(unique_symptoms)
    

In [37]:
import math 
import statsmodels.api as sm 
import scipy.stats as stats

# For every (symptom type, model) results file, build a boolean table with one
# column per symptom (is the symptom in the patient's symptom_list?) plus an
# 'outcome_top' column (did the diagnosis match keywords_list?).
# The phi-coefficient, logistic-regression and contingency-table analyses that
# consume this table are currently commented out and kept below for reference.
for s in symp_types:
    for m in model_names:
        fname = f'{disease}-{s}-{m}.txt'
        print(fname)
        all_patients = utils.process_file(fname)
        all_symptoms = gather_all_symptoms(all_patients)
        #print(all_symptoms)
        #for symps in all_symptoms:
        #    (f1, f2, f3, f4) = generate_table_for_symptom(all_patients, symps)
        #    print(f'\t {disease} in top-3 + {symps} present: {f1}')
        #    print(f'\t {disease} in top-3 + {symps} not present: {f2}')
        #    print(f'\t {disease} not in top-3 + {symps} present: {f3}')
        #    print(f'\t {disease} not in top-3 + {symps} not present: {f4}')
        #    pearson_phi  = ( f1 * f4  - f2 * f3) / math.sqrt((f1+f3)*(f2+f4)*(f1+f2)*(f3+f4))
        #    print(f'\t {disease} + {symps} correlation is {pearson_phi}')

        # Named `symptom_table` rather than `dict`, so the builtin `dict` is not
        # shadowed for every cell that runs afterwards.
        symptom_table = {
            symps: [symps in p.symptom_list for p in all_patients]
            for symps in all_symptoms
        }
        # NOTE(review): get_top_three_symptoms is called with 1 here (3 elsewhere);
        # presumably this restricts the match to the top-ranked answer -- confirm.
        symptom_table['outcome_top'] = [
            utils.approx_matches(keywords_list, p.get_top_three_symptoms(1))
            for p in all_patients
        ]
        df = pandas.DataFrame(symptom_table)
        X = df[all_symptoms]  # Include all your predictor variables
        y = df['outcome_top']
        # Fit the logistic regression model
        #model = sm.Logit(y, X)
        #result = model.fit(method='newton', maxiter=50)
        # print all contingency tables
        # for symps in all_symptoms:
        #    tab = pandas.crosstab(df[symps], df['outcome_top'])
        #    display(tab)
        #    v = tab.values
        #    print(tab.values)
        #    cond_prob1 =  (v[1][1])/(v[1][1] + v[1][0])
        #    print(f'P( {disease} is top | {symps} ) = {cond_prob1}')
        #    cond_prob2 = (v[1][1])/(v[0][1] + v[1][1])
        #    print(f'P( {symps} reported | {disease} is top ) = {cond_prob2}')
        #    print(f'Odds Ratio = {v[1][1] * v[0][0] / (v[0][1] * v[1][0])}')
        # Print the summary
        #print(result.summary())

SLE-formal-chatgpt-4o.txt
SLE-informal-chatgpt-4o.txt


In [38]:
import re
def is_symptom_mentioned_in_reasoning(symp, reasoning):
    """Heuristically decide whether a symptom is referred to in a reasoning text.

    The comparison is case-insensitive. A symptom counts as mentioned when
    either the whole symptom phrase occurs in `reasoning`, or at least one of
    its words (split on commas/whitespace) that is not a generic stop word
    occurs in `reasoning`.

    Matching is by substring, not by whole word: "joint" matches "joints", but
    a very short non-stop word can also match inside an unrelated word.

    Args:
        symp: the symptom name or phrase.
        reasoning: the free-text reasoning to search.

    Returns:
        True if the symptom is considered mentioned, False otherwise. An empty
        or whitespace-only symptom is never considered mentioned.
    """
    # Generic words shared by many symptom names; matching on one of these
    # alone would make almost every symptom look "mentioned".
    common_symptom_words = {
        'sign', 'to', 'can\'t', 'cannot', 'up', 'in', 'on', 'and', 'with',
        'activity', 'activities', 'rash', 'rashes', 'disease', 'phenomenon',
        'symptom', 'system', 'systems', 'change', 'changes', 'diseases',
        'symptoms', 'conditions', 'involvement', 'pain', 'discomfort',
        'condition',
    }
    needle = symp.lower()
    haystack = reasoning.lower()
    # '' is a substring of every string, so an empty symptom must be rejected
    # explicitly instead of being reported as mentioned.
    if not needle.strip():
        return False
    if needle in haystack:
        return True
    # re.split yields '' when the symptom starts or ends with a separator;
    # those empty tokens are skipped for the same reason as above.
    words = re.split(r'[,\s]+', needle)
    return any(w in haystack for w in words if w and w not in common_symptom_words)
    
def symptoms_mentioned_in_reasoning(pat, logfile, k=3):
    """Split a patient's symptoms by whether the matching diagnosis' reasoning mentions them.

    Responses are looked up in `pat.responses` under the keys 1..k, in that
    order; missing keys are skipped. The first response whose diagnosis
    satisfies `utils.approx_matches(keywords_list, [diag])` is used: each entry
    of `pat.symptom_list` is checked against the lowercased reasoning with
    `is_symptom_mentioned_in_reasoning`, and a debug line is written to
    `logfile`.

    Returns:
        `(mentioned, unmentioned)` -- two sorted lists of symptoms -- for the
        first matching response, or None when no response among 1..k matches.
    """
    answers = pat.responses
    patient_symptoms = pat.symptom_list
    for rank in range(1, k + 1):
        if rank not in answers:
            continue
        diag, _, reasoning = answers[rank]
        if not utils.approx_matches(keywords_list, [diag]):
            continue
        # First response naming the target disease: classify every symptom.
        lowered_reasoning = reasoning.lower()
        mentioned_symptoms = []
        for symptom in patient_symptoms:
            if is_symptom_mentioned_in_reasoning(symptom.lower(), lowered_reasoning):
                mentioned_symptoms.append(symptom)
        unmentioned_symptoms = [
            symptom for symptom in patient_symptoms if symptom not in mentioned_symptoms
        ]
        print(f"# debug {diag} --> {reasoning} --> {mentioned_symptoms}, Unmentioned {unmentioned_symptoms}", file=logfile)
        return sorted(mentioned_symptoms), sorted(unmentioned_symptoms)
    return None


In [39]:
# For every (symptom type, model) results file, count how often each symptom is
# mentioned / not mentioned in the reasoning of the matching diagnosis, write
# the per-patient debug lines to a log file and the per-symptom totals to a CSV.
for st in symp_types:
    for mdl in model_names:
        fname = f'{disease}-{st}-{mdl}.txt'
        print('----')
        print(fname)
        logfilename =f'results/{disease}-{st}-{mdl}-LOG-Reasoning.txt'
        # `with` guarantees both files are closed even if processing raises.
        with open(logfilename, 'w') as logfile:
            all_patients = utils.process_file(fname)
            all_symptoms = gather_all_symptoms(all_patients)
            symptom_combos = []  # distinct sorted lists of mentioned symptoms
            symptom_mention_frequencies = {s:0 for s in all_symptoms }
            symptom_unmention_frequencies = {s:0 for s in all_symptoms }
            # Not populated in this cell; kept because it is a module-level name.
            symptom_hallucination_frequencies = {s:0 for s in all_symptoms }

            for p in all_patients:
                m = symptoms_mentioned_in_reasoning(p,  logfile)
                if m is None:
                    # No response matched the target disease for this patient.
                    continue
                (mentioned, unmentioned) = m
                if mentioned not in symptom_combos:
                    symptom_combos.append(mentioned)
                for symptom in mentioned:
                    symptom_mention_frequencies[symptom] += 1
                for symptom in unmentioned:
                    symptom_unmention_frequencies[symptom] += 1
            #mentioned_symptom_combos = set(symptom_combos)
            #print(symptom_combos)
            csv_filename = f'results/association-of-symptoms/{disease}-{st}-{mdl}-symptom-mentions.csv'
            # NOTE(review): rows are joined with ', ' and nothing is quoted, so a
            # symptom name that itself contains a comma yields a malformed row.
            with open(csv_filename, 'w') as csv_file:
                print( 'Symptom, # mentioned, # unmentioned', file=csv_file)
                for symps in all_symptoms:
                    print(f'{symps}, {symptom_mention_frequencies[symps]}, {symptom_unmention_frequencies[symps]}', file=csv_file)
            

----
SLE-formal-chatgpt-4o.txt
----
SLE-informal-chatgpt-4o.txt
