In [19]:
import pickle

In [20]:
# Directory holding the pickled per-task bag-of-words and dictionary files.
data_dir = 'word_distribution/'

# Bag-of-words pickles, one per task (icd / phenotyping / readmission) and variant (org / gen).
icd_org_bow_path = 'icd_org_bow_dict.pkl'
icd_gen_bow_path = 'icd_gen_bow_dict.pkl'
phenotyping_org_bow_path = 'phenotyping_org_bow_dict.pkl'
phenotyping_gen_bow_path = 'phenotyping_gen_bow_dict.pkl'
readmission_org_bow_path = 'readmission_org_bow_dict.pkl'
readmission_gen_bow_path = 'readmission_gen_bow_dict.pkl'

# Matching dictionary pickles (indexed by class, then by word id, by the functions below).
icd_org_dict_path = 'icd_org_dict_dict.pkl'
icd_gen_dict_path = 'icd_gen_dict_dict.pkl'
phenotyping_org_dict_path = 'phenotyping_org_dict_dict.pkl'
phenotyping_gen_dict_path = 'phenotyping_gen_dict_dict.pkl'
readmission_org_dict_path = 'readmission_org_dict_dict.pkl'
readmission_gen_dict_path = 'readmission_gen_dict_dict.pkl'


def _load_pickle(file_name):
    """Unpickle ``data_dir + file_name`` and close the file handle afterwards.

    The previous ``pickle.load(open(...))`` form left every handle open until
    garbage collection; the ``with`` block releases it deterministically.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
    with open(data_dir + file_name, 'rb') as handle:
        return pickle.load(handle)


icd_org_bow = _load_pickle(icd_org_bow_path)
icd_gen_bow = _load_pickle(icd_gen_bow_path)
phenotyping_org_bow = _load_pickle(phenotyping_org_bow_path)
phenotyping_gen_bow = _load_pickle(phenotyping_gen_bow_path)
readmission_org_bow = _load_pickle(readmission_org_bow_path)
readmission_gen_bow = _load_pickle(readmission_gen_bow_path)

icd_org_dict = _load_pickle(icd_org_dict_path)
icd_gen_dict = _load_pickle(icd_gen_dict_path)
phenotyping_org_dict = _load_pickle(phenotyping_org_dict_path)
phenotyping_gen_dict = _load_pickle(phenotyping_gen_dict_path)
readmission_org_dict = _load_pickle(readmission_org_dict_path)
readmission_gen_dict = _load_pickle(readmission_gen_dict_path)

In [21]:
def get_top_k_words(bow_dict, dict_dict, k):
    """Return, for every class, its ``k`` most frequent words.

    Parameters:
        bow_dict (dict): class -> iterable of (word_id, frequency) pairs.
        dict_dict (dict): class -> mapping from word_id to word; must contain
            every class of ``bow_dict``.
        k (int): upper slice bound applied to each class's ranked list.

    Returns:
        dict: class -> list of (word, frequency) tuples ordered by descending
        frequency; entries with equal frequency keep their bag-of-words order.
    """
    def by_frequency(pair):
        return pair[1]

    ranked_by_class = {}
    for label in bow_dict:
        id_to_word = dict_dict[label]

        # Translate word ids into words while keeping the counts attached
        decoded = [(id_to_word[word_id], count) for word_id, count in bow_dict[label]]

        # Stable in-place sort, highest count first
        decoded.sort(key=by_frequency, reverse=True)

        ranked_by_class[label] = decoded[:k]

    return ranked_by_class


def filter_common_words(bow_dict, dict_dict):
    """Remove from each class's BoW the words present in every class vocabulary.

    Parameters:
        bow_dict (dict): class -> iterable of (word_id, frequency) pairs.
        dict_dict (dict): class -> mapping from word_id to word; must contain
            every class of ``bow_dict``.

    Returns:
        tuple: ``(filtered_bow_dict, common_words)`` where ``filtered_bow_dict``
        maps class -> list of (word_id, frequency) pairs whose word is not
        shared by all classes, and ``common_words`` is the set of shared words.

    Raises:
        KeyError: if a class of ``bow_dict`` (or one of its word ids) is
            missing from ``dict_dict``.

    Note:
        With a single class every word is trivially "common", so that class's
        filtered BoW comes back empty.
    """
    # Step 1: one vocabulary (set of words) per class
    vocabularies = [set(dictionary.values()) for dictionary in dict_dict.values()]

    # Step 2: words shared by all classes. set.intersection() with no operands
    # raises TypeError, so an empty dict_dict is treated as "nothing is shared".
    common_words = set.intersection(*vocabularies) if vocabularies else set()

    # Step 3: keep only the BoW entries whose word is not shared
    filtered_bow_dict = {}
    for cls, bow in bow_dict.items():
        dictionary = dict_dict[cls]
        filtered_bow_dict[cls] = [
            (word_id, freq)
            for word_id, freq in bow
            if dictionary[word_id] not in common_words
        ]

    return filtered_bow_dict, common_words


import numpy as np

def calculate_entropy(top_k_words):
    """Compute the entropy (natural log) of each class's word-frequency list.

    Parameters:
        top_k_words (dict): class -> list of (word, frequency) pairs.

    Returns:
        dict: class -> entropy ``H = -sum(p * log(p))`` as a NumPy scalar,
        where ``p`` are the frequencies normalised to sum to 1.
    """
    def _entropy(word_freq_list):
        # Only the counts matter; the words themselves are ignored
        counts = np.array([count for _, count in word_freq_list], dtype=np.float32)

        # Turn counts into a probability distribution
        probs = counts / np.sum(counts)

        # The 1e-10 offset keeps log() finite if a probability is exactly zero
        return -np.sum(probs * np.log(probs + 1e-10))

    return {cls: _entropy(pairs) for cls, pairs in top_k_words.items()}


In [22]:
def calculate_entropy_difference(entropy_dict_1, entropy_dict_2):
    """Return the per-class difference ``entropy_dict_2 - entropy_dict_1``.

    Parameters:
        entropy_dict_1 (dict): class -> entropy (the subtrahend).
        entropy_dict_2 (dict): class -> entropy (the minuend).

    Returns:
        dict: class -> difference, with keys in sorted order. A class present
        in only one input is treated as having entropy 0 in the other.
    """
    # Sorted union of the classes seen in either input
    all_classes = sorted(set(entropy_dict_1) | set(entropy_dict_2))

    return {
        cls: entropy_dict_2.get(cls, 0) - entropy_dict_1.get(cls, 0)
        for cls in all_classes
    }

In [23]:
import matplotlib.pyplot as plt

def plot_three_entropy_dictionaries(entropy_dict_1, entropy_dict_2, entropy_dict_3):
    """
    Draw a grouped bar chart comparing three per-class entropy dictionaries.

    The x axis lists the sorted union of the classes found in the three
    dictionaries; a class missing from a dictionary is drawn with height 0.

    Parameters:
        entropy_dict_1 (dict): class -> entropy, drawn left of each tick.
        entropy_dict_2 (dict): class -> entropy, drawn centred on each tick.
        entropy_dict_3 (dict): class -> entropy, drawn right of each tick.
    """
    # Sorted union of every class appearing in any of the three inputs
    classes = sorted(set().union(entropy_dict_1, entropy_dict_2, entropy_dict_3))

    bar_width = 0.25
    x = range(len(classes))
    left_positions = [i - bar_width for i in x]
    right_positions = [i + bar_width for i in x]

    plt.figure(figsize=(12, 6))

    # One bar series per dictionary: (x positions, data, legend label, colour)
    series = (
        (left_positions, entropy_dict_1, 'Entropy Dict 1', 'skyblue'),
        (x, entropy_dict_2, 'Entropy Dict 2', 'salmon'),
        (right_positions, entropy_dict_3, 'Entropy Dict 3', 'lightgreen'),
    )
    for positions, entropy_dict, label, color in series:
        heights = [entropy_dict.get(cls, 0) for cls in classes]
        plt.bar(positions, heights, width=bar_width, label=label, color=color)

    # Axis labels and title
    plt.xlabel('Classes')
    plt.ylabel('Entropy Value')
    plt.title('Comparison of Entropy Values Across Three Dictionaries')

    # Class names on the ticks, dashed horizontal grid
    plt.xticks(x, classes)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.legend()

    plt.tight_layout()
    plt.show()

# Example usage
# Assuming you have three entropy dictionaries
# entropy_dict_1 = {0: 1.5, 1: 1.2, 2: 0.8}
# entropy_dict_2 = {0: 2.0, 1: 0.5, 2: 1.0}
# entropy_dict_3 = {0: 1.8, 1: 0.9, 2: 1.3}

# plot_three_entropy_dictionaries(entropy_dict_1, entropy_dict_2, entropy_dict_3)


In [24]:
def _class_entropy_pipeline(bow_dict, dict_dict, k=1000):
    """Filter words shared by all classes, rank the rest, and score their entropy.

    Returns ``(filtered_bow, shared_words, top_k_words, entropy)`` as produced by
    filter_common_words, get_top_k_words and calculate_entropy respectively.
    """
    filtered_bow, shared_words = filter_common_words(bow_dict, dict_dict)
    top_k = get_top_k_words(filtered_bow, dict_dict, k)
    return filtered_bow, shared_words, top_k, calculate_entropy(top_k)


# `common_words` is overwritten by every call; after this cell it holds the
# shared words of the last dataset processed (readmission gen).
# NOTE: despite their names, `icd_org_dict_filtered` / `icd_gen_dict_filtered`
# hold filtered bags-of-words, exactly like the `*_bow_filtered` variables.
icd_org_dict_filtered, common_words, icd_org_top_k_words, icd_org_entropy = \
    _class_entropy_pipeline(icd_org_bow, icd_org_dict)
icd_gen_dict_filtered, common_words, icd_gen_top_k_words, icd_gen_entropy = \
    _class_entropy_pipeline(icd_gen_bow, icd_gen_dict)

phenotyping_gen_bow_filtered, common_words, phenotyping_gen_top_k_words, phenotyping_gen_entropy = \
    _class_entropy_pipeline(phenotyping_gen_bow, phenotyping_gen_dict)
phenotyping_org_bow_filtered, common_words, phenotyping_org_top_k_words, phenotyping_org_entropy = \
    _class_entropy_pipeline(phenotyping_org_bow, phenotyping_org_dict)

readmission_org_bow_filtered, common_words, readmission_org_top_k_words, readmission_org_entropy = \
    _class_entropy_pipeline(readmission_org_bow, readmission_org_dict)
readmission_gen_bow_filtered, common_words, readmission_gen_top_k_words, readmission_gen_entropy = \
    _class_entropy_pipeline(readmission_gen_bow, readmission_gen_dict)


In [25]:
# Look up the top-k (word, frequency) list of phenotyping class 0.
# NOTE(review): this is not the last expression of the cell, so with default
# notebook display settings nothing is shown -- confirm whether it is still needed.
phenotyping_gen_top_k_words[0]

# make wordcloud from this list
from wordcloud import WordCloud

def plot_wordcloud(word_freq_list, title):
    """Render a word cloud from (word, frequency) pairs and display it.

    Parameters:
        word_freq_list: iterable of (word, frequency) pairs; converted to a
            dict, so a repeated word keeps its last frequency.
        title (str): title drawn above the image.
    """
    frequencies = dict(word_freq_list)

    # 800x400 canvas on a white background, capped at 200 words
    cloud = WordCloud(width=800, height=400, max_words=200, background_color='white')
    cloud.generate_from_frequencies(frequencies)

    # Show the rendered cloud without axes
    plt.figure(figsize=(10, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)

    plt.show()

#plot_wordcloud(phenotyping_gen_top_k_words[0], 'Phenotyping General Top 1000 Words')
#plot_wordcloud(phenotyping_gen_top_k_words[1], 'Phenotyping General Top 1000 Words')

In [26]:
import pickle

# Location of the pickled normalization dictionary, relative to the working directory.
# NOTE(review): presumably maps raw entity mentions to normalized forms -- confirm with the producer of this file.
dict_path = '../Anonymization/negated_test_final_normalized_dict.pkl'
# NOTE(review): unpickling can execute arbitrary code -- only load this file if its origin is trusted.
with open(dict_path, 'rb') as f:
    normalized_dict = pickle.load(f)

# count the number of words in org_dict for each class
# where the word is in normalized_dict.keys()
# and normalized_dict[word] is in gen_dict
# or missing from gen_dict

def get_entity_difference(org_dict, gen_dict, normalized_dict):
    """Return, per class, the ratio of ``gen`` to ``org`` vocabulary size.

    For every class of ``org_dict`` the distinct words of the ``org`` and
    ``gen`` dictionaries are collected and ``len(gen_words) / len(org_words)``
    is stored.

    NOTE(review): ``normalized_dict`` does not influence the result. The
    previous body built per-class lists of the words found in
    ``normalized_dict`` but never used them, and a disabled draft compared
    org-only words against their normalized forms. That dead work has been
    removed; if an entity-level measure is what is wanted, it still has to be
    implemented -- confirm the intent.

    Parameters:
        org_dict (dict): class -> mapping whose values are the ``org`` words.
        gen_dict (dict): class -> mapping whose values are the ``gen`` words;
            must contain every class of ``org_dict``.
        normalized_dict: kept for interface compatibility; currently unused.

    Returns:
        dict: class -> float ratio of distinct ``gen`` words to distinct
        ``org`` words.

    Raises:
        KeyError: if a class of ``org_dict`` is missing from ``gen_dict``.
        ZeroDivisionError: if a class of ``org_dict`` has no words.
    """
    entity_diff = {}

    for cls, dictionary in org_dict.items():
        org_set = set(dictionary.values())
        gen_set = set(gen_dict[cls].values())
        entity_diff[cls] = len(gen_set) / len(org_set)

    return entity_diff



In [27]:
# Preview the first 100 keys of the normalization dictionary
print([*normalized_dict.keys()][:100])

['"10 screws', '"4', '"93% blockage"', '"abdominal" cancer', '"abnormal looking', '"adverse rxn".', '"agonal', '"agonal breathing"', '"agonal breathing."', '"alcoholic coma"', '"allergy pills",', '"almost black"', '"altered"', '"anti-inflammatory effect".', '"antibiotics,"', '"arteriosclerosis"', '"arthritis,"', '"asthma', '"asthma" symptoms', '"babbling"', '"back problems"', '"back to back"', '"bad" taste in', '"balance disorder/falls"),', '"barrett\'s"', '"basal ganglia" hemorrhage on the left.', '"benign bladder lesions"', '"bigeminy"', '"black', '"black stools"', '"blacking out"', '"bladder medication"', '"bladder sling"', '"bloating."', '"blocked neck artery"', '"blood clots."', '"blood" cancer', '"blood" labs', '"blood",', '"blooming"', '"blooming" susceptibility', '"blue toes".', '"bone cancer"', '"bone on bone"', '"bone spurs"', '"bone"', '"bowel problem"', '"brain tumor",', '"breathing', '"breathing problems"', '"breathing treatments,"', '"bright\'s disease"', '"broken', '"bro

In [28]:
# Per-class gen/org vocabulary ratio for each dataset (same call, three dictionary pairs)
icd_entity_diff, phenotyping_entity_diff, readmission_entity_diff = (
    get_entity_difference(org_vocab, gen_vocab, normalized_dict)
    for org_vocab, gen_vocab in (
        (icd_org_dict, icd_gen_dict),
        (phenotyping_org_dict, phenotyping_gen_dict),
        (readmission_org_dict, readmission_gen_dict),
    )
)

In [38]:
# Print the per-class ratios: readmission first, then ICD, then phenotyping
for per_class_ratio in (readmission_entity_diff, icd_entity_diff, phenotyping_entity_diff):
    print(per_class_ratio)

{0: 0.6297347316471315, 1: 0.6204670813284826}
{0: 0.6136229022704837, 1: 0.5755946225439503, 2: 0.52963841138115, 3: 0.5385674931129476, 4: 0.536134903640257, 5: 0.5572755417956656, 6: 0.5452205882352941, 7: 0.5136887608069164, 8: 0.5313161875945537, 9: 0.5899705014749262, 10: 0.5775460394941203, 11: 0.5904696132596685, 12: 0.6107360977177756, 13: 0.5402697495183044, 14: 0.5535045478865703, 15: 0.5405059237912264, 16: 0.5023458445040214, 17: 0.5791814946619217, 18: 0.582235153174739, 19: 0.573780305297815, 20: 0.6308695652173913, 21: 0.5056448853917208, 22: 0.5627212948912493, 23: 0.5312393308296347, 24: 0.5224568138195778, 25: 0.5416403785488959, 26: 0.5784916837548416, 27: 0.5907713819622465, 28: 0.5120288692862871, 29: 0.6734955185659411, 30: 0.5542384254081861, 31: 0.6005967604433078, 32: 0.5546659304251794, 33: 0.6067608044501498, 34: 0.6057233704292527, 35: 0.5759233926128591, 36: 0.6023160363529757, 37: 0.47693452380952384, 38: 0.5475940323597395, 39: 0.6641666666666667, 40: 0.

In [41]:
# Summarise, per dataset, the per-class values returned by get_entity_difference
def _summary_stats(per_class_values):
    """Mean, standard deviation, max and min over one dataset's per-class values."""
    values = list(per_class_values.values())
    return {
        "avg": np.mean(values),
        "std": np.std(values),
        "max": np.max(values),
        "min": np.min(values),
    }


icd_ent_diff = _summary_stats(icd_entity_diff)
phenotyping_ent_diff = _summary_stats(phenotyping_entity_diff)
readmission_ent_diff = _summary_stats(readmission_entity_diff)

# Expose every statistic under its own name as well (later cells read the averages)
_STAT_KEYS = ("avg", "std", "max", "min")
icd_entity_diff_avg, icd_entity_diff_std, icd_entity_diff_max, icd_entity_diff_min = (
    icd_ent_diff[key] for key in _STAT_KEYS
)
phenotyping_entity_diff_avg, phenotyping_entity_diff_std, phenotyping_entity_diff_max, phenotyping_entity_diff_min = (
    phenotyping_ent_diff[key] for key in _STAT_KEYS
)
readmission_entity_diff_avg, readmission_entity_diff_std, readmission_entity_diff_max, readmission_entity_diff_min = (
    readmission_ent_diff[key] for key in _STAT_KEYS
)

# One single-row frame per dataset, indexed by the dataset's display name
import pandas as pd

icd_ent_diff_df = pd.DataFrame(icd_ent_diff, index=["ICD"])
phenotyping_ent_diff_df = pd.DataFrame(phenotyping_ent_diff, index=["Phenotyping"])
readmission_ent_diff_df = pd.DataFrame(readmission_ent_diff, index=["Readmission"])

# Stack the rows and round to 3 decimal places
ent_diff_df = pd.concat([icd_ent_diff_df, phenotyping_ent_diff_df, readmission_ent_diff_df]).round(3)

# Persist the table and echo it
ent_diff_df.to_csv("entity_diff.csv")

print(ent_diff_df)

               avg    std    max    min
ICD          0.561  0.040  0.673  0.477
Phenotyping  0.442  0.023  0.472  0.409
Readmission  0.625  0.005  0.630  0.620


In [31]:
# Per-class entropy change for each dataset (gen entropy minus org entropy)
icd_entropy_diff = calculate_entropy_difference(icd_org_entropy, icd_gen_entropy)
phenotyping_entropy_diff = calculate_entropy_difference(phenotyping_org_entropy, phenotyping_gen_entropy)
readmission_entropy_diff = calculate_entropy_difference(readmission_org_entropy, readmission_gen_entropy)


print(phenotyping_entropy_diff)
print(phenotyping_entity_diff)


def _diff_summary(entropy_diff):
    """Return (class count, mean difference, positive-class count, positive-class percentage)."""
    n_classes = len(entropy_diff)
    mean_diff = np.mean(list(entropy_diff.values()))
    n_positive = sum(1 for value in entropy_diff.values() if value > 0)
    return n_classes, mean_diff, n_positive, n_positive / n_classes * 100


icd_cls_num, icd_ave_diff, icd_pos_num, icd_pos_per = _diff_summary(icd_entropy_diff)
phenotyping_cls_num, phenotyping_ave_diff, phenotyping_pos_num, phenotyping_pos_per = _diff_summary(phenotyping_entropy_diff)
readmission_cls_num, readmission_ave_diff, readmission_pos_num, readmission_pos_per = _diff_summary(readmission_entropy_diff)

# One row per dataset, one column per statistic
import pandas as pd

stats_dict = {
    'Dataset': ['readmission', 'icd', 'phenotyping'],
    'Number of Classes': [readmission_cls_num, icd_cls_num, phenotyping_cls_num],
    'Average Difference': [readmission_ave_diff, icd_ave_diff, phenotyping_ave_diff],
    'Positive Class Number': [
        f'{pos_num} ({pos_per:.2f})'
        for pos_num, pos_per in (
            (readmission_pos_num, readmission_pos_per),
            (icd_pos_num, icd_pos_per),
            (phenotyping_pos_num, phenotyping_pos_per),
        )
    ],
    'Entity Difference': [readmission_entity_diff_avg, icd_entity_diff_avg, phenotyping_entity_diff_avg],
}
# Numeric columns rounded to 3 decimal places
stats_df = pd.DataFrame(stats_dict).round(3)

{0: -1.062892, 1: -1.1040044, 2: -0.9743438, 3: -0.18843985, 4: -0.6823597, 5: -0.06401539, 6: -0.69732237, 7: -0.08685303, 8: -0.25134325, 9: -0.26688766}
{0: 0.40926225094238017, 1: 0.4138817480719794, 2: 0.42353723404255317, 3: 0.4183504471679364, 4: 0.46494992846924177, 5: 0.4719600222098834, 6: 0.4707585408222351, 7: 0.4591136079900125, 8: 0.44716913090515686, 9: 0.44113263785394935}


In [32]:
# Display the first three rows of the summary table (it is built with one row per dataset).
stats_df.head(3)

Unnamed: 0,Dataset,Number of Classes,Average Difference,Positive Class Number,Entity Difference
0,readmission,2,0.446,2 (100.00),0.625
1,icd,50,0.02,36 (72.00),0.561
2,phenotyping,10,-0.538,0 (0.00),0.442


In [33]:
# Write the summary table to dataset_stats.csv, omitting the index column
stats_df.to_csv('dataset_stats.csv', index=False)


In [34]:
# Phenotyping class with the smallest entropy difference, and that difference
min_key, min_value = min(phenotyping_entropy_diff.items(), key=lambda item: item[1])
print(min_key, min_value)

1 -1.1040044


In [35]:
# Display the per-class phenotyping entropy differences (gen entropy minus org entropy).
phenotyping_entropy_diff

{0: -1.062892,
 1: -1.1040044,
 2: -0.9743438,
 3: -0.18843985,
 4: -0.6823597,
 5: -0.06401539,
 6: -0.69732237,
 7: -0.08685303,
 8: -0.25134325,
 9: -0.26688766}

In [36]:
"""
['advanced_cancer',
 'obesity',
 'advanced_lung_disease',
 'chronic_pain_fibromyalgia',
 'alcohol_abuse',
 'depression',
 'other_substance_abuse',
 'chronic_neurological_dystrophies',
 'schizophrenia_and_other_psychiatric_disorders',
 'advanced_heart_disease']
"""

"\n['advanced_cancer',\n 'obesity',\n 'advanced_lung_disease',\n 'chronic_pain_fibromyalgia',\n 'alcohol_abuse',\n 'depression',\n 'other_substance_abuse',\n 'chronic_neurological_dystrophies',\n 'schizophrenia_and_other_psychiatric_disorders',\n 'advanced_heart_disease']\n"