In [2]:
import pandas as pd
import numpy as np
import os
import json
import sys

# add the path to the sys.path
sys.path.append("../../")

from co_occurrence_generate.replace_disease_names import medical_keywords_dict

In [3]:
# Load infinigram_total_counts from the co_occurrence results folder.
# A context manager is used so the file handle is closed as soon as the JSON
# has been parsed, instead of being left open until garbage collection.
with open(
    "../../co_occurrence_results/output_infinigram/document_counts/results_disease_document_counts_pile.json"
) as f:
    infinigram_total_counts = json.load(f)

# -> dataframe, ordered from the largest count to the smallest
infinigram_df = pd.DataFrame(infinigram_total_counts).sort_values(
    by="count", ascending=False
)

# Check the first 5 rows
infinigram_df.head()

Unnamed: 0,disease,count
16,6142004.0,14545786
54,40733004.0,14173146
49,35489007.0,7834387
81,73211009.0,7120564
83,74732009.0,4105588


In [4]:
# replace disease names
def replace_infinigram_names(df, dictionary):
    """Replace numeric-looking disease codes in ``df["disease"]`` with names.

    A value is replaced only when it is a string made of digits with at most
    one ``.`` removed (e.g. ``"6142004.0"``) AND it is a key of ``dictionary``;
    the replacement is element ``0`` of the mapped entry. Every other value is
    kept as is. Non-string entries (``None``, ``NaN``, numbers) are passed
    through untouched rather than raising ``AttributeError`` on ``.replace``.

    Args:
        df: DataFrame with a ``disease`` column. The column is overwritten
            in place.
        dictionary: Mapping from code string to an indexable whose first
            element is the name to substitute.

    Returns:
        The same ``df`` object that was passed in.
    """

    def _to_name(value):
        # Only strings support the digit check below; anything else
        # (missing values, raw numbers) is returned unchanged.
        if not isinstance(value, str):
            return value
        # Strip a single decimal point so "123.0" counts as numeric.
        looks_numeric = value.replace(".", "", 1).isdigit()
        if looks_numeric and value in dictionary:
            return dictionary[value][0]
        return value

    df["disease"] = df["disease"].apply(_to_name)
    return df


# Swap the numeric-looking entries of the disease column for the names held
# in medical_keywords_dict.
infinigram_df = replace_infinigram_names(
    df=infinigram_df, dictionary=medical_keywords_dict
)

# Preview the top rows now that the names have been replaced.
infinigram_df.head(n=5)

Unnamed: 0,disease,count
16,flu,14545786
54,infection,14173146
49,mood disorder of depressed type,7834387
81,diabetes,7120564
83,mental illness,4105588


In [5]:
# Add Quartiles
def assign_quartiles_based_on_unique_counts(df):
    """Label every row with a quartile (1-4) derived from distinct counts.

    The quartile boundaries are computed over the set of distinct values in
    ``df["count"]`` (each value considered once, however many rows share it),
    and every row then receives the label of its own count.

    Args:
        df: DataFrame with a ``count`` column. A ``quartile`` column is
            written onto it in place.

    Returns:
        The same ``df`` object, now carrying the ``quartile`` column.
    """
    # Distinct count values, in ascending order.
    distinct_counts = np.sort(df["count"].unique())

    # Cut the distinct values into four quantile bins labelled 1..4.
    bin_labels = pd.qcut(
        distinct_counts, q=4, labels=[1, 2, 3, 4], duplicates="drop"
    )

    # count value -> quartile label
    label_for_count = pd.Series(bin_labels, index=distinct_counts).to_dict()

    df["quartile"] = df["count"].map(label_for_count)
    return df


# Attach a quartile label to every row of infinigram_df.
infinigram_df = assign_quartiles_based_on_unique_counts(df=infinigram_df)

# Sanity-check the assignment: for each quartile, report the smallest and
# largest value of "count" among its rows.
for quartile in (1, 2, 3, 4):
    print(f"Quartile {quartile}")
    filtered_df = infinigram_df.loc[infinigram_df["quartile"].eq(quartile)]
    print(f"Min: {filtered_df['count'].min()}")
    print(f"Max: {filtered_df['count'].max()}")

Quartile 1
Min: 2
Max: 22853
Quartile 2
Min: 24655
Max: 184316
Quartile 3
Min: 190062
Max: 589921
Quartile 4
Min: 738759
Max: 14545786


In [6]:
# Build a {disease: quartile} lookup from the de-duplicated
# (disease, quartile) pairs of infinigram_df.
disease_quartile_dict = (
    infinigram_df.loc[:, ["disease", "quartile"]]
    .drop_duplicates()
    .set_index("disease")["quartile"]
    .to_dict()
)

# Persist the lookup as JSON next to the notebook.
with open("disease_quartile_dict.json", "w") as f:
    f.write(json.dumps(disease_quartile_dict))

## Create total demographic mention count quartiles


In [7]:
# Load the combined logits tables (one for race, one for gender) from parquet.
race_df = pd.read_parquet(
    path="../../logits_results/joined/combined_race_logits.parquet"
)
gender_df = pd.read_parquet(
    path="../../logits_results/joined/combined_gender_logits.parquet"
)

# Preview the race table.
race_df.head(n=5)

Unnamed: 0,disease,demographic,logit_value,model_name,model_size,template,logit_type,location_preprompt,language,mention_count,window,total_demo_count,normalized_by_demo_mentions,relative_census_representation,demographic_group,quartile
0,als,black,-6.386719,EleutherAI/pythia-70m-deduped,70,0,hf_tf,0,en,96,10,383,25.065274,98.930747,race,1
1,als,black,-5.613281,EleutherAI/pythia-70m-deduped,70,0,hf_tf,0,zh,96,10,383,25.065274,98.930747,race,1
2,als,black,-7.121094,EleutherAI/pythia-70m-deduped,70,0,hf_tf,0,es,96,10,383,25.065274,98.930747,race,1
3,als,black,-6.816406,EleutherAI/pythia-70m-deduped,70,0,hf_tf,0,fr,96,10,383,25.065274,98.930747,race,1
4,als,black,-6.867188,EleutherAI/pythia-70m-deduped,70,0,hf_tf,1,en,96,10,383,25.065274,98.930747,race,1


In [17]:
# Build a {disease: quartile} lookup from the de-duplicated
# (disease, quartile) pairs of race_df.
race_disease_quartile_dict = (
    race_df.loc[:, ["disease", "quartile"]]
    .drop_duplicates()
    .set_index("disease")["quartile"]
    .to_dict()
)


# Sanity-check the quartiles already stored in race_df: for each one, report
# the smallest and largest total_demo_count among its rows.
for quartile in (1, 2, 3, 4):
    print(f"Quartile {quartile}")
    filtered_df = race_df.loc[race_df["quartile"].eq(quartile)]
    print(f"Min: {filtered_df['total_demo_count'].min()}")
    print(f"Max: {filtered_df['total_demo_count'].max()}")

Quartile 1
Min: 2
Max: 713
Quartile 2
Min: 727
Max: 3683
Quartile 3
Min: 3895
Max: 16726
Quartile 4
Min: 17064
Max: 394513


In [18]:
# Build a {disease: quartile} lookup from the de-duplicated
# (disease, quartile) pairs of gender_df.
gender_disease_quartile_dict = (
    gender_df.loc[:, ["disease", "quartile"]]
    .drop_duplicates()
    .set_index("disease")["quartile"]
    .to_dict()
)

# Sanity-check the quartiles already stored in gender_df: for each one, report
# the smallest and largest total_demo_count among its rows.
for quartile in (1, 2, 3, 4):
    print(f"Quartile {quartile}")
    filtered_df = gender_df.loc[gender_df["quartile"].eq(quartile)]
    print(f"Min: {filtered_df['total_demo_count'].min()}")
    print(f"Max: {filtered_df['total_demo_count'].max()}")

Quartile 1
Min: 10
Max: 5191
Quartile 2
Min: 5296
Max: 22105
Quartile 3
Min: 22216
Max: 79685
Quartile 4
Min: 80900
Max: 1692049


In [None]:
# Test the quartile assignment stored in gender_disease_quartile_dict.
# The lookup is a plain {disease: quartile} dict, so it cannot be filtered
# like a DataFrame and carries no counts of its own. For each quartile we
# collect the diseases the dict places there and read the min/max
# total_demo_count of those diseases from gender_df.
# NOTE(review): this presumes each disease has a single quartile in
# gender_df, so the dict is a faithful summary of it - TODO confirm.
for quartile in range(1, 5):
    print(f"Quartile {quartile}")
    diseases_in_quartile = [
        disease
        for disease, assigned in gender_disease_quartile_dict.items()
        if assigned == quartile
    ]
    filtered_df = gender_df[gender_df["disease"].isin(diseases_in_quartile)]
    # print min, max value in total_demo_count
    print(f"Min: {filtered_df['total_demo_count'].min()}")
    print(f"Max: {filtered_df['total_demo_count'].max()}")

In [19]:
# Number of diseases in each lookup, as a (race, gender) pair.
tuple(
    len(lookup)
    for lookup in (race_disease_quartile_dict, gender_disease_quartile_dict)
)

(92, 92)

In [None]:
# Persist the race lookup as JSON next to the notebook.
with open("race_disease_quartile_dict.json", "w") as f:
    f.write(json.dumps(race_disease_quartile_dict))

# Persist the gender lookup the same way.
with open("gender_disease_quartile_dict.json", "w") as f:
    f.write(json.dumps(gender_disease_quartile_dict))