In [24]:
import pandas as pd
import numpy as np
import os
import json
import sys

# add the path to the sys.path
sys.path.append("../../")

from co_occurrence_generate.replace_disease_names import medical_keywords_dict

In [25]:
# Load the per-disease document counts (Pile corpus) produced by the
# co-occurrence pipeline. The file is opened in a `with` block so the handle
# is closed as soon as parsing finishes instead of leaking until GC.
INFINIGRAM_COUNTS_PATH = "../../co_occurrence_results/output_infinigram/document_counts/results_disease_document_counts_pile.json"

with open(INFINIGRAM_COUNTS_PATH) as counts_file:
    infinigram_total_counts = json.load(counts_file)

# -> dataframe, most frequent entries first
infinigram_df = pd.DataFrame(infinigram_total_counts).sort_values(
    by="count", ascending=False
)

# Check the first 5 rows
infinigram_df.head()

Unnamed: 0,disease,count
16,6142004.0,14545786
54,40733004.0,14173146
49,35489007.0,7834387
81,73211009.0,7120564
83,74732009.0,4105588


In [27]:
# replace disease names
def replace_infinigram_names(df, dictionary):
    """Replace numeric-looking disease codes in ``df["disease"]`` with names.

    A cell is replaced only when it is a string that looks like a number
    (digits with at most one ``"."``) AND is a key of ``dictionary``; the
    replacement is the first element of the dictionary entry. Every other
    cell is left as it is.

    Args:
        df: DataFrame with a ``"disease"`` column. The column is overwritten
            in place, so the caller's frame is modified.
        dictionary: Mapping from code string to an indexable collection whose
            first element is used as the replacement name.

    Returns:
        The same ``df`` object, with the ``"disease"`` column updated.
    """

    def _code_to_name(value):
        # Non-string cells (e.g. None/NaN for a missing disease) have no
        # ``.replace`` method; pass them through instead of raising.
        if not isinstance(value, str):
            return value
        # Strip at most one "." so values such as "6142004.0" count as numeric.
        looks_numeric = value.replace(".", "", 1).isdigit()
        if looks_numeric and value in dictionary:
            return dictionary[value][0]
        return value

    df["disease"] = df["disease"].apply(_code_to_name)
    return df


# Swap the numeric disease codes in infinigram_df for their readable names
infinigram_df = replace_infinigram_names(
    df=infinigram_df, dictionary=medical_keywords_dict
)

# Preview the first 5 rows to confirm the names were replaced
infinigram_df.head(n=5)

Unnamed: 0,disease,count
16,flu,14545786
54,infection,14173146
49,mood disorder of depressed type,7834387
81,diabetes,7120564
83,mental illness,4105588


In [None]:
# Add Quartiles
def assign_quartiles_based_on_unique_counts(df):
    """Label each row with the quartile (1-4) of its ``count`` value.

    Quartile boundaries are computed over the *distinct* count values rather
    than over all rows, so a count that occurs many times does not pull the
    cut points toward itself.

    Args:
        df: DataFrame with a ``"count"`` column. A ``"quartile"`` column is
            added (or overwritten) in place.

    Returns:
        The same ``df`` object, now carrying the ``"quartile"`` column.

    NOTE(review): four fixed labels are passed together with
    ``duplicates="drop"``; this looks like it needs at least two distinct
    counts to yield four bins - confirm before using on tiny inputs.
    """
    # Distinct count values in ascending order
    distinct_counts = np.sort(df["count"].unique())

    # Bin the distinct values into four equal-frequency groups
    quartile_labels = pd.qcut(
        distinct_counts, q=4, labels=[1, 2, 3, 4], duplicates="drop"
    )

    # count value -> quartile label lookup
    quartile_by_count = pd.Series(quartile_labels, index=distinct_counts).to_dict()

    # Broadcast the lookup back onto every row
    df["quartile"] = df["count"].map(quartile_by_count)
    return df


# Attach a quartile label to every row of infinigram_df
infinigram_df = assign_quartiles_based_on_unique_counts(infinigram_df)

# Sanity-check the assignment: print the range of counts covered by each quartile
for quartile in range(1, 5):
    filtered_df = infinigram_df.loc[infinigram_df["quartile"].eq(quartile)]
    quartile_counts = filtered_df["count"]
    print(f"Quartile {quartile}")
    # smallest and largest count that landed in this quartile
    print(f"Min: {quartile_counts.min()}")
    print(f"Max: {quartile_counts.max()}")