In [9]:
import os
import pandas as pd
import numpy as np
import json
import sys

sys.path.append("/home/wsl_legion/Cross-Care/")
from dicts.dict_medical import medical_keywords_dict
from dicts.dict_census_est import census_dict


#### Helper functions ####
# Code to Disease
def replace_disease_names(df, medical_keywords_dict):
    """
    Rename disease-code column headers to their primary disease name, in place.

    Every column header that is a key of ``medical_keywords_dict`` is renamed
    to the first entry of that key's name list. Headers that are not keys, or
    whose name list is empty, are left untouched.

    :param df: DataFrame with disease names/codes as column headers. It is
        modified in place.
    :param medical_keywords_dict: Dictionary mapping codes to lists of names.
    :return: The same DataFrame object, with updated column names.
    """
    # Build the whole mapping from the ORIGINAL headers and apply it in a single
    # rename. Renaming one column at a time lets a freshly assigned name that
    # happens to be a dictionary key get renamed again on a later iteration,
    # which produces duplicate headers.
    column_renames = {
        col: medical_keywords_dict[col][0]
        for col in df.columns
        if col in medical_keywords_dict and medical_keywords_dict[col]
    }
    if column_renames:
        df.rename(columns=column_renames, inplace=True)
    return df


# Write to JSON
def write_to_json(data, filename):
    """
    Serialize ``data`` as JSON to ``filename``, creating parent directories.

    :param data: Any object accepted by ``json.dump``.
    :param filename: Destination path. May be a bare filename, in which case
        the file is written to the current working directory.
    """
    parent_dir = os.path.dirname(filename)
    # dirname is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "w") as outfile:
        json.dump(data, outfile)


def process_temporal_counts(csv_path, medical_keywords_dict):
    """
    Aggregate per-timestamp disease counts by month, year and five-year bucket.

    The CSV must have a ``timestamp`` column; every other column is treated as
    count data and summed. Each aggregate is transposed so that there is one
    record per count column, keyed by the stringified period labels.

    :param csv_path: Path to a CSV with a ``timestamp`` column plus count columns.
    :param medical_keywords_dict: Dictionary mapping codes to lists of names;
        the first name replaces the code in each record's ``disease`` field.
    :return: Dict with keys ``"monthly"``, ``"yearly"`` and ``"five_yearly"``,
        each a list of records of the form
        ``{"disease": <name>, "<period label>": <sum>, ..., "freq": <tag>}``.
    """
    df = pd.read_csv(csv_path)
    df["timestamp"] = pd.to_datetime(df["timestamp"])

    # Every column except 'timestamp' is treated as count data. This includes
    # any unnamed index column saved into the CSV (read back as "Unnamed: 0").
    count_columns = df.columns.drop("timestamp")

    # Offset objects are used instead of the "M"/"Y" string aliases: they bin
    # identically (month end / year end) but do not depend on which alias
    # spelling the installed pandas version accepts.
    monthly_counts = df.groupby(
        pd.Grouper(key="timestamp", freq=pd.offsets.MonthEnd())
    )[count_columns].sum()
    yearly_counts = df.groupby(
        pd.Grouper(key="timestamp", freq=pd.offsets.YearEnd())
    )[count_columns].sum()

    # Five-year buckets are labelled by their first year (e.g. 2020 for 2020-2024).
    df["five_year_interval"] = (df["timestamp"].dt.year // 5) * 5
    five_yearly_counts = df.groupby("five_year_interval")[count_columns].sum()

    def primary_name(code):
        # First listed name for a known code; unknown codes pass through unchanged.
        if code in medical_keywords_dict and medical_keywords_dict[code]:
            return medical_keywords_dict[code][0]
        return code

    def pivot_and_process(df_counts, freq):
        # After the transpose the former column headers (disease codes) are row
        # labels, so reset_index exposes them as a regular column.
        df_counts = df_counts.T.reset_index()
        df_counts = df_counts.rename(columns={"index": "disease"})
        # The codes are now VALUES of the 'disease' column, not headers, so
        # they are mapped here rather than through a column rename.
        df_counts["disease"] = df_counts["disease"].map(primary_name)
        df_counts["freq"] = freq
        # Period labels (timestamps / ints) must be strings to be usable as JSON keys.
        df_counts.columns = df_counts.columns.astype(str)
        return df_counts.to_dict(orient="records")

    return {
        "monthly": pivot_and_process(monthly_counts, "M"),
        "yearly": pivot_and_process(yearly_counts, "Y"),
        "five_yearly": pivot_and_process(five_yearly_counts, "5Y"),
    }

In [10]:
# paths
# count_dir is the input directory interpolated into the (currently
# commented-out) total-counts call below; out_dir is where that call would write
# its JSON files.
# NOTE(review): a later cell reads "../output_arxiv/disease_date_counts.csv"
# directly instead of using count_dir, and the two relative paths differ.
count_dir = "output_arxiv"
out_dir = "cross-care-dash/app/data_debug/"

# window sizes
# NOTE(review): window_sizes and demo_cat are not referenced in the visible
# cells; presumably co-occurrence window sizes and demographic categories —
# TODO confirm.
window_sizes = [10, 50, 100, 250]
demo_cat = ["gender", "racial", "drug"]

#### TOTAL COUNTS ####

# # Process date co-occurrence counts
# total_dates_data = process_temporal_counts(
#     csv_path=f"{count_dir}/disease_date_counts.csv",
#     medical_keywords_dict=medical_keywords_dict,
# )
# write_to_json(total_dates_data["monthly"], f"{out_dir}/monthly_counts.json")
# write_to_json(total_dates_data["yearly"], f"{out_dir}/yearly_counts.json")
# write_to_json(
#     total_dates_data["five_yearly"],
#     f"{out_dir}/five_yearly_counts.json",
# )

In [11]:
df = pd.read_csv("../output_arxiv/disease_date_counts.csv")

df["timestamp"] = pd.to_datetime(df["timestamp"])

# Every column except 'timestamp' is treated as count data.
# NOTE(review): this includes the CSV's "Unnamed: 0" column, which shows up in
# the printed sums; presumably it is a saved row index rather than a count —
# confirm before using it downstream.
count_columns = df.columns.drop("timestamp")

# Month-end / year-end totals. Offset objects are used instead of the "M"/"Y"
# string aliases: they bin identically but do not depend on which alias
# spelling the installed pandas version accepts.
monthly_counts = df.groupby(
    pd.Grouper(key="timestamp", freq=pd.offsets.MonthEnd())
)[count_columns].sum()
yearly_counts = df.groupby(
    pd.Grouper(key="timestamp", freq=pd.offsets.YearEnd())
)[count_columns].sum()

# Five-year buckets are labelled by their first year (e.g. 2020 for 2020-2024).
df["five_year_interval"] = (df["timestamp"].dt.year // 5) * 5
five_yearly_counts = df.groupby("five_year_interval")[count_columns].sum()

# Convert index to column (rebinding instead of mutating in place).
monthly_counts = monthly_counts.reset_index()

print(monthly_counts)

     timestamp  Unnamed: 0  hiv/aids  covid-19  takotsubo cardiomyopathy  \
0   1991-11-30           0         0         0                         0   
1   1991-12-31           0         0         0                         0   
2   1992-01-31           1         0         0                         0   
3   1992-02-29           0         0         0                         0   
4   1992-03-31           5         0         0                         0   
..         ...         ...       ...       ...                       ...   
372 2022-11-30   113248009        29      2666                         0   
373 2022-12-31    95545961        19      2161                         0   
374 2023-01-31    93830919        15      2144                         0   
375 2023-02-28   111923410        22      1739                         0   
376 2023-03-31     6738525         1       267                         0   

     tuberculoses  endocarditis  syphilis  hypertension  sarcoidoses  ...  \
0         

In [13]:
# Rename the timestamp and unnamed leading CSV column, then swap disease codes
# in the headers for their primary names.
# NOTE(review): in the printed head "total_count" equals the row number, so
# "Unnamed: 0" is presumably a saved row index rather than a count — confirm.
df = replace_disease_names(
    df.rename(columns={"timestamp": "date", "Unnamed: 0": "total_count"}),
    medical_keywords_dict,
)
print(df.head())
# Convert to JSON
# json_output = df.to_json(orient="records")

   total_count                date  human immunodeficiency virus  \
0            0 1991-11-19 16:32:56                             0   
1            1 1992-01-15 08:42:00                             0   
2            2 1992-03-12 07:11:00                             0   
3            3 1992-03-18 16:11:28                             0   
4            4 1992-05-06 16:46:18                             0   

   2019 novel coronavirus  takotsubo cardiomyopathy  tuberculoses  \
0                       0                         0             0   
1                       0                         0             0   
2                       0                         0             0   
3                       0                         0             0   
4                       0                         0             0   

   endocarditis  syphilis  hypertension  sarcoid  ...  pancreatitis  \
0             0         0             0        0  ...             0   
1             0         0         

In [None]:
def process_temporal_counts(csv_path, medical_keywords_dict):
    """
    Aggregate per-timestamp disease counts by month, year and five-year bucket.

    The CSV must have a ``timestamp`` column; every other column is treated as
    count data and summed. Each aggregate keeps one record per period, with
    disease codes in the column headers replaced by their primary names.

    NOTE: an earlier cell defines a function with the same name that returns
    lists of records; once this cell runs, this definition shadows it and
    returns JSON *strings* instead. Passing these strings to ``json.dump``
    would encode them a second time.

    :param csv_path: Path to a CSV with a ``timestamp`` column plus count columns.
    :param medical_keywords_dict: Dictionary mapping codes to lists of names.
    :return: Dict with keys ``"monthly"``, ``"yearly"`` and ``"five_yearly"``.
        Each value is the string produced by
        ``DataFrame.to_json(orient="records")``. Monthly and yearly records
        carry the period under ``"date"``; five-yearly records carry it under
        ``"five_year_interval"``, because only ``timestamp`` is renamed.
    """
    df = pd.read_csv(csv_path)
    df["timestamp"] = pd.to_datetime(df["timestamp"])

    # Every column except 'timestamp' is treated as count data.
    # NOTE(review): this includes "Unnamed: 0", which is summed and later
    # renamed to "total_count". In the notebook's printed output that column
    # matches the row index, so the sum may not be a real count — TODO confirm.
    count_columns = df.columns.drop("timestamp")

    # Offset objects are used instead of the "M"/"Y" string aliases: they bin
    # identically (month end / year end) but do not depend on which alias
    # spelling the installed pandas version accepts.
    monthly_counts = df.groupby(
        pd.Grouper(key="timestamp", freq=pd.offsets.MonthEnd())
    )[count_columns].sum()
    yearly_counts = df.groupby(
        pd.Grouper(key="timestamp", freq=pd.offsets.YearEnd())
    )[count_columns].sum()

    # Five-year buckets are labelled by their first year (e.g. 2020 for 2020-2024).
    df["five_year_interval"] = (df["timestamp"].dt.year // 5) * 5
    five_yearly_counts = df.groupby("five_year_interval")[count_columns].sum()

    def pivot_and_process(df_counts, freq):
        # reset_index returns a new frame with the period labels as a regular
        # column, so the renames below never touch the aggregate itself.
        df_counts = df_counts.reset_index()
        df_counts = df_counts.rename(
            columns={"timestamp": "date", "Unnamed: 0": "total_count"}
        )
        df_counts = replace_disease_names(df_counts, medical_keywords_dict)
        df_counts["freq"] = freq
        return df_counts.to_json(orient="records")

    # Leftover debug prints were removed, including one that sliced the first
    # 10 characters of the monthly JSON string.
    return {
        "monthly": pivot_and_process(monthly_counts, "M"),
        "yearly": pivot_and_process(yearly_counts, "Y"),
        "five_yearly": pivot_and_process(five_yearly_counts, "5Y"),
    }