In [2]:
import pandas as pd
import numpy as np
import zipfile

# Input and output locations used by this script
demographic_path = "brown2019_demographics.csv"
hba1c_path = "brown2019_hba1c.csv"
cgm_zip_path = "Brown2019.zip"
output_path = "brown_demographic_with_avg_hba1c.csv"

# Read both source tables into memory
demo_df = pd.read_csv(demographic_path)
hba1c_df = pd.read_csv(hba1c_path)

# 1. Collapse the hba1c table to one row per PtID holding the mean reading
print("Calculating average HbA1c from hba1c file...")
avg_hba1c = (
    hba1c_df.groupby("PtID")["hba1c"]
    .mean()
    .rename("avg_hba1c")
    .reset_index()
)

# 2. Left join: every demographic row is kept; subjects with no hba1c rows
#    end up with NaN in avg_hba1c
print("\nMerging with demographic data...")
merged_df = demo_df.merge(avg_hba1c, on="PtID", how="left")


# 3. Add clinical groupings
def get_hba1c_group(value):
    """Return the HbA1c range label for a single value.

    Missing values (anything ``pd.isna`` reports as NA) map to "Unknown".
    Otherwise the value is converted with ``float`` and placed in the first
    range whose exclusive upper bound it falls below: "<7%", "7-8.5%", and
    finally "≥8.5%" for everything else.
    """
    if pd.isna(value):
        return "Unknown"

    level = float(value)
    # (exclusive upper bound, label) pairs, checked in ascending order
    for upper_bound, label in ((7, "<7%"), (8.5, "7-8.5%")):
        if level < upper_bound:
            return label
    return "≥8.5%"


# Label every subject's average HbA1c with its range, one element at a time
merged_df["hba1c_Group"] = merged_df["avg_hba1c"].map(get_hba1c_group)


def get_age_group(age):
    """Return the age-band label for a single age.

    Missing values (anything ``pd.isna`` reports as NA) map to "Unknown".
    Otherwise the age is converted with ``float`` and assigned to one of
    "<10", "10-19", "20-64" or "≥65"; each band's upper bound is exclusive.
    """
    if pd.isna(age):
        label = "Unknown"
    else:
        years = float(age)
        if years < 10:
            label = "<10"
        elif years < 20:
            label = "10-19"
        elif years < 65:
            label = "20-64"
        else:
            label = "≥65"
    return label


# Label every subject's enrollment age with its age band, element by element
merged_df["age_group"] = merged_df["AgeAtEnrollment"].map(get_age_group)


# 4. Check CGM Participation - Updated for 5xxx.0_x.npy format
def check_cgm_participation(zip_file):
    """Collect subject IDs from archive member names like ``5xxx.0_x.npy``.

    Args:
        zip_file: Any object exposing ``namelist()`` that returns member
            names as strings (e.g. an open ``zipfile.ZipFile``).

    Returns:
        A set of integer IDs, one per distinct subject found. Members that
        do not end in ".npy", or whose name contains "_time.npy", are
        skipped. Members whose ID part cannot be parsed are reported with a
        printed message and skipped.
    """
    cgm_ids = set()
    for file in zip_file.namelist():
        if not file.endswith(".npy") or "_time.npy" in file:
            continue

        # Drop any directory prefix, then keep the text before the first dot
        # (e.g. "sub/5001.0_4.npy" -> "5001").
        base_name = file.rsplit("/", 1)[-1]
        id_text = base_name.split(".", 1)[0]

        # Require plain ASCII digits before converting. int() on its own also
        # accepts underscores, signs and surrounding whitespace, so a name
        # such as "5003_2.npy" would silently become subject 50032.
        if id_text.isascii() and id_text.isdigit():
            cgm_ids.add(int(id_text))
        else:
            print(f"Couldn't parse ID from: {file}")
    return cgm_ids


print("\nChecking CGM participation...")
with zipfile.ZipFile(cgm_zip_path) as z:
    cgm_participants = check_cgm_participation(z)
    print(f"Found {len(cgm_participants)} unique subjects with CGM data")
    # Flag each demographic row whose PtID appears among the archive's IDs
    merged_df["has_cgm"] = merged_df["PtID"].isin(cgm_participants)

    # Print verification info: the first few member names, unfiltered
    print("\nFirst 5 CGM files found:")
    for file in z.namelist()[:5]:
        print(file)

# Identify missing subjects
missing_cgm = merged_df[~merged_df["has_cgm"]]["PtID"].unique()
# tolist() converts to plain Python values so the printed list stays
# "[5033, ...]" instead of "[np.int64(5033), ...]" under NumPy 2 scalar repr.
print(
    f"\nSubjects without CGM data ({len(missing_cgm)}): "
    f"{sorted(missing_cgm.tolist())}"
)

# Verify with sample; cap the size so tables with fewer than 5 rows do not
# raise "Cannot take a larger sample than population".
sample_check = merged_df[["PtID", "has_cgm"]].sample(min(5, len(merged_df)))
print("\nSample verification:")
print(sample_check)

# Summary report
print("\nFinal Summary:")
print("=" * 40)
print(f"Total subjects: {len(merged_df)}")
print(f"Subjects with HbA1c data: {merged_df['avg_hba1c'].notna().sum()}")
print(f"Subjects with CGM data: {merged_df['has_cgm'].sum()}")
print("\nHbA1c Group Distribution:")
print(merged_df["hba1c_Group"].value_counts(dropna=False))
print("\nAge Group Distribution:")
print(merged_df["age_group"].value_counts(dropna=False))

# Save results
merged_df.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")

Calculating average HbA1c from hba1c file...

Merging with demographic data...

Checking CGM participation...
Found 167 unique subjects with CGM data

First 5 CGM files found:
5050.0_13.npy
5140.0_5.npy
5045.0_5.npy
5164.0_1.npy
5095.0_4.npy

Subjects without CGM data (3): [5033, 5068, 5084]

Sample verification:
     PtID  has_cgm
6    5007     True
99   5101     True
143  5145     True
35   5037     True
118  5120     True

Final Summary:
Total subjects: 170
Subjects with HbA1c data: 170
Subjects with CGM data: 167

HbA1c Group Distribution:
hba1c_Group
7-8.5%    96
<7%       64
≥8.5%     10
Name: count, dtype: int64

Age Group Distribution:
age_group
20-64    116
10-19     51
≥65        3
Name: count, dtype: int64

Saved to: brown_demographic_with_avg_hba1c.csv
