In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#demographic, grouping and append hba1c
import pandas as pd
import zipfile
from google.colab import drive

#drive.mount('/content/drive')

# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
demographic_path = f'{base_path}/wadwa2023_demographic.csv'
hba1c_path = f'{base_path}/Wadwa_hba1c.csv'
trial_zip_path = f'{base_path}/wadwa2023.zip'
output_path = f'{base_path}/Wadwa2023_demographic_with_groups.csv'

# Load data
demo_df = pd.read_csv(demographic_path)
hba1c_df = pd.read_csv(hba1c_path)

# Merge HbA1c data
merged_df = pd.merge(demo_df, hba1c_df[['PtID', 'hba1c']], on='PtID', how='left')

# Add Age Group
def get_age_group(age):
    if pd.isna(age): return 'Unknown'
    try:
        age = float(age)
        if age < 10: return '<10'
        if age < 20: return '<20'
        if age <= 65: return '20-65'
        return '≥65'
    except:
        return 'Unknown'

merged_df['Age_group'] = merged_df['AgeAsofEnrollDt'].apply(get_age_group)

# Enhanced trial participation check
def has_trial_data(ptid):
    """Check for CGM files with both:
    1. Original PtID format (1_)
    2. 2xxx format (2001_)"""
    try:
        with zipfile.ZipFile(trial_zip_path) as z:
            # Try both possible formats
            patterns = [
                f"{ptid}_",       # Original format (1_)
                f"2{int(ptid):03d}_"  # 2xxx format (2001_)
            ]

            # Check all files in zip
            for f in z.namelist():
                if any(f.startswith(p) for p in patterns) and \
                   f.endswith('.npy') and \
                   not f.endswith('_time.npy'):
                    return True
            return False
    except Exception as e:
        print(f"Error checking trial data for PtID {ptid}: {str(e)}")
        return False

merged_df['In_trial'] = merged_df['PtID'].apply(
    lambda x: '1' if has_trial_data(x) else '0'
)

# Verify matches
print("Sample PtID mappings:")
sample_ptids = merged_df['PtID'].head(5)
for ptid in sample_ptids:
    print(f"Demographic PtID: {ptid} → CGM patterns: [{ptid}_, 2{int(ptid):03d}_]")

# Save results
merged_df.to_csv(output_path, index=False)
print(f"\nEnhanced demographic data saved to {output_path}")
print("\nTrial participation summary:")
print(merged_df['In_trial'].value_counts())

Sample PtID mappings:
Demographic PtID: 93 → CGM patterns: [93_, 2093_]
Demographic PtID: 93 → CGM patterns: [93_, 2093_]
Demographic PtID: 93 → CGM patterns: [93_, 2093_]
Demographic PtID: 93 → CGM patterns: [93_, 2093_]
Demographic PtID: 93 → CGM patterns: [93_, 2093_]

Enhanced demographic data saved to /content/drive/MyDrive/Digital Health/T1_data_time_trial/Wadwa2023_demographic_with_groups.csv

Trial participation summary:
In_trial
No    370
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
import os
import zipfile



# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
demographic_path = f'{base_path}/Brown2019_demographic_with_groups.csv'  # Contains existing HbA1c
trial_zip_path = f'{base_path}/Brown2019.zip'
output_path = f'{base_path}/Brown2019_demographic_with_groups.csv'

df = pd.read_csv(demographic_path)
print(f"Loaded demographic data with {len(df)} subjects")

# Check HbA1c column
if 'hba1c' not in df.columns:
    raise ValueError("HbA1c column not found in demographic data")

# Function to convert PtID to CGM file format
def ptid_to_cgm_patterns(ptid):
    """Generate all possible CGM filename patterns for a PtID"""
    ptid = int(ptid)
    return [
        f"{ptid}_",        # Original format (1_)
        f"2{ptid:03d}_",   # 2xxx format (2001_)
        f"5{ptid:03d}_"    # 5xxx format (5001_)
    ]

# Function to get CGM readings from zip
def get_cgm_readings(ptid):
    """Extract all CGM readings for a patient from zip"""
    patterns = ptid_to_cgm_patterns(ptid)
    readings = []

    try:
        with zipfile.ZipFile(trial_zip_path) as z:
            for pattern in patterns:
                for file in z.namelist():
                    if (file.startswith(pattern) and
                        file.endswith('.npy') and
                        not '_time.npy' in file):
                        with z.open(file) as f:
                            data = np.load(f)
                            valid = data[~np.isnan(data)]
                            readings.extend(valid)
    except Exception as e:
        print(f"Error processing PtID {ptid}: {str(e)}")

    return readings

# Process missing HbA1c
missing_mask = df['hba1c'].isna()
print(f"\nFound {missing_mask.sum()} subjects with missing HbA1c")

if missing_mask.any():
    df['Estimated_hba1c'] = np.nan
    df['CGM_Readings_Used'] = 0

    for idx, row in df[missing_mask].iterrows():
        ptid = row['PtID']
        readings = get_cgm_readings(ptid)

        if readings:
            mean_glucose = np.mean(readings)
            estimated = (mean_glucose + 46.7) / 28.7
            df.at[idx, 'Estimated_hba1c'] = estimated
            df.at[idx, 'CGM_Readings_Used'] = len(readings)
            print(f"PtID {ptid}: Estimated {estimated:.2f}% from {len(readings)} readings")

    # Create final combined column
    df['Final_hba1c'] = df['hba1c'].combine_first(df['Estimated_hba1c'])
else:
    df['Final_hba1c'] = df['hba1c']

# Add clinical groups
def get_hba1c_group(value):
    if pd.isna(value): return 'Unknown'
    value = float(value)
    if value < 7: return '<7%'
    if value < 8.5: return '7-8.5%'
    return '≥8.5%'

df['hba1c_Group'] = df['Final_hba1c'].apply(get_hba1c_group)

# Save results
df.to_csv(output_path, index=False)

# Summary report
print("\nFinal Summary:")
print("=" * 40)
print(f"Subjects with lab HbA1c: {len(df) - missing_mask.sum()}")
print(f"Subjects with estimated HbA1c: {df['Estimated_hba1c'].notna().sum()}")
print(f"Subjects still missing HbA1c: {df['Final_hba1c'].isna().sum()}")
print("\nHbA1c Group Distribution:")
print(df['hba1c_Group'].value_counts(dropna=False))
print(f"\nResults saved to {output_path}")

Loaded demographic data with 1096 subjects

Found 0 subjects with missing HbA1c

Final Summary:
Subjects with lab HbA1c: 1096


KeyError: 'Estimated_hba1c'

In [None]:
import pandas as pd

file_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial/Wadwa2023_demographic_with_groups.csv'
df = pd.read_csv(file_path)
#df = df.iloc[:, :-4]
df = df.drop(df.columns[-5], axis=1) #5 from behind
df.to_csv('/content/drive/MyDrive/Digital Health/T1_data_time_trial/Wadwa2023_demographic_with_groups.csv', index=False)

In [None]:
#to check t1 granada
import pandas as pd
import numpy as np
import zipfile
from google.colab import drive



# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
demographic_path = f'{base_path}/T1_granada_demographic.csv'  # Contains 'Value' column with HbA1c
trial_zip_path = f'{base_path}/T1_granada_missing.zip'      # Contains numeric ID CGM files
output_path = f'{base_path}/T1_granada_demographic_with_groups.csv'

# Load demographic data
df = pd.read_csv(demographic_path)
print(f"Loaded data with {len(df)} subjects")
print("Existing HbA1c values:", df['Value'].notna().sum(), "| Missing:", df['Value'].isna().sum())

# Extract numeric ID from Patient_ID (LIB123 → 123)
df['NumericID'] = df['Patient_ID'].str.extract('(\d+)').astype(int)

# Function to estimate HbA1c from CGM
def estimate_from_cgm(numeric_id):
    """Calculate mean glucose from CGM data and convert to HbA1c estimate"""
    readings = []
    try:
        with zipfile.ZipFile(trial_zip_path) as z:
            pattern = f"{numeric_id}_"  # Looks for files like "123_*.npy"
            for file in z.namelist():
                if file.startswith(pattern) and file.endswith('.npy') and not '_time.npy' in file:
                    with z.open(file) as f:
                        data = np.load(f)
                        valid = data[~np.isnan(data)]
                        readings.extend(valid)
    except Exception as e:
        print(f"Error processing {numeric_id}: {str(e)}")

    if readings:
        mean_glucose = np.mean(readings)
        return (mean_glucose + 46.7) / 28.7  # Standard formula
    return np.nan

# Process missing HbA1c values
missing_mask = df['Value'].isna()
print(f"\nEstimating for {missing_mask.sum()} missing values...")

if missing_mask.any():
    df['Estimated_HbA1c'] = np.nan
    for idx, row in df[missing_mask].iterrows():
        numeric_id = row['NumericID']
        estimated = estimate_from_cgm(numeric_id)
        if not np.isnan(estimated):
            df.at[idx, 'Estimated_HbA1c'] = estimated
            print(f"{row['Patient_ID']}: Estimated {estimated:.2f}%")

    # Create final combined column
    df['Final_HbA1c'] = df['Value'].combine_first(df['Estimated_HbA1c'])
else:
    df['Final_HbA1c'] = df['Value']

# Save results (drop temporary NumericID column)
df.drop('NumericID', axis=1).to_csv(output_path, index=False)

# Final report
print("\nResults Summary:")
print("=" * 40)
print(f"Original HbA1c values: {df['Value'].notna().sum()}")
print(f"Estimated HbA1c values: {df['Estimated_HbA1c'].notna().sum()}")
print(f"Still missing: {df['Final_HbA1c'].isna().sum()}")
print(f"\nSaved complete data to: {output_path}")

Loaded data with 451 subjects


KeyError: 'Value'

In [None]:
#for checking tamborlane id 303
import pandas as pd
import zipfile
from google.colab import drive


# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
cgm_zip_path = f'{base_path}/Tamborlane2008.zip'
subject_id = 303  # The ID you want to check



# 2. Check in CGM zip file
print("\nChecking CGM data...")
try:
    with zipfile.ZipFile(cgm_zip_path) as z:
        # Look for files with pattern "303_*.npy" or "LIB303_*.npy"
        cgm_files = [f for f in z.namelist()
                    if (f.startswith(f'{subject_id}_') or
                        f.startswith(f'LIB{subject_id}_'))
                    and f.endswith('.npy')
                    and not '_time.npy' in f]

        print(f"CGM files found: {len(cgm_files)}")
        if cgm_files:
            print("Matching CGM files:")
            for file in cgm_files[:3]:  # Show first 3 files if many exist
                print(f" - {file}")
            if len(cgm_files) > 3:
                print(f" - ...and {len(cgm_files)-3} more")
except Exception as e:
    print(f"Error reading CGM zip: {str(e)}")

print("\nCheck complete.")


Checking CGM data...
CGM files found: 2
Matching CGM files:
 - 303_1.npy
 - 303_0.npy

Check complete.


In [None]:
#tamborlane hba1c fill
import pandas as pd
import numpy as np
import zipfile



# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
demographic_path = f'{base_path}/Tamborlane2008_demographic.csv'
cgm_zip_path = f'{base_path}/Tamborlane2008.zip'
output_path = f'{base_path}/Tamborlane2008_demographic_update.csv'
target_ptid = 303  # The specific PtID we're checking

# Load demographic data
df = pd.read_csv(demographic_path)

# Function to calculate HbA1c from CGM data
def calculate_hba1c_from_cgm(ptid):
    """Calculate HbA1c from CGM data using standard formula"""
    glucose_readings = []
    try:
        with zipfile.ZipFile(cgm_zip_path) as z:
            # Check both possible ID formats
            patterns = [f"{ptid}_", f"LIB{ptid}_"]

            for pattern in patterns:
                for file in z.namelist():
                    if (file.startswith(pattern) and
                        file.endswith('.npy') and
                        not '_time.npy' in file):
                        with z.open(file) as f:
                            data = np.load(f)
                            valid_values = data[~np.isnan(data)]
                            glucose_readings.extend(valid_values)

        if glucose_readings:
            mean_glucose = np.mean(glucose_readings)
            return (mean_glucose + 46.7) / 28.7  # Standard formula
        return np.nan
    except Exception as e:
        print(f"Error processing PtID {ptid}: {str(e)}")
        return np.nan

# Check for PtID 303 with missing LabA1cResult but having CGM data
def process_ptid_303():
    # Find the specific patient record
    ptid_mask = (df['PtID'].astype(str) == str(target_ptid)) | \
                (df['PtID'].astype(str) == f'LIB{target_ptid}')

    if not ptid_mask.any():
        print(f"PtID {target_ptid} not found in demographic data")
        return

    patient_data = df[ptid_mask]
    print(f"Found PtID {target_ptid} in demographic data")

    # Check if LabA1cResult is missing
    if patient_data['LabA1cResult'].isna().any():
        print("LabA1cResult is missing - checking for CGM data...")

        # Calculate HbA1c from CGM
        estimated_hba1c = calculate_hba1c_from_cgm(target_ptid)

        if not np.isnan(estimated_hba1c):
            print(f"Estimated HbA1c: {estimated_hba1c:.2f}%")

            # Update the dataframe
            idx = patient_data.index[0]
            df.at[idx, 'LabA1cResult'] = estimated_hba1c

            # Update a1c_mean_group based on the new value
            if estimated_hba1c < 7:
                df.at[idx, 'a1c_mean_group'] = '<7%'
            elif estimated_hba1c < 8.5:
                df.at[idx, 'a1c_mean_group'] = '7-8.5%'
            else:
                df.at[idx, 'a1c_mean_group'] = '≥8.5%'

            print("Successfully updated LabA1cResult and a1c_mean_group")
        else:
            print("No CGM data available for estimation")
    else:
        print(f"LabA1cResult exists: {patient_data['LabA1cResult'].values[0]}%")

# Execute the processing
process_ptid_303()

# Save the updated dataframe
df.to_csv(output_path, index=False)
print(f"\nUpdated demographic data saved to: {output_path}")

Found PtID 303 in demographic data
LabA1cResult is missing - checking for CGM data...
Estimated HbA1c: 6.86%
Successfully updated LabA1cResult and a1c_mean_group

Updated demographic data saved to: /content/drive/MyDrive/Digital Health/T1_data_time_trial/Tamborlane2008_demographic_update.csv


In [None]:
import pandas as pd
import numpy as np
import zipfile
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
demographic_path = f'{base_path}/brown2019_demographics.csv'
hba1c_path = f'{base_path}/brown2019_hba1c.csv'
cgm_zip_path = f'{base_path}/Brown2019.zip'
output_path = f'{base_path}/brown_demographic_with_avg_hba1c.csv'

# Load data
demo_df = pd.read_csv(demographic_path)
hba1c_df = pd.read_csv(hba1c_path)

# 1. Calculate average HbA1c per PtID
print("Calculating average HbA1c from hba1c file...")
avg_hba1c = hba1c_df.groupby('PtID')['hba1c'].mean().reset_index()
avg_hba1c.columns = ['PtID', 'avg_hba1c']

# 2. Merge with demographic data
print("\nMerging with demographic data...")
merged_df = pd.merge(
    demo_df,
    avg_hba1c,
    on='PtID',
    how='left'
)

# 3. Add clinical groupings
def get_hba1c_group(value):
    if pd.isna(value): return 'Unknown'
    value = float(value)
    if value < 7: return '<7%'
    if value < 8.5: return '7-8.5%'
    return '≥8.5%'

merged_df['hba1c_Group'] = merged_df['avg_hba1c'].apply(get_hba1c_group)

def get_age_group(age):
    if pd.isna(age): return 'Unknown'
    age = float(age)
    if age < 10: return '<10'
    if age < 20: return '10-19'
    if age < 65: return '20-64'
    return '≥65'

merged_df['age_group'] = merged_df['AgeAtEnrollment'].apply(get_age_group)

# 4. Check CGM Participation - Updated for 5xxx.0_x.npy format
def check_cgm_participation(zip_file):
    """Extract IDs from filenames in format 5xxx.0_x.npy"""
    cgm_ids = set()
    for file in zip_file.namelist():
        if file.endswith('.npy') and not '_time.npy' in file:
            try:
                # Extract base filename (e.g., "5001.0_4.npy" → "5001")
                base_name = file.split('/')[-1]  # Handle subdirectories
                ptid = int(base_name.split('.')[0])  # Get the integer part before first dot
                cgm_ids.add(ptid)
            except (ValueError, IndexError):
                print(f"Couldn't parse ID from: {file}")
                continue
    return cgm_ids

print("\nChecking CGM participation...")
with zipfile.ZipFile(cgm_zip_path) as z:
    cgm_participants = check_cgm_participation(z)
    print(f"Found {len(cgm_participants)} unique subjects with CGM data")
    merged_df['has_cgm'] = merged_df['PtID'].isin(cgm_participants)

    # Print verification info
    print("\nFirst 5 CGM files found:")
    for file in z.namelist()[:5]:
        print(file)

# Identify missing subjects
missing_cgm = merged_df[~merged_df['has_cgm']]['PtID'].unique()
print(f"\nSubjects without CGM data ({len(missing_cgm)}): {sorted(missing_cgm)}")

# Verify with sample
sample_check = merged_df[['PtID', 'has_cgm']].sample(5)
print("\nSample verification:")
print(sample_check)

# Summary report
print("\nFinal Summary:")
print("=" * 40)
print(f"Total subjects: {len(merged_df)}")
print(f"Subjects with HbA1c data: {merged_df['avg_hba1c'].notna().sum()}")
print(f"Subjects with CGM data: {merged_df['has_cgm'].sum()}")
print("\nHbA1c Group Distribution:")
print(merged_df['hba1c_Group'].value_counts(dropna=False))
print("\nAge Group Distribution:")
print(merged_df['age_group'].value_counts(dropna=False))

# Save results
merged_df.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Calculating average HbA1c from hba1c file...

Merging with demographic data...

Checking CGM participation...
Found 167 unique subjects with CGM data

First 5 CGM files found:
5050.0_13.npy
5140.0_5.npy
5045.0_5.npy
5164.0_1.npy
5095.0_4.npy

Subjects without CGM data (3): [np.int64(5033), np.int64(5068), np.int64(5084)]

Sample verification:
     PtID  has_cgm
107  5109     True
1    5002     True
67   5069     True
56   5058     True
27   5029     True

Final Summary:
Total subjects: 170
Subjects with HbA1c data: 170
Subjects with CGM data: 167

HbA1c Group Distribution:
hba1c_Group
7-8.5%    96
<7%       64
≥8.5%     10
Name: count, dtype: int64

Age Group Distribution:
age_group
20-64    116
10-19     51
≥65        3
Name: count, dtype: int64

Saved to: /content/drive/MyDrive/Digital Health/T1_data_time_trial/brown_demographic_with_avg_hba1c.csv


In [None]:
import pandas as pd
import numpy as np
import zipfile

# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
demographic_path = f'{base_path}/wadwa2023_demographic.csv'
hba1c_path = f'{base_path}/Wadwa_hba1c.csv'
cgm_zip_path = f'{base_path}/wadwa2023.zip'
output_path = f'{base_path}/wadwa_demographic_with_avg_hba1c.csv'

# Load data
demo_df = pd.read_csv(demographic_path)
hba1c_df = pd.read_csv(hba1c_path)

def calculate_hba1c_from_cgm(ptid):
    """Calculate HbA1c from CGM data - handles both '101.npy' and '101_2.npy' formats"""
    glucose_readings = []
    try:
        with zipfile.ZipFile(cgm_zip_path) as z:
            # Try both filename patterns:
            patterns = [f"{ptid}.npy", f"{ptid}_"]

            for pattern in patterns:
                for file in z.namelist():
                    if (file.startswith(pattern) and file.endswith('.npy')) and not '_time.npy' in file:
                        with z.open(file) as f:
                            data = np.load(f)
                            valid_values = data[~np.isnan(data)]
                            glucose_readings.extend(valid_values)

        if glucose_readings:
            mean_glucose = np.mean(glucose_readings)
            return (mean_glucose + 46.7) / 28.7
        return np.nan
    except Exception as e:
        print(f"Error processing PtID {ptid}: {str(e)}")
        return np.nan

def check_cgm_participation(zip_file):
    """Updated to match both ID formats"""
    cgm_ids = set()
    for file in zip_file.namelist():
        if file.endswith('.npy') and not '_time.npy' in file:
            try:
                # Extract ID from either "101.npy" or "101_2.npy"
                base = file.split('/')[-1].split('_')[0].replace('.npy','')
                cgm_ids.add(int(base))
            except ValueError:
                continue
    return cgm_ids

# 1. Process individual HbA1c measurements
print("Processing individual HbA1c measurements...")
filled_hba1c_df = hba1c_df.copy()
missing_measurements = filled_hba1c_df[filled_hba1c_df['hba1c'].isna()]

print(f"\nFound {len(missing_measurements)} missing measurements for these IDs:")
print(missing_measurements['PtID'].unique())

# Track which IDs we filled
filled_from_cgm = []
still_missing = []

for idx, row in missing_measurements.iterrows():
    ptid = row['PtID']
    cgm_hba1c = calculate_hba1c_from_cgm(ptid)
    if not np.isnan(cgm_hba1c):
        filled_hba1c_df.at[idx, 'hba1c'] = cgm_hba1c
        filled_from_cgm.append(ptid)
    else:
        still_missing.append(ptid)

print("\nSuccessfully filled HbA1c from CGM for these IDs:")
print(np.unique(filled_from_cgm))

print("\nStill missing HbA1c (no CGM data available) for these IDs:")
print(np.unique(still_missing))

# 2. Calculate average HbA1c per subject
print("\nCalculating average HbA1c per subject...")
avg_hba1c = filled_hba1c_df.groupby('PtID')['hba1c'].mean().reset_index()
avg_hba1c.columns = ['PtID', 'avg_hba1c']

# Identify subjects still missing HbA1c
missing_subjects = avg_hba1c[avg_hba1c['avg_hba1c'].isna()]['PtID'].unique()
print(f"\nSubjects with no HbA1c data at all: {missing_subjects}")

# [Rest of the merging, grouping and saving code remains the same]

# 3. Identify subjects still missing HbA1c (no measurements at all)
missing_subjects = avg_hba1c[avg_hba1c['avg_hba1c'].isna()]['PtID'].unique()
print(f"\nFound {len(missing_subjects)} subjects with no HbA1c data")

# 4. Calculate HbA1c from CGM for completely missing subjects
print("\nCalculating HbA1c for subjects with no measurements...")
for ptid in missing_subjects:
    cgm_hba1c = calculate_hba1c_from_cgm(ptid)
    if not np.isnan(cgm_hba1c):
        # Add new row with CGM-estimated value
        avg_hba1c.loc[len(avg_hba1c)] = [ptid, cgm_hba1c]
        print(f"Estimated HbA1c for PtID {ptid}: {cgm_hba1c:.2f}%")

# [Previous code remains the same until the missing measurements section]

# 1. Process individual HbA1c measurements
print("Processing individual HbA1c measurements...")

# Create a copy to store filled values
filled_hba1c_df = hba1c_df.copy()

# Identify missing individual measurements
missing_measurements = filled_hba1c_df[filled_hba1c_df['hba1c'].isna()]
print(f"\nFound {len(missing_measurements)} missing individual HbA1c measurements for these subjects:")
print(missing_measurements[['PtID', 'hba1c']].to_string(index=False))  # Show PtIDs with missing values

# [Rest of the code remains the same]

# 5. Merge with demographic data
print("\nMerging with demographic data...")
merged_df = pd.merge(
    demo_df,
    avg_hba1c,
    on='PtID',
    how='left'
)

# 6. Add clinical groupings
def get_hba1c_group(value):
    if pd.isna(value): return 'Unknown'
    value = float(value)
    if value < 7: return '<7%'
    if value < 8.5: return '7-8.5%'
    return '≥8.5%'

merged_df['hba1c_Group'] = merged_df['avg_hba1c'].apply(get_hba1c_group)

def get_age_group(age):
    if pd.isna(age): return 'Unknown'
    age = float(age)
    if age < 10: return '<10'
    if age < 20: return '10-19'
    if age < 65: return '20-64'
    return '≥65'

merged_df['age_group'] = merged_df['AgeAsofEnrollDt'].apply(get_age_group)

# 7. Check CGM Participation
def check_cgm_participation(zip_file):
    """Extract IDs from filenames in format 5xxx.0_x.npy"""
    cgm_ids = set()
    for file in zip_file.namelist():
        if file.endswith('.npy') and not '_time.npy' in file:
            try:
                base_name = file.split('/')[-1]
                ptid = int(base_name.split('.')[0])
                cgm_ids.add(ptid)
            except (ValueError, IndexError):
                continue
    return cgm_ids

print("\nChecking CGM participation...")
with zipfile.ZipFile(cgm_zip_path) as z:
    cgm_participants = check_cgm_participation(z)
    print(f"Found {len(cgm_participants)} unique subjects with CGM data")
    merged_df['has_cgm'] = merged_df['PtID'].isin(cgm_participants)

# Final Summary
print("\nFinal Summary:")
print("=" * 40)
print(f"Total subjects: {len(merged_df)}")
print(f"Subjects with original HbA1c measurements: {len(hba1c_df['PtID'].unique())}")
print(f"Missing measurements filled from CGM: {len(missing_measurements) - filled_hba1c_df['hba1c'].isna().sum()}")
print(f"Subjects with CGM-estimated HbA1c (no measurements): {len(missing_subjects) - merged_df['avg_hba1c'].isna().sum()}")
print(f"Subjects still missing HbA1c: {merged_df['avg_hba1c'].isna().sum()}")
print("\nHbA1c Group Distribution:")
print(merged_df['hba1c_Group'].value_counts(dropna=False))
print("\nAge Group Distribution:")
print(merged_df['age_group'].value_counts(dropna=False))

# Save results
merged_df.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")

Processing individual HbA1c measurements...

Found 19 missing measurements for these IDs:
[ 50  47  59  77  81  91  82  44  15  40  37  65  11  19  68 101  97]

Successfully filled HbA1c from CGM for these IDs:
[]

Still missing HbA1c (no CGM data available) for these IDs:
[ 11  15  19  37  40  44  47  50  59  65  68  77  81  82  91  97 101]

Calculating average HbA1c per subject...

Subjects with no HbA1c data at all: []

Found 0 subjects with no HbA1c data

Calculating HbA1c for subjects with no measurements...
Processing individual HbA1c measurements...

Found 19 missing individual HbA1c measurements for these subjects:
 PtID  hba1c
   50    NaN
   47    NaN
   59    NaN
   77    NaN
   81    NaN
   91    NaN
   82    NaN
   44    NaN
   15    NaN
   40    NaN
   37    NaN
   65    NaN
   11    NaN
   65    NaN
   19    NaN
   68    NaN
  101    NaN
   65    NaN
   97    NaN

Merging with demographic data...

Checking CGM participation...
Found 861 unique subjects with CGM data

Fin

In [11]:
#t1 missing hbaqc val and id
import pandas as pd
import numpy as np
import zipfile


# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
demographic_path = f'{base_path}/T1_granada_demographic_with_groups.csv'
cgm_zip_path = f'{base_path}/T1_granada.zip'
missing_zip_path = f'{base_path}/T1_granada_missing (1).zip'
time_zip_path = f'{base_path}/T1_granada_time.zip'
output_path = f'{base_path}/T1_granada_demographic_with_groups.csv'

# Load demographic data
print("Loading demographic data...")
demo_df = pd.read_csv(demographic_path)

# Identify patients with missing HbA1c
missing_hba1c_patients = demo_df[demo_df['Value'].isna()]['Patient_ID'].unique()
print(f"\nFound {len(missing_hba1c_patients)} patients with missing HbA1c:")
print(missing_hba1c_patients)

# Function to examine ZIP file contents
def examine_zip(zip_path):
    print(f"\nExamining {zip_path.split('/')[-1]} contents:")
    with zipfile.ZipFile(zip_path) as z:
        for i, file in enumerate(z.namelist()[:5]):
            print(f"  {file}")
        print(f"  ... (showing first 5 of {len(z.namelist())} files)")

# Examine file formats
examine_zip(cgm_zip_path)
examine_zip(missing_zip_path)
examine_zip(time_zip_path)

# Function to calculate HbA1c from CGM
import os

def calculate_hba1c_from_cgm(ptid):
    """Calculate HbA1c from CGM data using standard formula"""
    glucose_readings = []

    # Extract numeric part of patient ID if prefixed
    numeric_id = ''.join(filter(str.isdigit, ptid))

    # Patterns to match filenames
    possible_ids = [ptid, numeric_id, f"GR{numeric_id}"]

    for zip_path in [cgm_zip_path, missing_zip_path]:
        try:
            with zipfile.ZipFile(zip_path) as z:
                for file in z.namelist():
                    basename = os.path.basename(file)
                    if basename.endswith('.npy') and '_time.npy' not in basename:
                        for pid in possible_ids:
                            if basename.startswith(pid):
                                with z.open(file) as f:
                                    data = np.load(f)
                                    valid_values = data[~np.isnan(data)]
                                    glucose_readings.extend(valid_values)
                                break  # No need to check other patterns if one matched
        except Exception as e:
            print(f"Error reading {zip_path} for PtID {ptid}: {str(e)}")

    if glucose_readings:
        mean_glucose = np.mean(glucose_readings)
        estimated_hba1c = (mean_glucose + 46.7) / 28.7
        print(f"  Calculated HbA1c {estimated_hba1c:.2f}% for Patient_ID {ptid}")
        return estimated_hba1c

    return np.nan



# Process missing HbA1c values
print("\nProcessing missing HbA1c values...")
filled_count = 0

for ptid in missing_hba1c_patients:
    print(f"\nProcessing Patient_ID {ptid}:")
    hba1c = calculate_hba1c_from_cgm(ptid)

    if not np.isnan(hba1c):
        demo_df.loc[demo_df['Patient_ID'] == ptid, 'Value'] = hba1c
        demo_df.loc[demo_df['Patient_ID'] == ptid, 'hba1c_source'] = 'CGM-estimated'
        filled_count += 1
        print(f"  Successfully filled missing HbA1c")
    else:
        print(f"  Could not calculate HbA1c - no CGM data available")

# Add clinical groupings
def get_hba1c_group(value):
    if pd.isna(value): return 'Unknown'
    value = float(value)
    if value < 7: return '<7%'
    if value < 8.5: return '7-8.5%'
    return '≥8.5%'

demo_df['a1c_mean_group'] = demo_df['Value'].apply(get_hba1c_group)

# Final summary
print("\nFinal Summary:")
print("="*50)
print(f"Total patients: {len(demo_df)}")
print(f"Patients with missing HbA1c: {len(missing_hba1c_patients)}")
print(f"Successfully filled from CGM: {filled_count}")
print(f"Still missing: {len(missing_hba1c_patients) - filled_count}")
print("\nHbA1c Group Distribution:")
print(demo_df['a1c_mean_group'].value_counts(dropna=False))

# Save results
demo_df.to_csv(output_path, index=False)
print(f"\nSaved results to: {output_path}")

Loading demographic data...

Found 16 patients with missing HbA1c:
['LIB193263' 'LIB193264' 'LIB193269' 'LIB193280' 'LIB193511' 'LIB193521'
 'LIB193536' 'LIB193650' 'LIB193764' 'LIB193769' 'LIB193801' 'LIB193806'
 'LIB193812' 'LIB193835' 'LIB193865' 'LIB194063']

Examining T1_granada.zip contents:
  T1_granada/193263_0.npy
  T1_granada/193263_1.npy
  T1_granada/193263_10.npy
  T1_granada/193263_11.npy
  T1_granada/193263_12.npy
  ... (showing first 5 of 13680 files)

Examining T1_granada_missing (1).zip contents:
  T1_granada_missing/LIB193263.csv
  T1_granada_missing/LIB193266.csv
  T1_granada_missing/LIB193267.csv
  T1_granada_missing/LIB193269.csv
  T1_granada_missing/LIB193272.csv
  ... (showing first 5 of 594 files)

Examining T1_granada_time.zip contents:
  T1_granada_time/193263_0_time.npy
  T1_granada_time/193263_1_time.npy
  T1_granada_time/193263_10_time.npy
  T1_granada_time/193263_11_time.npy
  T1_granada_time/193263_12_time.npy
  ... (showing first 5 of 13680 files)

Proce

In [34]:
import pandas as pd
import numpy as np
import zipfile
import os

# Configuration
base_path = '/content/drive/MyDrive/Digital Health/T1_data_time_trial'
demographic_path = f'{base_path}/T1_granada_demographic_with_groups.csv'
cgm_zip_path = f'{base_path}/T1_granada.zip'
missing_zip_path = f'{base_path}/T1_granada_missing (1).zip'
time_zip_path = f'{base_path}/T1_granada_time.zip'
output_path = f'{base_path}/T1_granada_demographic_with_groups.csv'

csv_file_name = '193264_0.npy'

with zipfile.ZipFile(time_zip_path) as z:
    # Handle any prefix path
    matching_file = next((f for f in z.namelist() if f.endswith(csv_file_name)), None)

    if matching_file:
        with z.open(matching_file) as f:
            df = np.load(f)
            print(f"\nContents of {matching_file}:")
            print(df)  # Show first 10 rows
    else:
        print(f"File {csv_file_name} not found in ZIP.")

File 193264_0.npy not found in ZIP.


3