In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

file_path = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Shah_demographic_with_groups.csv'
df = pd.read_csv(file_path)
#df = df.iloc[:, :-4]
df = df.drop(df.columns[-2], axis=1)
df.to_csv('/content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Shah_demographic_with_groups.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import drive

# Mount Google Drive
#drive.mount('/content/drive')

# Configuration
base_path = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial'
demographic_path = f'{base_path}/Shah2019_demographic.csv'  # Your main demographic file
output_path = f'{base_path}/Shah_demographic_with_groups.csv'  # Enhanced output file
trial_name = 'Shah2019'  # Change this to match your trial name
cgm_folder = f'{base_path}/{trial_name}'  # Folder containing CGM files

# =============================================
# STEP 1: Load demographic data
# =============================================

# Load data while preserving original columns
df = pd.read_csv(demographic_path)
print("Original columns:", df.columns.tolist())

# =============================================
# STEP 2: Add grouping columns
# =============================================

# 1. Age Group (new column)
def get_age_group(age):
    if pd.isna(age):
        return 'Unknown'
    try:
        age = float(age)
        if age < 10: return '<10'
        if age < 20: return '10-19'
        if age < 65: return '20-64'
        return '≥65'
    except:
        return 'Unknown'

# Find age column (case insensitive)
age_col = next((col for col in df.columns if col.lower() == 'age'), None)
if age_col:
    df['Age_group'] = df[age_col].apply(get_age_group)
else:
    print("Warning: No age column found")
    df['Age_group'] = 'Unknown'

# 2. Sex/Gender (use existing column if available)
sex_col = next((col for col in df.columns if col.lower() in ['sex', 'gender']), None)
if not sex_col:
    print("Warning: No sex/gender column found")
    df['Sex'] = 'Unknown'  # Add default column if missing

# 3. Trial Participation (new column)
def has_trial_data(ptid):
    """Check if patient has CGM data files"""
    try:
        # Convert PtID to CGM file format (9001 for PtID 1)
        cgm_id = f"9{int(ptid):03d}"
        # Check for any CGM files for this patient
        files = [f for f in os.listdir(cgm_folder)
                if f.startswith(f"{cgm_id}_")
                and f.endswith('.npy')
                and not f.endswith('_time.npy')]
        return 'Yes' if files else 'No'
    except:
        return 'Unknown'

df['Has_CGM_Data'] = df['PtID'].apply(has_trial_data)

# =============================================
# STEP 3: Save enhanced demographic file
# =============================================

df.to_csv(output_path, index=False)
print(f"\nEnhanced demographic file saved to: {output_path}")
print("New columns added:", ['Age_group', 'Has_CGM_Data'])

# Show sample of results
print("\nSample of enhanced data:")
print(df[['PtID', age_col if age_col else '', sex_col if sex_col else '',
          'Age_group', 'Has_CGM_Data']].head().to_string())

Original columns: ['PtID', 'PtStatus', 'AgeAsOfEnrollDt', 'Gender', 'age_group', 'file_name', 'filter_in', 'HbA1c', 'Estimated_HbA1c', 'CGM_Readings_Used', 'Final_HbA1c']

Enhanced demographic file saved to: /content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Shah_demographic_with_groups.csv
New columns added: ['Age_group', 'Has_CGM_Data']

Sample of enhanced data:


KeyError: "[''] not in index"

In [None]:
import os
import numpy as np
import pandas as pd

# Configuration
demographic_path = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Shah2019_demographic.csv'
hba1c_path = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Shah2019_hba1c.csv'
cgm_folder = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Shah2019.zip'  # Specific CGM folder
output_path = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Shah2019_demographic_hba1c.csv'

# =============================================
# ID MAPPING SYSTEM (Adjust as needed)
# =============================================
def cgm_to_ptid(cgm_id):
    """Convert CGM filename ID to demographic PtID"""
    # Example: 9001 → 1 (for Shah2019 dataset)
    return int(str(cgm_id)[-3:])  # Takes last 3 digits

def ptid_to_cgm(ptid):
    """Convert demographic PtID to CGM filename pattern"""
    # Example: 1 → 9001 (for Shah2019 dataset)
    return f"9{ptid:03d}"  # Formats as 9 followed by 3-digit ID

# =============================================
# STEP 1: Load and merge data
# =============================================
demo_df = pd.read_csv(demographic_path)
hba1c_df = pd.read_csv(hba1c_path)

merged_df =
pd.merge(
    demo_df,
    hba1c_df[['PtID', 'HbA1c']],
    on='PtID',
    how='left'
)

# =============================================
# STEP 2: Estimate missing HbA1c from CGM data
# =============================================
def get_cgm_readings(ptid):
    """Get all CGM readings for a patient"""
    cgm_id = ptid_to_cgm(ptid)
    readings = []

    for f in os.listdir(cgm_folder):
        if not f.startswith(f"{cgm_id}_") or not f.endswith('.npy') or '_time.npy' in f:
            continue

        try:
            data = np.load(os.path.join(cgm_folder, f))
            readings.extend(data[~np.isnan(data)])
        except Exception as e:
            print(f"  Warning: Error loading {f} - {str(e)}")

    return readings

missing_mask = merged_df['HbA1c'].isna()
print(f"Found {missing_mask.sum()} subjects with missing HbA1c")

if missing_mask.any():
    print("Estimating HbA1c from CGM data...")
    estimated_count = 0

    for idx, row in merged_df[missing_mask].iterrows():
        ptid = row['PtID']
        readings = get_cgm_readings(ptid)

        if len(readings) > 0:
            mean_glucose = np.mean(readings)
            estimated_hba1c = (mean_glucose + 46.7) / 28.7
            merged_df.at[idx, 'Estimated_HbA1c'] = estimated_hba1c
            merged_df.at[idx, 'CGM_Readings_Used'] = len(readings)
            estimated_count += 1
            print(f"  PtID {ptid}: Estimated HbA1c = {estimated_hba1c:.2f}% (from {len(readings)} readings)")
        else:
            print(f"  PtID {ptid}: No CGM data found")

    merged_df['Final_HbA1c'] = merged_df['HbA1c'].combine_first(merged_df['Estimated_HbA1c'])
    print(f"\nSuccessfully estimated HbA1c for {estimated_count} subjects")
else:
    merged_df['Final_HbA1c'] = merged_df['HbA1c']
    print("No missing HbA1c values found")

# =============================================
# STEP 3: Save results
# =============================================
output_cols = list(demo_df.columns) + ['HbA1c', 'Estimated_HbA1c', 'CGM_Readings_Used', 'Final_HbA1c']
merged_df[output_cols].to_csv(output_path, index=False)

# =============================================
# STEP 4: Generate report
# =============================================
print("\nFinal Report:")
print("=" * 50)
print(f"Total subjects: {len(merged_df)}")
print(f"Subjects with lab HbA1c: {len(hba1c_df)}")
print(f"Subjects with estimated HbA1c: {merged_df['Estimated_HbA1c'].notna().sum()}")
print(f"Subjects still missing HbA1c: {merged_df['Final_HbA1c'].isna().sum()}")
print("\nSample records:")
print(merged_df[['PtID', 'HbA1c', 'Estimated_HbA1c', 'Final_HbA1c']].head().to_string())

SyntaxError: invalid syntax (<ipython-input-3-6f2597ec0896>, line 30)

In [None]:
import pandas as pd

# Load your enhanced demographic data
df = pd.read_csv('/content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Colas2019_demographic.csv')

# Define HbA1c groups
def categorize_hba1c(value):
    if pd.isna(value):
        return 'Unknown'
    value = float(value)
    if value < 7: return '<7%'
    if value < 8.5: return '7-8.5%'
    return '≥8.5%'

# Create the grouping column
df['HbA1c_Group'] = df['Final_HbA1c'].apply(categorize_hba1c)

# Group statistics (count, mean HbA1c per group)
group_stats = df.groupby('HbA1c_Group')['Final_HbA1c'].agg(['count', 'mean', 'std']).reset_index()
group_stats.columns = ['HbA1c Group', 'Patient Count', 'Mean HbA1c', 'Std Deviation']

# Save results with new grouping column
output_path = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Colas2019_demographic.csv'
df.to_csv(output_path, index=False)

# Display results
print("HbA1c Group Distribution:")
print("=" * 40)
print(group_stats)
print(f"\nSaved data with groups to: {output_path}")

HbA1c Group Distribution:
  HbA1c Group  Patient Count  Mean HbA1c  Std Deviation
0         <7%            207    5.746005       0.317805
1     Unknown              0         NaN            NaN

Saved data with groups to: /content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Colas2019_demographic.csv


In [None]:
#shah
import pandas as pd
import numpy as np
import zipfile

# Configuration
base_path = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial'
demographic_path = f'{base_path}/Shah2019_demographic.csv'
hba1c_path = f'{base_path}/Shah2019_hba1c.csv'
cgm_zip_path = f'{base_path}/Shah2019.zip'
output_path = f'{base_path}/Shah_demographic_with_avg_hba1c.csv'

# Load data
demo_df = pd.read_csv(demographic_path)
hba1c_df = pd.read_csv(hba1c_path)

# 1. Calculate average HbA1c per PtID
print("Calculating average HbA1c from hba1c file...")
avg_hba1c = hba1c_df.groupby('PtID')['HbA1c'].mean().reset_index()
avg_hba1c.columns = ['PtID', 'avg_hba1c']

# 2. Merge with demographic data
print("\nMerging with demographic data...")
merged_df = pd.merge(
    demo_df,
    avg_hba1c,
    on='PtID',
    how='left'
)

# Initialize cgm_estimated_hba1c column
merged_df['cgm_estimated_hba1c'] = np.nan

# 3. Add clinical groupings
def get_hba1c_group(value):
    if pd.isna(value): return 'Unknown'
    value = float(value)
    if value < 7: return '<7%'
    if value < 8.5: return '7-8.5%'
    return '≥8.5%'

merged_df['hba1c_Group'] = merged_df['avg_hba1c'].apply(get_hba1c_group)

def get_age_group(age):
    if pd.isna(age): return 'Unknown'
    age = float(age)
    if age < 10: return '<10'
    if age < 20: return '10-19'
    if age < 65: return '20-64'
    return '≥65'

merged_df['age_group'] = merged_df['AgeAsOfEnrollDt'].apply(get_age_group)

# 4. Check CGM Participation
def check_cgm_participation(zip_file):
    """Check which subjects have CGM data (handles id.0_x.npy format)"""
    cgm_ids = set()
    for file in zip_file.namelist():
        if file.endswith('.npy') and not '_time.npy' in file:
            try:
                # Extract ID from filename in format "id.0_x.npy"
                base_name = file.split('/')[-1]
                ptid = int(base_name.split('.')[0])
                cgm_ids.add(ptid)
            except (ValueError, IndexError):
                continue
    return cgm_ids

print("\nChecking CGM participation...")
with zipfile.ZipFile(cgm_zip_path) as z:
    # Print sample files to verify format
    print("Sample CGM files:")
    for file in z.namelist()[:5]:
        print(file)

    cgm_participants = check_cgm_participation(z)
    print(f"\nFound {len(cgm_participants)} subjects with CGM data")
    merged_df['has_cgm'] = merged_df['PtID'].isin(cgm_participants)

# Check status of subjects missing CGM data
missing_cgm = merged_df[~merged_df['has_cgm']]
print("\nSubjects missing CGM data by status:")
print(missing_cgm['PtStatus'].value_counts(dropna=False))

# 5. Calculate HbA1c from CGM where lab values are missing
def calculate_hba1c_from_cgm(ptid):
    """Calculate HbA1c from CGM data using standard formula"""
    glucose_readings = []
    try:
        with zipfile.ZipFile(cgm_zip_path) as z:
            pattern = f"{ptid}.0_"
            for file in z.namelist():
                if file.startswith(pattern) and file.endswith('.npy') and not '_time.npy' in file:
                    with z.open(file) as f:
                        data = np.load(f)
                        valid_values = data[~np.isnan(data)]
                        glucose_readings.extend(valid_values)

        if glucose_readings:
            mean_glucose = np.mean(glucose_readings)
            return (mean_glucose + 46.7) / 28.7
        return np.nan
    except Exception as e:
        print(f"Error processing PtID {ptid}: {str(e)}")
        return np.nan

print("\nCalculating CGM-based HbA1c for eligible subjects...")
eligible_pts = merged_df[
    (merged_df['avg_hba1c'].isna()) &
    (merged_df['has_cgm']) &
    (merged_df['PtStatus'] != 'Dropped')  # Changed to match your actual status values
]

print(f"Found {len(eligible_pts)} eligible subjects")

# Calculate and store CGM-based HbA1c
for _, row in eligible_pts.iterrows():
    ptid = row['PtID']
    cgm_hba1c = calculate_hba1c_from_cgm(ptid)
    if not pd.isna(cgm_hba1c):
        merged_df.loc[merged_df['PtID'] == ptid, 'cgm_estimated_hba1c'] = cgm_hba1c
        merged_df.loc[merged_df['PtID'] == ptid, 'avg_hba1c'] = cgm_hba1c
        merged_df.loc[merged_df['PtID'] == ptid, 'hba1c_Group'] = get_hba1c_group(cgm_hba1c)

# Final Summary
print("\nFinal Summary:")
print("=" * 40)
print(f"Total subjects: {len(merged_df)}")
print(f"Subjects with lab HbA1c: {len(merged_df[merged_df['cgm_estimated_hba1c'].isna() & merged_df['avg_hba1c'].notna()])}")
print(f"Subjects with CGM-estimated HbA1c: {merged_df['cgm_estimated_hba1c'].notna().sum()}")
print(f"Subjects still missing HbA1c: {merged_df['avg_hba1c'].isna().sum()}")
print("\nHbA1c Group Distribution:")
print(merged_df['hba1c_Group'].value_counts(dropna=False))
print("\nSubjects still missing HbA1c by status:")
print(merged_df[merged_df['avg_hba1c'].isna()]['PtStatus'].value_counts(dropna=False))

# Save results
merged_df.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")

Calculating average HbA1c from hba1c file...

Merging with demographic data...

Checking CGM participation...
Sample CGM files:
62.0_0.npy
91.0_0.npy
142.0_0.npy
145.0_0.npy
158.0_0.npy

Found 164 subjects with CGM data

Subjects missing CGM data by status:
PtStatus
Dropped      32
Completed     5
Name: count, dtype: int64

Calculating CGM-based HbA1c for eligible subjects...
Found 0 eligible subjects

Final Summary:
Total subjects: 201
Subjects with lab HbA1c: 175
Subjects with CGM-estimated HbA1c: 0
Subjects still missing HbA1c: 26

HbA1c Group Distribution:
hba1c_Group
<7%        175
Unknown     26
Name: count, dtype: int64

Subjects still missing HbA1c by status:
PtStatus
Dropped    26
Name: count, dtype: int64

Saved to: /content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Shah_demographic_with_avg_hba1c.csv


In [None]:
import pandas as pd
import numpy as np
import zipfile

# Configuration
base_path = '/content/drive/MyDrive/Digital Health/Healthy_data_time_trial'
demographic_path = f'{base_path}/hall2018_hba1c_demo.csv'
cgm_zip_path = f'{base_path}/Hall2018.zip'
output_path = f'{base_path}/Hall_processed_demographic.csv'

# Load demographic data
print("Loading demographic data...")
demo_df = pd.read_csv(demographic_path)

# Rename columns to consistent naming
demo_df = demo_df.rename(columns={
    'id': 'PtID',
    'Age': 'AgeAtEnrollment',
    'A1C': 'avg_hba1c'  # Assuming this is the HbA1c column
})

# 1. Add clinical groupings
def get_hba1c_group(value):
    if pd.isna(value): return 'Unknown'
    value = float(value)
    if value < 7: return '<7%'
    if value < 8.5: return '7-8.5%'
    return '≥8.5%'

demo_df['hba1c_Group'] = demo_df['avg_hba1c'].apply(get_hba1c_group)

def get_age_group(age):
    if pd.isna(age): return 'Unknown'
    age = float(age)
    if age < 10: return '<10'
    if age < 20: return '10-19'
    if age < 65: return '20-64'
    return '≥65'

demo_df['age_group'] = demo_df['AgeAtEnrollment'].apply(get_age_group)

# 2. Check CGM Participation (handles 8xxx format in ZIP vs id in demographic)
def check_cgm_participation(zip_file):
    """Check which subjects have CGM data (handles 8xxx format)"""
    cgm_ids = set()
    for file in zip_file.namelist():
        if file.endswith('.npy') and not '_time.npy' in file:
            try:
                # Extract ID from filename (assuming format like "8001.npy" or "8001_0.npy")
                base_name = file.split('/')[-1].split('_')[0].replace('.npy', '')
                ptid = int(base_name)
                cgm_ids.add(ptid)
            except (ValueError, IndexError):
                continue
    return cgm_ids

print("\nChecking CGM participation...")
with zipfile.ZipFile(cgm_zip_path) as z:
    # Print sample files to verify format
    print("Sample CGM files:")
    for file in z.namelist()[:5]:
        print(file)

    cgm_participants = check_cgm_participation(z)
    print(f"\nFound {len(cgm_participants)} subjects with CGM data")
    demo_df['has_cgm'] = demo_df['PtID'].isin(cgm_participants)

# 3. Calculate HbA1c from CGM where lab values are missing
def calculate_hba1c_from_cgm(ptid):
    """Calculate HbA1c from CGM data using standard formula"""
    glucose_readings = []
    try:
        with zipfile.ZipFile(cgm_zip_path) as z:
            # Look for files matching either "ptid.npy" or "ptid_*.npy" pattern
            patterns = [f"{ptid}.npy", f"{ptid}_"]

            for pattern in patterns:
                for file in z.namelist():
                    if (file.startswith(pattern) and file.endswith('.npy')) and not '_time.npy' in file:
                        with z.open(file) as f:
                            data = np.load(f)
                            valid_values = data[~np.isnan(data)]
                            glucose_readings.extend(valid_values)

        if glucose_readings:
            mean_glucose = np.mean(glucose_readings)
            return (mean_glucose + 46.7) / 28.7
        return np.nan
    except Exception as e:
        print(f"Error processing PtID {ptid}: {str(e)}")
        return np.nan

# Initialize CGM estimated column
demo_df['cgm_estimated_hba1c'] = np.nan

print("\nCalculating CGM-based HbA1c for eligible subjects...")
eligible_pts = demo_df[
    (demo_df['avg_hba1c'].isna()) &
    (demo_df['has_cgm'])
]

print(f"Found {len(eligible_pts)} eligible subjects")

# Calculate and store CGM-based HbA1c
for _, row in eligible_pts.iterrows():
    ptid = row['PtID']
    cgm_hba1c = calculate_hba1c_from_cgm(ptid)
    if not pd.isna(cgm_hba1c):
        demo_df.loc[demo_df['PtID'] == ptid, 'cgm_estimated_hba1c'] = cgm_hba1c
        demo_df.loc[demo_df['PtID'] == ptid, 'avg_hba1c'] = cgm_hba1c
        demo_df.loc[demo_df['PtID'] == ptid, 'hba1c_Group'] = get_hba1c_group(cgm_hba1c)

# 4. Final Summary
print("\nFinal Summary:")
print("=" * 40)
print(f"Total subjects: {len(demo_df)}")
print(f"Subjects with lab HbA1c: {len(demo_df[demo_df['cgm_estimated_hba1c'].isna() & demo_df['avg_hba1c'].notna()])}")
print(f"Subjects with CGM-estimated HbA1c: {demo_df['cgm_estimated_hba1c'].notna().sum()}")
print(f"Subjects still missing HbA1c: {demo_df['avg_hba1c'].isna().sum()}")
print("\nHbA1c Group Distribution:")
print(demo_df['hba1c_Group'].value_counts(dropna=False))
print("\nSubjects missing CGM data:")
print(demo_df[~demo_df['has_cgm']]['PtID'].unique())

# Save results
demo_df.to_csv(output_path, index=False)
print(f"\nSaved to: {output_path}")

Loading demographic data...

Checking CGM participation...
Sample CGM files:
8024_0.npy
8010_0.npy
8027_0.npy
8009_0.npy
8033_0.npy

Found 51 subjects with CGM data

Calculating CGM-based HbA1c for eligible subjects...
Found 2 eligible subjects

Final Summary:
Total subjects: 57
Subjects with lab HbA1c: 55
Subjects with CGM-estimated HbA1c: 2
Subjects still missing HbA1c: 0

HbA1c Group Distribution:
hba1c_Group
<7%    57
Name: count, dtype: int64

Subjects missing CGM data:
[8001 8003 8007 8013 8016 8023]

Saved to: /content/drive/MyDrive/Digital Health/Healthy_data_time_trial/Hall_processed_demographic.csv
