In [None]:
# ============================================
# CELL 1: Setup & Imports
# ============================================


import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')



In [None]:
# ============================================
# CELL 2: Configuration
# ============================================

# Update these paths according to your Google Drive structure
BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/Final Project/ds/"

# Input file (your merged data)
INPUT_FILE = os.path.join(BASE_PATH, "master-demographic-ppg-data.csv")

# Output files
OUTPUT_4140X = os.path.join(BASE_PATH, "CAD-patients-data.csv")
OUTPUT_CONTROL = os.path.join(BASE_PATH, "No-heart-disease-patients-data.csv")
OUTPUT_FULL_WITH_FLAGS = os.path.join(BASE_PATH, "PPG_cad-flagged.csv")

print("üìÅ File Paths Configured:")
print(f"   Input: {INPUT_FILE}")
print(f"   Output 414.0x: {OUTPUT_4140X}")
print(f"   Output Control: {OUTPUT_CONTROL}")

üìÅ File Paths Configured:
   Input: /content/drive/MyDrive/Colab Notebooks/Final Project/ds/master-demographic-ppg-data.csv
   Output 414.0x: /content/drive/MyDrive/Colab Notebooks/Final Project/ds/CAD-patients-data.csv
   Output Control: /content/drive/MyDrive/Colab Notebooks/Final Project/ds/No-heart-disease-patients-data.csv


In [None]:
# ============================================
# CELL 3: Load and Validate Data
# ============================================

print("üìÇ Loading data...")
try:
    df = pd.read_csv(INPUT_FILE)
    print(f"‚úÖ Data loaded: {len(df)} patients")

    # Check required columns
    required_cols = ['patient_id', 'SUBJECT_ID', 'ICD9_CODES']
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        print(f"‚ùå Missing columns: {missing_cols}")
        print("Please check your input file")
        df = None
    else:
        print(f"üìä Dataset Info:")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {len(df.columns)} total")
        print(f"   Missing ICD9_CODES: {df['ICD9_CODES'].isna().sum()} patients")

except FileNotFoundError:
    print(f"‚ùå File not found: {INPUT_FILE}")
    print("Please update the file path in CELL 2")
    df = None

df.head()

üìÇ Loading data...
‚úÖ Data loaded: 5147 patients
üìä Dataset Info:
   Shape: (5147, 14)
   Columns: 14 total
   Missing ICD9_CODES: 0 patients


Unnamed: 0,patient_id,SUBJECT_ID,group,record_id,fs,AGE,GENDER,ICD9_CODES,ppg_min_3,ppg_min_4,ppg_min_5,ppg_min_6,ppg_min_7,ppg_min_8
0,p010023,10023,p01,3247161_0004,125,69.0,M,25050;2724;36201;4019;43310,0.4194 0.4194 0.4194 0.4194 0.4194 0.4194 0.41...,0.3343 0.3304 0.3265 0.3226 0.3177 0.3138 0.30...,0.2326 0.2346 0.2405 0.2522 0.2698 0.2942 0.32...,0.6940 0.6794 0.6637 0.6461 0.6276 0.6090 0.58...,0.3607 0.3607 0.3607 0.3607 0.3607 0.3597 0.35...,0.3900 0.4242 0.4594 0.4927 0.5230 0.5503 0.57...
1,p010045,10045,p01,3746698_0001,125,68.0,F,00845;0383;1179;2449;25000;2724;2761;2859;2867...,0.6393 0.6246 0.6119 0.5992 0.5846 0.5709 0.55...,0.2903 0.2893 0.2874 0.2864 0.2845 0.2835 0.28...,0.2278 0.2620 0.2991 0.3372 0.3754 0.4125 0.44...,0.2727 0.2864 0.3001 0.3148 0.3294 0.3431 0.35...,0.2825 0.2815 0.2796 0.2766 0.2747 0.2727 0.27...,0.2835 0.2835 0.2835 0.2845 0.2874 0.2923 0.29...
2,p010049,10049,p01,3456766_0017,125,63.0,M,2720;4019;41011;4111;41401;41402;5118;5849;996...,0.4790 0.4761 0.4741 0.4731 0.4721 0.4721 0.47...,0.6706 0.6686 0.6657 0.6628 0.6588 0.6540 0.64...,0.3959 0.3949 0.3939 0.3930 0.3900 0.3881 0.38...,0.5122 0.5122 0.5112 0.5103 0.5103 0.5093 0.50...,0.5152 0.5142 0.5122 0.5054 0.4966 0.4907 0.48...,0.5034 0.5024 0.5015 0.4995 0.4976 0.4956 0.49...
3,p010083,10083,p01,3294536_0001,125,82.0,F,04104;04182;2554;261;2738;2739;2761;2762;27652...,0.4194 0.4174 0.4154 0.4145 0.4125 0.4115 0.41...,0.4966 0.4927 0.4897 0.4868 0.4839 0.4809 0.47...,0.5376 0.5327 0.5298 0.5269 0.5239 0.5230 0.52...,0.5210 0.5210 0.5200 0.5200 0.5191 0.5181 0.51...,0.4467 0.4467 0.4457 0.4457 0.4467 0.4467 0.44...,0.4506 0.4409 0.4330 0.4262 0.4370 0.4575 0.46...
4,p010124,10124,p01,3127404_0007,125,84.0,F,03811;2449;25000;25092;2767;27800;2851;32723;4...,0.3402 0.3412 0.3412 0.3421 0.3431 0.3431 0.34...,0.4653 0.4428 0.4242 0.4076 0.3910 0.3754 0.36...,0.5181 0.5318 0.5415 0.5494 0.5543 0.5572 0.55...,0.7595 0.7615 0.7615 0.7586 0.7537 0.7468 0.73...,0.6354 0.6246 0.6109 0.5973 0.5826 0.5660 0.54...,0.2708 0.2717 0.2737 0.2757 0.2776 0.2796 0.28...


In [None]:
# ============================================
# CELL 4: Define Extraction Functions
# ============================================

def has_code_4140x(icd9_string):
    """
    Check if ICD-9 string contains code starting with 414.0
    These are SPECIFICALLY coronary atherosclerosis codes
    """
    if pd.isna(icd9_string):
        return False

    codes = str(icd9_string).split(';')

    for code in codes:
        code = code.strip()
        # Check if code starts with 414.0 (coronary atherosclerosis)
        if code.startswith('414.0') or code == '4140' or code.startswith('4140'):
            return True

    return False

def extract_code_4140x_only(icd9_string):
    """
    Extract ONLY the 414.0x codes from ICD-9 string
    """
    if pd.isna(icd9_string):
        return ""

    codes = str(icd9_string).split(';')
    code_4140x_list = []

    for code in codes:
        code = code.strip()
        # Include only 414.0x codes
        if code.startswith('414.0') or code == '4140' or code.startswith('4140'):
            code_4140x_list.append(code)

    return ';'.join(code_4140x_list)

def has_any_heart_disease(icd9_string):
    """
    Check for ANY heart-related ICD-9 codes
    Used to create clean control group
    """
    if pd.isna(icd9_string):
        return False

    codes = str(icd9_string).split(';')

    # Comprehensive list of heart-related ICD-9 prefixes
    heart_prefixes = [
        # Ischemic heart disease (410-414)
        '410', '411', '412', '413', '414',
        # Heart failure (428)
        '428',
        # Hypertensive disease (401-405)
        '401', '402', '403', '404', '405',
        # Other heart disease (420-429)
        '420', '421', '422', '423', '424', '425', '426', '427', '429',
        # Cerebrovascular disease (430-438)
        '430', '431', '432', '433', '434', '435', '436', '437', '438',
        # Peripheral vascular disease (440-448)
        '440', '441', '442', '443', '444', '445', '446', '447', '448'
    ]

    for code in codes:
        code = code.strip()
        for prefix in heart_prefixes:
            if code.startswith(prefix):
                return True

    return False

print("‚úÖ Functions defined for 414.0x extraction")

‚úÖ Functions defined for 414.0x extraction


In [None]:
# ============================================
# CELL 5: Apply Filters and Create Groups
# ============================================

if df is not None:
    print("\n" + "="*60)
    print("üîç APPLYING 414.0x FILTERS")
    print("="*60)

    # Step 1: Add 414.0x flag
    df['has_4140x'] = df['ICD9_CODES'].apply(has_code_4140x)

    # Step 2: Extract specific 414.0x codes
    df['4140x_codes'] = df['ICD9_CODES'].apply(extract_code_4140x_only)

    # Step 3: Create CAD group (414.0x patients)
    cad_4140x_df = df[df['has_4140x']].copy()

    # Step 4: Add heart disease flag for control group
    df['has_any_heart_disease'] = df['ICD9_CODES'].apply(has_any_heart_disease)

    # Step 5: Create control group (NO heart disease codes)
    control_df = df[~df['has_any_heart_disease']].copy()

    # Step 6: Create "other heart disease" group (for reference)
    other_heart_df = df[df['has_any_heart_disease'] & ~df['has_4140x']].copy()

    print("‚úÖ Filters applied successfully!")
    print(f"\nüìä GROUP SIZES:")
    print(f"   CAD (414.0x) patients: {len(cad_4140x_df)}")
    print(f"   Healthy controls (no heart codes): {len(control_df)}")
    print(f"   Other heart disease patients: {len(other_heart_df)}")
    print(f"   Total patients: {len(df)}")
else:
    print("‚ùå Cannot proceed - data not loaded properly")


üîç APPLYING 414.0x FILTERS
‚úÖ Filters applied successfully!

üìä GROUP SIZES:
   CAD (414.0x) patients: 1508
   Healthy controls (no heart codes): 799
   Other heart disease patients: 2840
   Total patients: 5147


In [None]:
# ============================================
# NEW CELL: Add ICD-9 Code Meanings from CSV
# ============================================

# 1. Load the diagnostic descriptions
diagnoses_lookup = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Final Project/ALL DATA SETS/matched/D_ICD_DIAGNOSES.csv')

# 2. Create a lookup dictionary (removing dots for consistent matching)
# We ensure the key is a string and matches the format in the notebook
code_to_meaning = dict(zip(diagnoses_lookup['ICD9_CODE'].astype(str),
                           diagnoses_lookup['LONG_TITLE']))

def get_icd9_descriptions(codes_str):
    """
    Looks up meanings for a semicolon-separated string of codes
    """
    if not codes_str or pd.isna(codes_str):
        return ""

    # Split the codes (e.g., "414.01;414.00")
    codes = str(codes_str).split(';')
    meanings = []

    for code in codes:
        # Clean the code: remove decimals and whitespace to match CSV format
        clean_code = code.replace('.', '').strip()
        meaning = code_to_meaning.get(clean_code, "Description not found")
        meanings.append(f"{code}: {meaning}")

    return ' | '.join(meanings)

# 3. Apply the mapping to the CAD dataframe
if 'cad_4140x_df' in locals() and not cad_4140x_df.empty:
    cad_4140x_df['code_meanings'] = cad_4140x_df['4140x_codes'].apply(get_icd9_descriptions)
    print("‚úÖ Added 'code_meanings' column to cad_4140x_df")

    # Display sample results
    display(cad_4140x_df[['patient_id', '4140x_codes', 'code_meanings']].head())
else:
    print("‚ö†Ô∏è cad_4140x_df not found. Please run previous cells first.")

‚úÖ Added 'code_meanings' column to cad_4140x_df


Unnamed: 0,patient_id,4140x_codes,code_meanings
1,p010045,41401,41401: Coronary atherosclerosis of native coro...
2,p010049,41401;41402,41401: Coronary atherosclerosis of native coro...
5,p010134,41400,41400: Coronary atherosclerosis of unspecified...
6,p010152,41401,41401: Coronary atherosclerosis of native coro...
7,p010188,41401,41401: Coronary atherosclerosis of native coro...


In [None]:
# ============================================
# CELL 6: Analyze Specific 414.0x Codes
# ============================================

if df is not None and len(cad_4140x_df) > 0:
    print("\n" + "="*60)
    print("üß¨ SPECIFIC 414.0x CODE BREAKDOWN")
    print("="*60)

    # Count each specific 414.0x code
    from collections import Counter

    all_4140x_codes = []
    for codes in cad_4140x_df['4140x_codes']:
        if codes:
            all_4140x_codes.extend(codes.split(';'))

    code_counts = Counter(all_4140x_codes)

    # Clinical descriptions of 414.0x codes
    code_descriptions = {
        '414.0': 'Coronary atherosclerosis, unspecified',
        '414.00': 'Coronary atherosclerosis of unspecified type of vessel',
        '414.01': 'Coronary atherosclerosis of native coronary artery',
        '414.02': 'Coronary atherosclerosis of autologous vein bypass graft',
        '414.03': 'Coronary atherosclerosis of nonautologous biological bypass graft',
        '414.04': 'Coronary atherosclerosis of artery bypass graft',
        '414.05': 'Coronary atherosclerosis of unspecified bypass graft',
        '414.0': 'Coronary atherosclerosis (general)',
        '4140': 'Coronary atherosclerosis (ICD-9 without decimal)'
    }

    print("\nüìã CODE DISTRIBUTION:")
    total_codes = sum(code_counts.values())
    for code, count in sorted(code_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_codes) * 100
        description = code_descriptions.get(code, "Unknown specific type")
        print(f"   {code}: {count} patients ({percentage:.1f}%) - {description}")

    # Patients with multiple 414.0x codes
    patients_with_multiple = cad_4140x_df['4140x_codes'].apply(lambda x: len(x.split(';')) if x else 0)
    multi_code_patients = (patients_with_multiple > 1).sum()

    print(f"\nüìà ADDITIONAL STATS:")
    print(f"   Patients with multiple 414.0x codes: {multi_code_patients}")
    print(f"   Average codes per patient: {patients_with_multiple.mean():.2f}")
else:
    print("No 414.0x patients found to analyze")


üß¨ SPECIFIC 414.0x CODE BREAKDOWN

üìã CODE DISTRIBUTION:
   41401: 1359 patients (80.3%) - Unknown specific type
   41400: 259 patients (15.3%) - Unknown specific type
   41402: 67 patients (4.0%) - Unknown specific type
   41404: 5 patients (0.3%) - Unknown specific type
   41405: 3 patients (0.2%) - Unknown specific type

üìà ADDITIONAL STATS:
   Patients with multiple 414.0x codes: 166
   Average codes per patient: 1.12


In [None]:
# ============================================
# CELL 7: Demographic Comparison
# ============================================

if df is not None and len(cad_4140x_df) > 0 and len(control_df) > 0:
    print("\n" + "="*60)
    print("üë• DEMOGRAPHIC COMPARISON")
    print("="*60)

    # Age comparison
    if 'AGE' in df.columns:
        print("\nüìä AGE DISTRIBUTION:")
        print(f"   CAD (414.0x) patients:")
        print(f"     Mean: {cad_4140x_df['AGE'].mean():.1f} ¬± {cad_4140x_df['AGE'].std():.1f}")
        print(f"     Range: {cad_4140x_df['AGE'].min():.0f} - {cad_4140x_df['AGE'].max():.0f}")

        print(f"\n   Control patients:")
        print(f"     Mean: {control_df['AGE'].mean():.1f} ¬± {control_df['AGE'].std():.1f}")
        print(f"     Range: {control_df['AGE'].min():.0f} - {control_df['AGE'].max():.0f}")

    # Gender comparison
    if 'GENDER' in df.columns:
        print("\nüöª GENDER DISTRIBUTION:")

        print(f"   CAD (414.0x) patients:")
        cad_gender = cad_4140x_df['GENDER'].value_counts()
        for gender, count in cad_gender.items():
            pct = (count / len(cad_4140x_df)) * 100
            print(f"     {gender}: {count} ({pct:.1f}%)")

        print(f"\n   Control patients:")
        control_gender = control_df['GENDER'].value_counts()
        for gender, count in control_gender.items():
            pct = (count / len(control_df)) * 100
            print(f"     {gender}: {count} ({pct:.1f}%)")

    # Age group comparison
    if 'AGE' in df.columns:
        print(f"\nüéØ AGE GROUPS:")

        def get_age_group(age):
            if age < 50: return "<50"
            elif age < 60: return "50-59"
            elif age < 70: return "60-69"
            elif age < 80: return "70-79"
            else: return "80+"

        cad_4140x_df['age_group'] = cad_4140x_df['AGE'].apply(get_age_group)
        control_df['age_group'] = control_df['AGE'].apply(get_age_group)

        print(f"   CAD (414.0x) age groups:")
        for group, count in cad_4140x_df['age_group'].value_counts().sort_index().items():
            pct = (count / len(cad_4140x_df)) * 100
            print(f"     {group}: {count} ({pct:.1f}%)")
else:
    print("Insufficient data for demographic analysis")


üë• DEMOGRAPHIC COMPARISON

üìä AGE DISTRIBUTION:
   CAD (414.0x) patients:
     Mean: 71.3 ¬± 12.4
     Range: 26 - 90

   Control patients:
     Mean: 48.1 ¬± 17.1
     Range: 0 - 90

üöª GENDER DISTRIBUTION:
   CAD (414.0x) patients:
     M: 957 (63.5%)
     F: 551 (36.5%)

   Control patients:
     M: 403 (50.4%)
     F: 396 (49.6%)

üéØ AGE GROUPS:
   CAD (414.0x) age groups:
     50-59: 202 (13.4%)
     60-69: 384 (25.5%)
     70-79: 393 (26.1%)
     80+: 461 (30.6%)
     <50: 68 (4.5%)


In [None]:
# ============================================
# CELL 8: Save Datasets to Files
# ============================================

if df is not None:
    print("\n" + "="*60)
    print("üíæ SAVING DATASETS")
    print("="*60)

    # 1. Save 414.0x patients (Coronary Atherosclerosis)
    cad_4140x_df.to_csv(OUTPUT_4140X, index=False)
    print(f"‚úÖ CAD (414.0x) patients saved: {len(cad_4140x_df)} patients")
    print(f"   File: {OUTPUT_4140X}")

    # 2. Save control group
    control_df.to_csv(OUTPUT_CONTROL, index=False)
    print(f"‚úÖ Control group saved: {len(control_df)} patients")
    print(f"   File: {OUTPUT_CONTROL}")

    # 3. Save full dataset with flags
    df.to_csv(OUTPUT_FULL_WITH_FLAGS, index=False)
    print(f"‚úÖ Full dataset with flags saved: {len(df)} patients")
    print(f"   File: {OUTPUT_FULL_WITH_FLAGS}")

    # 4. Optional: Save other heart disease patients
    if len(other_heart_df) > 0:
        output_other = os.path.join(BASE_PATH, "PPG_OTHER_HEART_DISEASE.csv")
        other_heart_df.to_csv(output_other, index=False)
        print(f"‚úÖ Other heart disease patients saved: {len(other_heart_df)} patients")
        print(f"   File: {output_other}")

    print("\nüìÅ ALL FILES SAVED SUCCESSFULLY!")
else:
    print("‚ùå No data to save")


üíæ SAVING DATASETS
‚úÖ CAD (414.0x) patients saved: 1508 patients
   File: /content/drive/MyDrive/Colab Notebooks/Final Project/ds/CAD-patients-data.csv
‚úÖ Control group saved: 799 patients
   File: /content/drive/MyDrive/Colab Notebooks/Final Project/ds/No-heart-disease-patients-data.csv
‚úÖ Full dataset with flags saved: 5147 patients
   File: /content/drive/MyDrive/Colab Notebooks/Final Project/ds/PPG_cad-flagged.csv
‚úÖ Other heart disease patients saved: 2840 patients
   File: /content/drive/MyDrive/Colab Notebooks/Final Project/ds/PPG_OTHER_HEART_DISEASE.csv

üìÅ ALL FILES SAVED SUCCESSFULLY!


In [None]:
# ============================================
# CELL 9: Create Balanced ML-Ready Dataset (80/20 Split)
# ============================================
from sklearn.model_selection import train_test_split

if df is not None and len(cad_4140x_df) > 0 and len(control_df) > 0:
    print("\n" + "="*60)
    print("ü§ñ CREATING BALANCED & SPLIT ML DATASET")
    print("="*60)

    # 1. Create binary labels
    cad_4140x_df['label'] = 1  # CAD positive
    control_df['label'] = 0    # Healthy control

    # 2. Balance the dataset (Equal amount of both classes)
    # We take the size of the smaller group to determine the count for both
    n_samples = min(len(cad_4140x_df), len(control_df))

    balanced_cad = cad_4140x_df.sample(n=n_samples, random_state=42)
    balanced_control = control_df.sample(n=n_samples, random_state=42)

    # Combine into one balanced dataset
    ml_dataset_balanced = pd.concat([balanced_cad, balanced_control], ignore_index=True)

    print(f"‚öñÔ∏è Dataset Balanced: {n_samples} patients per class ({n_samples * 2} total)")

    # 3. Stratified Train/Test Split (80% Train, 20% Test)
    # Stratify=y ensures both sets have the same ratio of CAD vs Control
    train_df, test_df = train_test_split(
        ml_dataset_balanced,
        test_size=0.20,
        random_state=42,
        stratify=ml_dataset_balanced['label']
    )

    # 4. Save the datasets
    output_train = os.path.join(BASE_PATH, "PPG_train_data.csv")
    output_test = os.path.join(BASE_PATH, "PPG_test_data.csv")

    train_df.to_csv(output_train, index=False)
    test_df.to_csv(output_test, index=False)

    print(f"‚úÖ Training set saved: {len(train_df)} patients -> {output_train}")
    print(f"‚úÖ Testing set saved: {len(test_df)} patients -> {output_test}")

    # Validation Statistics
    print(f"\nüìä DATASET DISTRIBUTION:")
    print(f"   Total Balanced Samples: {len(ml_dataset_balanced)}")
    print(f"   Training Samples (80%): {len(train_df)} (CAD: {sum(train_df['label']==1)}, Control: {sum(train_df['label']==0)})")
    print(f"   Testing Samples (20%):  {len(test_df)} (CAD: {sum(test_df['label']==1)}, Control: {sum(test_df['label']==0)})")

    # Show sample
    print(f"\nüß™ SAMPLE OF BALANCED TRAINING DATA:")
    sample_cols = ['patient_id', 'AGE', 'GENDER', 'label']
    existing_cols = [col for col in sample_cols if col in train_df.columns]
    print(train_df[existing_cols].head().to_string())

else:
    print("‚ùå Insufficient data to create ML dataset")


ü§ñ CREATING BALANCED & SPLIT ML DATASET
‚öñÔ∏è Dataset Balanced: 799 patients per class (1598 total)
‚úÖ Training set saved: 1278 patients -> /content/drive/MyDrive/Colab Notebooks/Final Project/ds/PPG_train_data.csv
‚úÖ Testing set saved: 320 patients -> /content/drive/MyDrive/Colab Notebooks/Final Project/ds/PPG_test_data.csv

üìä DATASET DISTRIBUTION:
   Total Balanced Samples: 1598
   Training Samples (80%): 1278 (CAD: 639, Control: 639)
   Testing Samples (20%):  320 (CAD: 160, Control: 160)

üß™ SAMPLE OF BALANCED TRAINING DATA:
     patient_id   AGE GENDER  label
1331    p047874  56.0      F      0
747     p056697  80.0      M      1
1397    p067227  60.0      M      0
1407    p093566  33.0      F      0
627     p087716  52.0      M      1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ============================================
# CELL 10: Generate Final Summary
# ============================================

print("\n" + "="*60)
print("üìã EXTRACTION PIPELINE SUMMARY")
print("="*60)

if df is not None:
    print(f"\nüéØ PRIMARY FINDINGS:")
    print(f"   1. CAD (414.0x) patients: {len(cad_4140x_df) if 'cad_4140x_df' in locals() else 'N/A'}")
    print(f"   2. Healthy controls: {len(control_df) if 'control_df' in locals() else 'N/A'}")
    print(f"   3. Dataset balance: CAD:Control ‚âà 1:{len(control_df)/len(cad_4140x_df):.1f}" if 'cad_4140x_df' in locals() and len(cad_4140x_df) > 0 else "")

    print(f"\nüìÅ FILES CREATED:")
    print(f"   1. {OUTPUT_4140X} - Coronary atherosclerosis patients")
    print(f"   2. {OUTPUT_CONTROL} - Healthy control patients")
    print(f"   3. {OUTPUT_FULL_WITH_FLAGS} - Full dataset with CAD flags")

    if 'output_ml' in locals():
        print(f"   4. {output_ml} - ML-ready labeled dataset")

    print(f"\nüöÄ NEXT STEPS:")
    print(f"   1. Load the ML dataset for feature extraction")
    print(f"   2. Process PPG signals (ppg_min_3 to ppg_min_8)")
    print(f"   3. Extract cardiac features from PPG")
    print(f"   4. Train CAD vs Control classification model")

    print(f"\n‚ö†Ô∏è  IMPORTANT MEDICAL DISCLAIMER:")
    print(f"   ‚Ä¢ 414.0x codes = Coronary atherosclerosis (CAD)")
    print(f"   ‚Ä¢ This is BILLING CODE data, not confirmed diagnosis")
    print(f"   ‚Ä¢ Controls have NO heart codes but may have undiagnosed CAD")
    print(f"   ‚Ä¢ For clinical use, validation with actual diagnoses is required")

    print(f"\n‚úÖ PIPELINE COMPLETE! You have successfully extracted:")
    print(f"   üî¥ {len(cad_4140x_df) if 'cad_4140x_df' in locals() else 0} CAD patients (414.0x)")
    print(f"   üü¢ {len(control_df) if 'control_df' in locals() else 0} Control patients")
    print(f"   üìä Total labeled: {(len(cad_4140x_df) + len(control_df)) if 'cad_4140x_df' in locals() and 'control_df' in locals() else 0}")

else:
    print("‚ùå Pipeline failed - check file paths and data format")


üìã EXTRACTION PIPELINE SUMMARY

üéØ PRIMARY FINDINGS:
   1. CAD (414.0x) patients: 1508
   2. Healthy controls: 799
   3. Dataset balance: CAD:Control ‚âà 1:0.5

üìÅ FILES CREATED:
   1. /content/drive/MyDrive/Colab Notebooks/Final Project/ds/CAD-patients-data.csv - Coronary atherosclerosis patients
   2. /content/drive/MyDrive/Colab Notebooks/Final Project/ds/No-heart-disease-patients-data.csv - Healthy control patients
   3. /content/drive/MyDrive/Colab Notebooks/Final Project/ds/PPG_cad-flagged.csv - Full dataset with CAD flags
   4. /content/drive/MyDrive/Colab Notebooks/Final Project/ds/PPG_ml&control_data.csv - ML-ready labeled dataset

üöÄ NEXT STEPS:
   1. Load the ML dataset for feature extraction
   2. Process PPG signals (ppg_min_3 to ppg_min_8)
   3. Extract cardiac features from PPG
   4. Train CAD vs Control classification model

‚ö†Ô∏è  IMPORTANT MEDICAL DISCLAIMER:
   ‚Ä¢ 414.0x codes = Coronary atherosclerosis (CAD)
   ‚Ä¢ This is BILLING CODE data, not confirmed

In [None]:
x = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Final Project/ds/PPG_train_data.csv')
x.head()

Unnamed: 0,patient_id,SUBJECT_ID,group,record_id,fs,AGE,GENDER,ICD9_CODES,ppg_min_3,ppg_min_4,ppg_min_5,ppg_min_6,ppg_min_7,ppg_min_8,has_4140x,4140x_codes,code_meanings,age_group,label,has_any_heart_disease
0,p047874,47874,p04,3602666_0004,125,56.0,F,2449;25000;2720;2762;27652;496;51919;53081;584...,1.5318 1.5288 1.5249 1.5220 1.5181 1.5152 1.51...,2.1632 2.1427 2.1212 2.1007 2.0792 2.0606 2.04...,1.7370 1.7234 1.7107 1.7009 1.6931 1.6862 1.68...,1.5112 1.5064 1.5044 1.5015 1.4985 1.4956 1.49...,2.0469 2.0411 2.0342 2.0254 2.0147 2.0020 1.98...,1.8270 1.8240 1.8192 1.8143 1.8113 1.8104 1.80...,False,,,50-59,0,False
1,p056697,56697,p05,3131252_0022,125,80.0,M,25000;2724;32723;41401;4168;4241;42731;4275;42...,0.3333 0.3294 0.3294 0.3255 0.3216 0.3216 0.31...,0.2980 0.2941 0.2902 0.2863 0.2863 0.2824 0.28...,0.2471 0.2431 0.2431 0.2431 0.2431 0.2392 0.23...,0.3059 0.3020 0.3020 0.2980 0.2980 0.2941 0.29...,0.3176 0.3176 0.3216 0.3255 0.3333 0.3412 0.35...,0.4471 0.4392 0.4353 0.4275 0.4235 0.4196 0.41...,True,41401.0,41401: Coronary atherosclerosis of native coro...,80+,1,
2,p067227,67227,p06,3540034_0004,125,60.0,M,03849;07054;1550;2859;3004;51189;5609;5715;576...,1.4086 1.3920 1.3744 1.3548 1.3343 1.3128 1.29...,2.6403 2.5718 2.4995 2.4145 2.3226 2.2297 2.14...,0.9717 0.9550 0.9384 0.9277 0.9189 0.9032 0.88...,1.8035 1.8025 1.8006 1.7986 1.7957 1.7928 1.78...,2.0010 2.0010 2.0010 2.0010 2.0010 2.0010 2.00...,2.6989 2.6940 2.6891 2.6843 2.6794 2.6764 2.67...,False,,,60-69,0,False
3,p093566,93566,p09,3110153_0005,125,33.0,F,03812;1173;2768;2851;2866;2875;34830;34982;453...,2.2962 2.2952 2.2942 2.2923 2.2903 2.2893 2.28...,2.9687 2.9912 3.0137 3.0450 3.0811 3.1193 3.15...,2.1730 2.1642 2.1554 2.1427 2.1251 2.1105 2.09...,2.0420 2.0880 2.1300 2.1662 2.1906 2.2063 2.21...,1.6422 1.6461 1.6500 1.6559 1.6637 1.6716 1.67...,1.6393 1.5924 1.5679 1.5552 1.5484 1.5445 1.54...,False,,,<50,0,False
4,p087716,87716,p08,3039523_0004,125,52.0,M,2724;2749;2859;3051;4019;41401;45829;53081,0.2902 0.2902 0.2863 0.2863 0.2902 0.2941 0.30...,0.3451 0.3412 0.3373 0.3333 0.3294 0.3255 0.32...,0.2510 0.2510 0.2510 0.2549 0.2549 0.2549 0.25...,0.2667 0.2667 0.2627 0.2667 0.2667 0.2706 0.27...,0.3137 0.3098 0.3059 0.3059 0.3020 0.2980 0.29...,0.4941 0.4784 0.4627 0.4471 0.4353 0.4196 0.40...,True,41401.0,41401: Coronary atherosclerosis of native coro...,50-59,1,


In [None]:
x.tail()

Unnamed: 0,patient_id,SUBJECT_ID,group,record_id,fs,AGE,GENDER,ICD9_CODES,ppg_min_3,ppg_min_4,ppg_min_5,ppg_min_6,ppg_min_7,ppg_min_8,has_4140x,4140x_codes,code_meanings,age_group,label,has_any_heart_disease
1273,p090544,90544,p09,3134946_0007,125,61.0,F,2449;2760;41401;5990;80222;8024;8052;81301;813...,0.3922 0.3922 0.3922 0.3922 0.3922 0.3922 0.39...,0.4275 0.4275 0.4235 0.4196 0.4157 0.4118 0.41...,0.4118 0.4118 0.4078 0.4078 0.4078 0.4078 0.40...,0.5020 0.4980 0.4980 0.4941 0.4902 0.4863 0.48...,0.5765 0.5686 0.5608 0.5569 0.5529 0.5490 0.54...,0.4196 0.4196 0.4196 0.4196 0.4196 0.4196 0.41...,True,41401.0,41401: Coronary atherosclerosis of native coro...,60-69,1,
1274,p092055,92055,p09,3773563_0003,125,52.0,F,2252;2449;29680;34540;3485;7242;7840,2.0166 2.0371 2.0577 2.0762 2.0929 2.1095 2.12...,1.7370 1.7410 1.7498 1.7566 1.7605 1.7625 1.76...,1.7048 1.7038 1.7058 1.7097 1.7204 1.7380 1.75...,1.9717 1.9580 1.9453 1.9384 1.9355 1.9335 1.92...,1.9071 1.8974 1.8925 1.8837 1.8671 1.8465 1.82...,1.8778 1.8944 1.9062 1.9120 1.9150 1.9159 1.91...,False,,,50-59,0,False
1275,p070933,70933,p07,3572608_0001,125,67.0,F,2449;2851;4019;41071;4111;41401;4280;53081,0.6979 0.6921 0.6852 0.6774 0.6686 0.6569 0.64...,0.6139 0.6012 0.5875 0.5728 0.5582 0.5425 0.52...,0.4526 0.4477 0.4438 0.4399 0.4370 0.4340 0.43...,0.7195 0.7273 0.7322 0.7351 0.7361 0.7341 0.73...,0.2815 0.2805 0.2796 0.2786 0.2776 0.2776 0.27...,0.3939 0.4164 0.4438 0.4731 0.5034 0.5308 0.55...,True,41401.0,41401: Coronary atherosclerosis of native coro...,60-69,1,
1276,p090902,90902,p09,3413689_0006,125,56.0,M,25000;2761;30391;3484;45182;53081;85221;E8859;...,1.9286 1.9052 1.8817 1.8583 1.8368 1.8182 1.80...,2.3539 2.3255 2.2972 2.2708 2.2454 2.2160 2.18...,2.0714 2.0538 2.0371 2.0225 2.0088 1.9961 1.98...,1.9814 2.0244 2.0762 2.1359 2.1994 2.2649 2.32...,3.5396 3.5161 3.4819 3.4379 3.3822 3.3138 3.23...,1.1417 1.1388 1.1369 1.1349 1.1329 1.1320 1.13...,False,,,50-59,0,False
1277,p085401,85401,p08,3163473_0086,125,36.0,M,04149;2760;27669;30500;34461;51851;56481;6073;...,0.7137 0.7176 0.7098 0.6980 0.6824 0.6588 0.63...,0.3176 0.3176 0.3176 0.3137 0.3137 0.3098 0.30...,0.3294 0.3255 0.3216 0.3176 0.3137 0.3176 0.32...,0.2980 0.2941 0.2902 0.2863 0.2824 0.2824 0.28...,0.2784 0.2745 0.2706 0.2667 0.2627 0.2627 0.26...,0.4471 0.4863 0.5333 0.5765 0.6157 0.6549 0.68...,False,,,<50,0,False


In [None]:
y = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Final Project/ds/No-heart-disease-patients-data.csv')

In [None]:
y.head()

Unnamed: 0,patient_id,SUBJECT_ID,group,record_id,fs,AGE,GENDER,ICD9_CODES,ppg_min_3,ppg_min_4,ppg_min_5,ppg_min_6,ppg_min_7,ppg_min_8,has_4140x,4140x_codes,has_any_heart_disease,age_group
0,p010842,10842,p01,3523295_0005,125,55.0,M,07070;25000;2800;2869;2875;30393;45620;4568;57...,0.3177 0.3128 0.3089 0.3040 0.2991 0.2942 0.28...,0.2561 0.2542 0.2542 0.2561 0.2639 0.2708 0.27...,0.2835 0.2786 0.2737 0.2698 0.2649 0.2610 0.25...,0.3812 0.3744 0.3685 0.3656 0.3627 0.3607 0.35...,0.7429 0.7341 0.7234 0.7126 0.7009 0.6882 0.67...,0.2747 0.2688 0.2639 0.2610 0.2571 0.2542 0.25...,False,,False,50-59
1,p012175,12175,p01,3167090_0006,125,39.0,F,2809;2858;53550;5533;5789,0.3882 0.4392 0.4902 0.5412 0.5922 0.6353 0.67...,0.2549 0.2667 0.2824 0.3098 0.3451 0.3882 0.43...,0.3255 0.3137 0.3059 0.2980 0.2980 0.2980 0.29...,0.7059 0.7059 0.7059 0.6980 0.6863 0.6745 0.65...,0.6824 0.6667 0.6471 0.6314 0.6078 0.5882 0.56...,0.3176 0.3176 0.3176 0.3176 0.3137 0.3137 0.30...,False,,False,<50
2,p012182,12182,p01,3916495_0001,125,35.0,M,07032;1550;2859;2880;34830;5738,0.6393 0.6774 0.7067 0.7253 0.7341 0.7341 0.72...,0.3284 0.3333 0.3402 0.3480 0.3558 0.3636 0.37...,0.7439 0.7586 0.7654 0.7674 0.7703 0.7752 0.78...,0.5357 0.5112 0.4878 0.4643 0.4418 0.4194 0.39...,0.3871 0.3920 0.3959 0.3978 0.3978 0.3959 0.39...,0.8025 0.7996 0.7918 0.7810 0.7674 0.7517 0.73...,False,,False,<50
3,p012344,12344,p01,3256190_0005,125,72.0,M,04104;1209;24900;2639;2849;2859;45620;55320;56...,2.8162 2.8025 2.7840 2.7595 2.7273 2.6882 2.64...,2.1808 2.1271 2.0772 2.0362 2.0010 1.9668 1.93...,1.3480 1.3275 1.3079 1.2893 1.2708 1.2581 1.24...,2.7566 2.7781 2.8065 2.8368 2.8671 2.8954 2.92...,2.8201 2.8299 2.8338 2.8309 2.8201 2.8035 2.78...,1.8407 1.8133 1.7840 1.7537 1.7234 1.6931 1.66...,False,,False,70-79
4,p012388,12388,p01,3328892_0021,125,51.0,F,29570;41511;486;53081;V1251,0.5725 0.5843 0.5882 0.5882 0.5843 0.5765 0.56...,0.3569 0.3804 0.4078 0.4353 0.4588 0.4784 0.49...,0.4510 0.4471 0.4471 0.4431 0.4431 0.4392 0.43...,0.4627 0.4588 0.4549 0.4549 0.4549 0.4510 0.44...,0.5608 0.5569 0.5490 0.5412 0.5373 0.5373 0.53...,0.5843 0.5765 0.5725 0.5647 0.5608 0.5569 0.55...,False,,False,50-59


In [None]:
y.tail()

Unnamed: 0,patient_id,SUBJECT_ID,group,record_id,fs,AGE,GENDER,ICD9_CODES,ppg_min_3,ppg_min_4,ppg_min_5,ppg_min_6,ppg_min_7,ppg_min_8,has_4140x,4140x_codes,has_any_heart_disease,age_group
577,p089782,89782,p08,3026077_0004,125,51.0,M,1505;25000;2851;30000;49390;53087;V153;V1582;V...,0.2863 0.2824 0.2824 0.2824 0.2784 0.2745 0.27...,0.2471 0.2431 0.2431 0.2431 0.2431 0.2471 0.25...,0.4039 0.3961 0.3843 0.3765 0.3686 0.3608 0.35...,0.3294 0.3686 0.4118 0.4588 0.5137 0.5725 0.62...,0.6706 0.6471 0.6196 0.5922 0.5647 0.5412 0.51...,0.2980 0.2941 0.2941 0.2902 0.2863 0.2863 0.28...,False,,False,50-59
578,p089792,89792,p08,3674666_0026,125,53.0,M,07071;30393;45621;5531;5601;56723;5712;5715;57...,0.5569 0.5490 0.5412 0.5333 0.5255 0.5137 0.50...,0.2706 0.2706 0.2706 0.2667 0.2667 0.2667 0.26...,0.4314 0.4275 0.4196 0.4118 0.4078 0.4000 0.39...,0.2627 0.2627 0.2627 0.2627 0.2627 0.2627 0.26...,0.2706 0.2706 0.2706 0.2706 0.2706 0.2706 0.27...,0.2157 0.2157 0.2157 0.2157 0.2157 0.2235 0.23...,False,,False,50-59
579,p089849,89849,p08,3518367_0011,125,26.0,F,1121;2762;27652;2851;2859;33829;34690;4589;486...,0.7302 0.7155 0.6970 0.6764 0.6530 0.6276 0.60...,0.4203 0.4233 0.4262 0.4262 0.4262 0.4242 0.42...,0.3167 0.3099 0.3030 0.2972 0.2903 0.2835 0.27...,0.2845 0.2805 0.2786 0.2766 0.2737 0.2708 0.26...,0.3969 0.3998 0.4018 0.4027 0.4037 0.4037 0.40...,0.2659 0.2620 0.2590 0.2581 0.2581 0.2620 0.27...,False,,False,<50
580,p089953,89953,p08,3478068_0012,125,42.0,F,1703;27669;30000;4779;72401,0.2863 0.2824 0.2784 0.2784 0.2824 0.2863 0.29...,0.2902 0.2980 0.3059 0.3137 0.3255 0.3333 0.34...,0.3882 0.3843 0.3843 0.3843 0.3843 0.3843 0.39...,0.3765 0.3922 0.4118 0.4353 0.4667 0.5020 0.53...,0.5020 0.5373 0.5725 0.6078 0.6431 0.6745 0.70...,0.2824 0.2824 0.2824 0.2824 0.2902 0.2941 0.30...,False,,False,<50
581,p089965,89965,p08,3488119_0008,125,52.0,M,3004;4589;53081;5716;5761;7245;V1209,1.3333 1.3284 1.3324 1.3460 1.3685 1.4018 1.44...,3.3148 3.4966 3.6452 3.7400 3.7820 3.7849 3.76...,1.4790 1.4633 1.4497 1.4379 1.4291 1.4233 1.42...,3.6041 3.6070 3.6197 3.6383 3.6618 3.6862 3.70...,1.0117 1.0108 1.0117 1.0108 1.0078 1.0039 1.00...,2.9609 2.9326 2.9022 2.8719 2.8416 2.8143 2.78...,False,,False,50-59


In [None]:
z = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Final Project/ds/PPG_ml&control_data.csv')

In [None]:
z.head()

Unnamed: 0,patient_id,SUBJECT_ID,group,record_id,fs,AGE,GENDER,ICD9_CODES,ppg_min_3,ppg_min_4,...,ppg_min_6,ppg_min_7,ppg_min_8,has_4140x,4140x_codes,code_meanings,age_group,label,has_any_heart_disease,dataset_source
0,p010045,10045,p01,3746698_0001,125,68.0,F,00845;0383;1179;2449;25000;2724;2761;2859;2867...,0.6393 0.6246 0.6119 0.5992 0.5846 0.5709 0.55...,0.2903 0.2893 0.2874 0.2864 0.2845 0.2835 0.28...,...,0.2727 0.2864 0.3001 0.3148 0.3294 0.3431 0.35...,0.2825 0.2815 0.2796 0.2766 0.2747 0.2727 0.27...,0.2835 0.2835 0.2835 0.2845 0.2874 0.2923 0.29...,True,41401,41401: Coronary atherosclerosis of native coro...,60-69,1,,CAD_4140x
1,p010049,10049,p01,3456766_0017,125,63.0,M,2720;4019;41011;4111;41401;41402;5118;5849;996...,0.4790 0.4761 0.4741 0.4731 0.4721 0.4721 0.47...,0.6706 0.6686 0.6657 0.6628 0.6588 0.6540 0.64...,...,0.5122 0.5122 0.5112 0.5103 0.5103 0.5093 0.50...,0.5152 0.5142 0.5122 0.5054 0.4966 0.4907 0.48...,0.5034 0.5024 0.5015 0.4995 0.4976 0.4956 0.49...,True,41401;41402,41401: Coronary atherosclerosis of native coro...,60-69,1,,CAD_4140x
2,p010134,10134,p01,3885858_0003,125,82.0,F,0088;25000;2762;27650;2851;28521;3668;40390;40...,0.3470 0.3421 0.3372 0.3333 0.3294 0.3255 0.32...,0.2454 0.2454 0.2444 0.2444 0.2444 0.2434 0.24...,...,0.2522 0.2522 0.2532 0.2532 0.2522 0.2522 0.25...,0.7928 0.7879 0.7801 0.7693 0.7576 0.7439 0.72...,0.2600 0.2600 0.2600 0.2620 0.2639 0.2688 0.27...,True,41400,41400: Coronary atherosclerosis of unspecified...,80+,1,,CAD_4140x
3,p010152,10152,p01,3633081_0012,125,74.0,M,0389;2851;29410;3310;4019;41071;412;41401;4260...,0.5083 0.5024 0.4976 0.4917 0.4868 0.4829 0.47...,0.6628 0.6608 0.6598 0.6598 0.6598 0.6608 0.66...,...,0.5582 0.5523 0.5474 0.5425 0.5376 0.5327 0.52...,0.4487 0.4457 0.4438 0.4418 0.4418 0.4438 0.44...,0.5591 0.5621 0.5650 0.5670 0.5670 0.5670 0.56...,True,41401,41401: Coronary atherosclerosis of native coro...,70-79,1,,CAD_4140x
4,p010188,10188,p01,3840007_0001,125,63.0,M,03849;0389;0417;1122;1125;2554;2724;2738;2763;...,0.3363 0.3333 0.3294 0.3265 0.3236 0.3196 0.31...,0.2444 0.2414 0.2395 0.2395 0.2424 0.2483 0.26...,...,0.4409 0.4389 0.4370 0.4350 0.4330 0.4311 0.42...,0.3617 0.3597 0.3568 0.3548 0.3529 0.3500 0.34...,0.4653 0.5054 0.5435 0.5758 0.6041 0.6266 0.64...,True,41401,41401: Coronary atherosclerosis of native coro...,60-69,1,,CAD_4140x


In [2]:
import os
import pandas as pd
outpath = '/content/drive/MyDrive/Colab Notebooks/Final Project/Ensemble Model'
path = os.path.join(outpath, 'ppg-test-data-without-label.csv')

x = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Final Project/Ensemble Model/Copy of PPG_test_data.csv')

In [3]:
x.columns

Index(['patient_id', 'SUBJECT_ID', 'group', 'record_id', 'fs', 'AGE', 'GENDER',
       'ICD9_CODES', 'ppg_min_3', 'ppg_min_4', 'ppg_min_5', 'ppg_min_6',
       'ppg_min_7', 'ppg_min_8', 'has_4140x', '4140x_codes', 'code_meanings',
       'age_group', 'label', 'has_any_heart_disease'],
      dtype='object')

In [5]:
columns_to_delete = ['has_4140x', '4140x_codes', 'code_meanings', 'label', 'has_any_heart_disease']
x = x.drop(columns=columns_to_delete)
print("Columns after deletion:")
display(x.columns)

KeyError: "['has_4140x', '4140x_codes', 'code_meanings', 'label', 'has_any_heart_disease'] not found in axis"

In [6]:
x.to_csv(path, index=False)