In [18]:
import pandas as pd
import numpy as np

In [20]:
# ==========================================
# STEP 1: Load Datasets
# ==========================================
# Reads both raw inputs from the current working directory:
#   UCI_data.csv   -> uci_data
#   brfss_data.csv -> brfss_data
print("\n[STEP 1] Loading datasets...")

uci_data = pd.read_csv('UCI_data.csv')

# The BRFSS file is decoded as latin-1; low_memory=False turns off pandas'
# chunked parsing for this (wide) file.
brfss_data = pd.read_csv(
    'brfss_data.csv',
    encoding='latin-1',
    low_memory=False,
)

# Row / column counts of each frame as loaded.
print(f"✓ UCI dataset loaded: {len(uci_data):,} patients, {len(uci_data.columns)} features")
print(f"✓ BRFSS dataset loaded: {len(brfss_data):,} survey responses, {len(brfss_data.columns)} features")


[STEP 1] Loading datasets...
✓ UCI dataset loaded: 101,766 patients, 50 features
✓ BRFSS dataset loaded: 491,775 survey responses, 330 features


In [22]:
# ==========================================
# STEP 2: Select Relevant BRFSS Columns
# ==========================================
print("\n[STEP 2] Selecting relevant BRFSS features...")

# Columns to keep, grouped by theme. The groups are flattened in the order written,
# which fixes the column order of `brfss_subset` (and of anything derived from it).
_brfss_column_groups = {
    # Demographics (for merging)
    'demographics': ['sex', 'X_age_g', 'X_race'],
    'health_conditions': [
        'diabete3',   # Diabetes status
        'cvdinfr4',   # Heart attack
        'cvdstrk3',   # Stroke
        'bphigh4',    # High blood pressure
        'toldhi2',    # High cholesterol
        'asthma3',    # Asthma
        'chccopd1',   # COPD
        'havarth3',   # Arthritis
        'addepev2',   # Depression
        'chckidny',   # Kidney disease
    ],
    'health_behaviors': [
        'smoke100',   # Smoked 100+ cigarettes
        'exerany2',   # Physical activity
    ],
    'healthcare_access': [
        'hlthpln1',   # Health insurance
        'medcost',    # Could not see doctor due to cost
        'checkup1',   # Last routine checkup
    ],
    'socioeconomic': [
        'income2',    # Income level
        'educa',      # Education level
        'employ1',    # Employment status
    ],
    'general_health': [
        'genhlth',    # General health status
        'physhlth',   # Physical health (days not good in past 30 days)
        'menthlth',   # Mental health (days not good in past 30 days)
    ],
}
brfss_columns = [
    column
    for group_columns in _brfss_column_groups.values()
    for column in group_columns
]

# Independent copy so later steps can add/overwrite columns without touching brfss_data.
brfss_subset = brfss_data.loc[:, brfss_columns].copy()
print(f"✓ Selected {len(brfss_columns)} relevant features from BRFSS")


[STEP 2] Selecting relevant BRFSS features...
✓ Selected 24 relevant features from BRFSS


In [24]:
# ==========================================
# STEP 3: Standardize BRFSS Demographics
# ==========================================
# Adds three columns to brfss_subset -- 'gender', 'age', 'race' -- whose labels are
# the ones later used to build the merge key. Any source label that is not a key of
# the relevant dictionary becomes NaN and the row is dropped at the end of the cell.
print("\n[STEP 3] Standardizing BRFSS demographics to match UCI format...")

# --- Gender: keep only the two labels below ---
gender_map = {'Male': 'Male', 'Female': 'Female'}
brfss_subset['gender'] = brfss_subset['sex'].map(gender_map)
gender_counts = brfss_subset['gender'].value_counts()
print(f"✓ Gender mapping complete")
print(f"  - Male: {gender_counts.get('Male', 0):,}")
print(f"  - Female: {gender_counts.get('Female', 0):,}")

# --- Age: six BRFSS bands -> six bracket labels ---
# The bands and the bracket labels do not line up year-for-year (e.g. 'Age 25 to 34'
# is labelled '[30-40)' and 'Age 65 or older' is labelled '[70-80)'), so this is an
# approximate crosswalk rather than an exact re-binning.
age_map = {
    'Age 18 to 24':    '[18-30)',
    'Age 25 to 34':    '[30-40)',
    'Age 35 to 44':    '[40-50)',
    'Age 45 to 54':    '[50-60)',
    'Age 55 to 64':    '[60-70)',
    'Age 65 or older': '[70-80)',
}
brfss_subset['age'] = brfss_subset['X_age_g'].map(age_map)
print(f"✓ Age group mapping complete (6 categories)")

# --- Race/ethnicity: four named groups, everything else listed here -> 'Other' ---
race_map = {
    'White only, non-Hispanic': 'Caucasian',
    'Black only, non-Hispanic': 'AfricanAmerican',
    'Asian only, non-Hispanic': 'Asian',
    'Hispanic': 'Hispanic',
}
# Two spellings of the American Indian / Alaskan Native label are accepted.
race_map.update({
    'American Indian or Alaskan Native only, Non-Hispanic': 'Other',
    'American Indian/Alaskan Native only, Non-Hispanic': 'Other',
    'Other race only, non-Hispanic': 'Other',
    'Multiracial, non-Hispanic': 'Other',
    'Native Hawaiian or other Pacific Islander only, Non-Hispanic': 'Other',
})
brfss_subset['race'] = brfss_subset['X_race'].map(race_map)
print(f"✓ Race/ethnicity mapping complete")

# Rows lacking any of the three standardized demographics cannot be keyed; drop them.
demographic_cols = ['gender', 'age', 'race']
brfss_subset = brfss_subset.dropna(subset=demographic_cols)
print(f"✓ Removed rows with missing demographics: {len(brfss_subset):,} rows remaining")


[STEP 3] Standardizing BRFSS demographics to match UCI format...
✓ Gender mapping complete
  - Male: 201,313
  - Female: 290,455
✓ Age group mapping complete (6 categories)
✓ Race/ethnicity mapping complete
✓ Removed rows with missing demographics: 483,203 rows remaining


In [26]:
# ==========================================
# STEP 4: Convert BRFSS Features to Numeric
# ==========================================
# Recodes the selected BRFSS text labels into numbers so they can be averaged per
# demographic group.
#
# Every label -> code recode reads the untouched labels from `brfss_data` (rows are
# matched through the index brfss_subset inherited from it) instead of reading the
# column it is about to overwrite. Re-running this cell therefore reproduces the same
# values; mapping the already-numeric column through the label dictionaries a second
# time would silently turn every feature into NaN.
print("\n[STEP 4] Converting BRFSS features to numeric values...")


def _recode_from_raw(column, mapping):
    """Return raw BRFSS `column` recoded through `mapping` for the rows kept in brfss_subset.

    Labels that are not keys of `mapping` (including missing values) come back as NaN.
    Relies on brfss_subset still carrying the row index it inherited from brfss_data.
    """
    return brfss_data.loc[brfss_subset.index, column].map(mapping)


# Binary Yes/No columns -> 1/0
# Only the exact labels 'Yes' and 'No' are recoded; any other answer becomes NaN.
# NOTE(review): if columns such as diabete3 / bphigh4 carry qualified answers
# (borderline, pregnancy-only, ...), those respondents are treated as missing here --
# confirm that is intended.
binary_cols = ['diabete3', 'cvdinfr4', 'cvdstrk3', 'bphigh4', 'toldhi2',
               'asthma3', 'chccopd1', 'havarth3', 'addepev2', 'chckidny',
               'smoke100', 'exerany2', 'hlthpln1', 'medcost']
yes_no_map = {'Yes': 1, 'No': 0}
for col in binary_cols:
    brfss_subset[col] = _recode_from_raw(col, yes_no_map)
print(f"✓ Converted {len(binary_cols)} binary features (Yes/No -> 1/0)")

# General health (ordinal: 1=Excellent to 5=Poor)
brfss_subset['genhlth'] = _recode_from_raw('genhlth', {
    'Excellent': 1, 'Very good': 2, 'Good': 3, 'Fair': 4, 'Poor': 5
})
print(f"✓ Converted general health to ordinal scale (1=Excellent, 5=Poor)")

# Health days (numeric). to_numeric is safe to re-run on an already-numeric column.
# NOTE(review): any non-numeric entry is coerced to NaN; if the export spells zero
# days as text, those respondents would be excluded from the group mean -- confirm.
brfss_subset['physhlth'] = pd.to_numeric(brfss_subset['physhlth'], errors='coerce')
brfss_subset['menthlth'] = pd.to_numeric(brfss_subset['menthlth'], errors='coerce')
print(f"✓ Converted physical/mental health days to numeric")

# Income (ordinal: 1=lowest to 8=highest)
income_map = {
    'Less than $10,000': 1,
    'Less than $15,000 ($10,000 to less than $15,000)': 2,
    'Less than $20,000 ($15,000 to less than $20,000)': 3,
    'Less than $25,000 ($20,000 to less than $25,000)': 4,
    'Less than $35,000 ($25,000 to less than $35,000)': 5,
    'Less than $50,000 ($35,000 to less than $50,000)': 6,
    'Less than $75,000 ($50,000 to less than $75,000)': 7,
    '$75,000 or more': 8
}
brfss_subset['income2'] = _recode_from_raw('income2', income_map)
print(f"✓ Converted income to ordinal scale (1=<$10k, 8=$75k+)")

# Education (ordinal: 1=lowest to 6=highest)
# Code 1 is 'Never attended school or only kindergarten'; elementary school is code 2.
educ_map = {
    'Never attended school or only kindergarten': 1,
    'Grades 1 through 8 (Elementary)': 2,
    'Grades 9 through 11 (Some high school)': 3,
    'Grade 12 or GED (High school graduate)': 4,
    'College 1 year to 3 years (Some college or technical school)': 5,
    'College 4 years or more (College graduate)': 6
}
brfss_subset['educa'] = _recode_from_raw('educa', educ_map)
print(f"✓ Converted education to ordinal scale (1=elementary, 6=college grad)")

# Checkup recency (ordinal: 1=most recent to 5=never)
checkup_map = {
    'Within past year (anytime less than 12 months ago)': 1,
    'Within past 2 years (1 year but less than 2 years ago)': 2,
    'Within past 5 years (2 years but less than 5 years ago)': 3,
    '5 or more years ago': 4,
    'Never': 5
}
brfss_subset['checkup1'] = _recode_from_raw('checkup1', checkup_map)
print(f"✓ Converted checkup recency to ordinal scale (1=within year, 5=never)")

# Employment (binary: 1=employed, 0=not employed)
brfss_subset['employ1'] = _recode_from_raw('employ1', {
    'Employed for wages': 1, 'Self-employed': 1,
    'Out of work for 1 year or more': 0, 'Out of work for less than 1 year': 0,
    'A homemaker': 0, 'A student': 0, 'Retired': 0, 'Unable to work': 0
})
print(f"✓ Converted employment to binary (1=employed, 0=not employed)")


[STEP 4] Converting BRFSS features to numeric values...
✓ Converted 14 binary features (Yes/No -> 1/0)
✓ Converted general health to ordinal scale (1=Excellent, 5=Poor)
✓ Converted physical/mental health days to numeric
✓ Converted income to ordinal scale (1=<$10k, 8=$75k+)
✓ Converted education to ordinal scale (1=elementary, 6=college grad)
✓ Converted checkup recency to ordinal scale (1=within year, 5=never)
✓ Converted employment to binary (1=employed, 0=not employed)


In [28]:
# ==========================================
# STEP 5: Aggregate BRFSS by Demographics
# ==========================================
# Collapses brfss_subset to one row per race/age/gender combination, holding the mean
# of every numeric feature, and prefixes those feature names with 'pop_'.
print("\n[STEP 5] Aggregating BRFSS data by demographic groups (race/age/gender)...")

# Create demographic key for grouping: "<race>_<age>_<gender>"
brfss_subset['demo_key'] = (brfss_subset['race'].astype(str) + '_' +
                             brfss_subset['age'].astype(str) + '_' +
                             brfss_subset['gender'].astype(str))

# Get numeric columns for aggregation (demographic/source/key columns are never features)
numeric_cols = [col for col in brfss_subset.columns
                if col not in ['sex', 'X_age_g', 'X_race', 'gender', 'age', 'race', 'demo_key']
                and pd.api.types.is_numeric_dtype(brfss_subset[col])]

# Aggregate by taking mean for each demographic group.
# NOTE(review): this is a plain (unweighted) mean over respondents -- confirm whether
# survey weights should be applied before treating these as population estimates.
brfss_aggregated = brfss_subset.groupby('demo_key')[numeric_cols].mean().reset_index()
brfss_aggregated.columns = ['demo_key'] + [f'pop_{col}' for col in numeric_cols]

print(f"✓ Aggregated BRFSS data into {len(brfss_aggregated)} demographic groups")
print(f"✓ Created {len(numeric_cols)} population-level features (prefixed with 'pop_')")

# Preview at most the first 10 groups; the "more" line is derived from how many were
# actually shown, so it is never negative and is omitted when nothing is left over.
print(f"\nDemographic groups in BRFSS:")
print(f"  - Total unique groups: {len(brfss_aggregated)}")
preview_keys = brfss_aggregated['demo_key'].head(10)
for demo_key in sorted(preview_keys):
    print(f"    • {demo_key}")
groups_not_shown = len(brfss_aggregated) - len(preview_keys)
if groups_not_shown > 0:
    print(f"    ... and {groups_not_shown} more")


[STEP 5] Aggregating BRFSS data by demographic groups (race/age/gender)...
✓ Aggregated BRFSS data into 60 demographic groups
✓ Created 21 population-level features (prefixed with 'pop_')

Demographic groups in BRFSS:
  - Total unique groups: 60
    • AfricanAmerican_[18-30)_Female
    • AfricanAmerican_[18-30)_Male
    • AfricanAmerican_[30-40)_Female
    • AfricanAmerican_[30-40)_Male
    • AfricanAmerican_[40-50)_Female
    • AfricanAmerican_[40-50)_Male
    • AfricanAmerican_[50-60)_Female
    • AfricanAmerican_[50-60)_Male
    • AfricanAmerican_[60-70)_Female
    • AfricanAmerican_[60-70)_Male
    ... and 50 more


In [30]:
# ==========================================
# STEP 6: Prepare UCI Data for Merging
# ==========================================
print("\n[STEP 6] Preparing UCI data for merge...")

# Translate UCI age brackets into the bracket labels used on the BRFSS side
def map_uci_age(age_str):
    """Return the BRFSS-side age label for a UCI age bracket.

    - Missing input (anything `pd.isna` reports as missing) -> None.
    - '[0-10)', '[10-20)', '[20-30)' -> '[18-30)' (youngest BRFSS label).
    - '[80-90)', '[90-100)'          -> '[70-80)' (oldest BRFSS label).
    - Any other value is returned unchanged.
    """
    if pd.isna(age_str):
        return None

    # Brackets that have no label of their own on the BRFSS side and get folded
    # into the nearest one; everything not listed passes straight through.
    folded_brackets = {
        '[0-10)': '[18-30)',
        '[10-20)': '[18-30)',
        '[20-30)': '[18-30)',
        '[80-90)': '[70-80)',
        '[90-100)': '[70-80)',
    }
    return folded_brackets.get(age_str, age_str)

# Keep the original 'age' column and add the BRFSS-compatible label next to it.
uci_data['age_mapped'] = uci_data['age'].map(map_uci_age)
print(f"✓ Mapped UCI age groups to match BRFSS categories")

# Clean UCI demographics:
#   gender -> surrounding whitespace removed, first letter upper-cased and the rest
#             lower-cased (that is what str.capitalize does)
#   race   -> the '?' placeholder is turned into a real missing value
uci_gender = uci_data['gender'].str.strip()
uci_data['gender'] = uci_gender.str.capitalize()
uci_data['race'] = uci_data['race'].replace('?', np.nan)

# Remove rows with missing race (cannot be merged)
print(f"✓ Original UCI dataset: {len(uci_data):,} patients")
uci_data_clean = uci_data.dropna(subset=['race']).copy()
print(f"✓ Removed {len(uci_data) - len(uci_data_clean):,} patients with missing race")
print(f"✓ Clean UCI dataset: {len(uci_data_clean):,} patients")

# Create demographic key "<race>_<age_mapped>_<gender>", built up one part at a time.
key_columns = ['race', 'age_mapped', 'gender']
uci_demo_key = uci_data_clean[key_columns[0]].astype(str)
for key_column in key_columns[1:]:
    uci_demo_key = uci_demo_key + '_' + uci_data_clean[key_column].astype(str)
uci_data_clean['demo_key'] = uci_demo_key

print(f"✓ Created demographic keys for merging")
print(f"  - Unique demographic groups in UCI: {uci_data_clean['demo_key'].nunique()}")


[STEP 6] Preparing UCI data for merge...
✓ Mapped UCI age groups to match BRFSS categories
✓ Original UCI dataset: 101,766 patients
✓ Removed 2,273 patients with missing race
✓ Clean UCI dataset: 99,493 patients
✓ Created demographic keys for merging
  - Unique demographic groups in UCI: 61


In [32]:
# ==========================================
# STEP 7: Merge Datasets
# ==========================================
# Attaches the per-group BRFSS means to every UCI patient through demo_key, then drops
# patients whose demographic group has no BRFSS row.
print("\n[STEP 7] Merging UCI and BRFSS datasets...")

# validate='many_to_one': many patients may share a group, but each group must occur
#   once in brfss_aggregated; pandas raises MergeError instead of silently duplicating
#   patient rows if that ever stops being true.
# indicator: records per row whether a BRFSS group was actually found, so "matched" is
#   decided by the merge itself rather than by whether one particular feature
#   (pop_genhlth) happens to be non-null.
merged_data = uci_data_clean.merge(
    brfss_aggregated,
    on='demo_key',
    how='left',
    validate='many_to_one',
    indicator='_brfss_match',
)
has_brfss = merged_data['_brfss_match'].eq('both')
# The indicator is bookkeeping only; remove it so it is not counted or saved as a feature.
merged_data = merged_data.drop(columns='_brfss_match')

merge_success_rate = (has_brfss.sum() / len(merged_data)) * 100
print(f"✓ Merge complete!")
print(f"  - Merged dataset shape: {merged_data.shape[0]:,} patients, {merged_data.shape[1]} features")
print(f"  - Merge success rate: {merge_success_rate:.2f}%")

# Remove patients without BRFSS data
patients_before = len(merged_data)
merged_data = merged_data[has_brfss].copy()
patients_removed = patients_before - len(merged_data)

if patients_removed > 0:
    print(f"✓ Removed {patients_removed} patient(s) without BRFSS data")


[STEP 7] Merging UCI and BRFSS datasets...
✓ Merge complete!
  - Merged dataset shape: 99,493 patients, 73 features
  - Merge success rate: 100.00%
✓ Removed 1 patient(s) without BRFSS data


In [34]:
# ==========================================
# STEP 8: Save Final Dataset
# ==========================================
# Writes merged_data as CSV into the current working directory, overwriting any
# existing file of the same name. index=False leaves the row index out of the file.
print("\n[STEP 8] Saving final merged dataset...")

merged_data.to_csv(
    path_or_buf='merged_uci_brfss.csv',
    index=False,
)
print(f"✓ Saved to 'merged_uci_brfss.csv'")


[STEP 8] Saving final merged dataset...
✓ Saved to 'merged_uci_brfss.csv'


In [40]:
# ==========================================
# STEP 9: Summary Statistics
# ==========================================
# Prints a text report on merged_data: size, the list of BRFSS-derived ('pop_') features,
# and the share of patients per race, gender, mapped age group and readmission outcome.


def _print_distribution(counts, total, indent):
    """Print one '<indent>- label: count (pct%)' line per entry of `counts`.

    counts: mapping/Series of label -> count, printed in its own order.
    total:  denominator for the percentage.
    indent: leading whitespace placed before the '- ' bullet.
    """
    for label, count in counts.items():
        pct = (count / total) * 100
        print(f"{indent}- {label}: {count:,} ({pct:.1f}%)")


print("\n" + "="*60)
print("MERGE SUMMARY")
print("="*60)

# Computed once and reused, so the count in the section header always matches the list.
pop_features = [c for c in merged_data.columns if c.startswith('pop_')]
total_patients = len(merged_data)

print(f"\nFinal Dataset:")
print(f"  - Total patients: {total_patients:,}")
print(f"  - Total features: {merged_data.shape[1]}")
# NOTE: this is the column count of uci_data_clean, so it also includes any columns
# added to the UCI frame while preparing the merge key, not only the columns loaded
# from the CSV.
print(f"  - Original UCI features: {uci_data_clean.shape[1]}")
print(f"  - New BRFSS features added: {len(pop_features)}")

print(f"\nNew Population-Level Features ({len(pop_features)} total):")
for i, feat in enumerate(pop_features, 1):
    print(f"  {i:2d}. {feat}")

print(f"\nDemographic Distribution:")
print(f"  Race:")
_print_distribution(merged_data['race'].value_counts(), total_patients, "    ")

print(f"\n  Gender:")
_print_distribution(merged_data['gender'].value_counts(), total_patients, "    ")

# Age groups are listed in label order rather than by frequency.
print(f"\n  Age Groups:")
_print_distribution(merged_data['age_mapped'].value_counts().sort_index(), total_patients, "    ")

print(f"\nReadmission Distribution:")
_print_distribution(merged_data['readmitted'].value_counts(), total_patients, "  ")

print("\n" + "="*60)
print("DATA MERGE COMPLETE")
print("="*60)


MERGE SUMMARY

Final Dataset:
  - Total patients: 99,492
  - Total features: 73
  - Original UCI features: 52
  - New BRFSS features added: 21

New Population-Level Features (21 total):
   1. pop_diabete3
   2. pop_cvdinfr4
   3. pop_cvdstrk3
   4. pop_bphigh4
   5. pop_toldhi2
   6. pop_asthma3
   7. pop_chccopd1
   8. pop_havarth3
   9. pop_addepev2
  10. pop_chckidny
  11. pop_smoke100
  12. pop_exerany2
  13. pop_hlthpln1
  14. pop_medcost
  15. pop_checkup1
  16. pop_income2
  17. pop_educa
  18. pop_employ1
  19. pop_genhlth
  20. pop_physhlth
  21. pop_menthlth

Demographic Distribution:
  Race:
    - Caucasian: 76,099 (76.5%)
    - AfricanAmerican: 19,210 (19.3%)
    - Hispanic: 2,037 (2.0%)
    - Other: 1,505 (1.5%)
    - Asian: 641 (0.6%)

  Gender:
    - Female: 53,575 (53.8%)
    - Male: 45,917 (46.2%)

  Age Groups:
    - [18-30): 2,453 (2.5%)
    - [30-40): 3,699 (3.7%)
    - [40-50): 9,465 (9.5%)
    - [50-60): 16,895 (17.0%)
    - [60-70): 21,988 (22.1%)
    - [70-80):