# GENHLTH Demographic Analysis
## Predicting General Health Status from Demographics using Random Forest

This notebook analyzes the relationship between demographic variables and general health status (GENHLTH) in the BRFSS dataset using a Random Forest machine learning model.

## Setup and Data Loading

In [None]:
import os
import sys
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

def is_colab():
    """Return True when this code is running inside Google Colab.

    Detection looks for ``'google.colab'`` in the string form of the active
    IPython shell object. ``get_ipython`` is only injected by IPython, so when
    the notebook is executed as a plain Python script the name does not exist;
    that situation is reported as "not Colab" instead of raising ``NameError``.

    Returns:
        bool: True if the active shell's string form mentions
        ``google.colab``, otherwise False.
    """
    try:
        shell = get_ipython()
    except NameError:
        # Not running under IPython at all, so this cannot be Colab.
        return False
    return 'google.colab' in str(shell)

# On Google Colab the project checkout is not present up front, so clone it
# (or refresh an existing clone) and put it on sys.path before importing.
if is_colab():
    GIT_REPO_URL = 'https://github.com/sksizer/dat490.git'
    LOCAL_DIR = '/content/code/dat490'

    # Pick the git invocation that matches the state of the checkout.
    if os.path.exists(LOCAL_DIR):
        print(f"Repo already exists at {LOCAL_DIR}, pulling latest changes...")
        git_command = ['git', '-C', LOCAL_DIR, 'pull']
    else:
        print(f"Cloning repo into {LOCAL_DIR}...")
        git_command = ['git', 'clone', GIT_REPO_URL, LOCAL_DIR]
    # check=True makes a failed clone/pull stop the cell with an exception.
    subprocess.run(git_command, check=True)

    if LOCAL_DIR not in sys.path:
        sys.path.insert(0, LOCAL_DIR)
        print(f"Added {LOCAL_DIR} to sys.path")

# Both environments finish with the same import: on Colab the checkout above
# makes it resolvable, locally the package is assumed to be available already.
import dat490

In [None]:
# Store GENHLTH metadata for later use in the confusion matrix.
#
# The exploration printout that used to live in this cell was a line-for-line
# copy of the "Explore Target Variable: GENHLTH" cell that follows, so the
# same report was printed twice. Only the metadata capture below is unique to
# this cell, so that is all it does now.
#
# NOTE(review): `df` and `metadata` are not created by any cell visible in
# this notebook; presumably a dat490 data-loading step has to run before this
# cell -- confirm, and restore that cell here if it was lost.
genhlth_metadata = metadata.get('GENHLTH') if 'GENHLTH' in df.columns else None

## Explore Target Variable: GENHLTH

In [None]:
# Examine the GENHLTH target: print its codebook entry (when metadata is
# available) followed by the observed distribution of response codes.
if 'GENHLTH' in df.columns:
    genhlth_meta = metadata.get('GENHLTH')
    if genhlth_meta:
        print(f"Column: {genhlth_meta.sas_variable_name}")
        print(f"Label: {genhlth_meta.label}")
        print(f"Question: {genhlth_meta.question}")
        print(f"\nValue mappings:")
        for value, description in genhlth_meta.value_lookup.items():
            print(f"  {value}: {description}")

    # Show value counts. dropna=False keeps unanswered rows in the tally; with
    # the default (dropna=True) NaN never reaches the index, so the "Missing"
    # line could only ever be printed for valid codes by mistake.
    print("\nGENHLTH Distribution:")
    genhlth_counts = df['GENHLTH'].value_counts(dropna=False).sort_index()
    for value, count in genhlth_counts.items():
        if pd.isna(value):
            print(f"  {value}: Missing (Count: {count:,})")
            continue
        # Without metadata, fall back to the bare code instead of labelling a
        # valid answer as "Missing".
        fallback = f"Code {value}"
        if genhlth_meta:
            description = genhlth_meta.value_lookup.get(int(value), fallback)
        else:
            description = fallback
        print(f"  {int(value)}: {description} (Count: {count:,})")
else:
    print("GENHLTH column not found. Searching for similar health status variables...")
    health_cols = [col for col in df.columns if 'HLTH' in col or 'HEALTH' in col]
    print(f"Health-related columns found: {health_cols}")

## Identify Demographics Features

In [None]:
# Get Demographics section features.
# A metadata entry without a section name (attribute missing or None) is
# treated as "not Demographics" rather than raising on the `in` test.
demographics_columns = [
    col for col, meta in metadata.items()
    if 'Demographics' in (getattr(meta, 'section_name', None) or '')
]

print(f"Demographics columns ({len(demographics_columns)}): {demographics_columns}")

# Also look for calculated demographic variables (underscore-prefixed columns
# whose name mentions one of the demographic keywords).
calc_demo_columns = [
    col for col in df.columns 
    if col.startswith('_') and any(demo in col.upper() for demo in ['AGE', 'RACE', 'SEX', 'INCOME', 'EDUC'])
]

print(f"\nCalculated demographic columns: {calc_demo_columns}")

# Combine and filter to available columns.
# dict.fromkeys de-duplicates while keeping first-seen order. A plain set()
# iterates strings in a hash-seed-dependent order, which would shuffle the
# feature columns (and therefore the fitted forest) from one kernel to the next.
all_demo_features = list(dict.fromkeys(demographics_columns + calc_demo_columns))
available_demo_features = [col for col in all_demo_features if col in df.columns]

print(f"\nFinal demographics features to use ({len(available_demo_features)}): {available_demo_features}")

In [None]:
# Create visualizations for key demographic variables
key_demo_vars = ['MARITAL', 'EDUCA', 'EMPLOY1', 'INCOME3']
available_key_vars = [var for var in key_demo_vars if var in df.columns]

# Only variables with codebook metadata can be labelled. Filter before
# assigning panels so a skipped variable does not leave a hole in the grid.
plottable_vars = [var for var in available_key_vars if var in metadata][:4]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for ax, var in zip(axes, plottable_vars):
    var_meta = metadata[var]
    # value_counts() drops NaN by default, so every index entry is a real code.
    value_counts = df[var].value_counts().head(8)

    # Map values to descriptions
    labels = []
    for val in value_counts.index:
        desc = var_meta.value_lookup.get(int(val), f"Code {val}")
        # Truncate long labels so they fit next to the bars
        if len(desc) > 30:
            desc = desc[:27] + "..."
        labels.append(desc)

    ax.barh(range(len(labels)), value_counts.values)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=8)
    ax.set_title(f"{var}: {var_meta.label}", fontsize=10)
    ax.set_xlabel("Count")

# Hide panels that received no data instead of showing empty frames.
for ax in axes[len(plottable_vars):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## Data Preprocessing for Machine Learning

In [None]:
# Choose the modelling target and the demographic feature columns, then
# report how much of each selected column is missing.
target_col = 'GENHLTH'

# If the preferred target is absent, fall back to the first other column whose
# name contains 'HLTH'; stop with an error when there is none.
if target_col not in df.columns:
    print(f"Target variable {target_col} not found!")
    health_alternatives = [name for name in df.columns if 'HLTH' in name]
    print(f"Available health columns: {health_alternatives}")
    if not health_alternatives:
        raise ValueError("No suitable target variable found")
    target_col = health_alternatives[0]
    print(f"Using {target_col} as target variable instead")

# Keep only demographic features that exist in the data and are not the target.
feature_cols = [
    name for name in available_demo_features
    if name != target_col and name in df.columns
]

print(f"Target variable: {target_col}")
print(f"Feature variables ({len(feature_cols)}): {feature_cols}")

# Work on a copy so the loaded frame is left untouched.
analysis_cols = [target_col, *feature_cols]
analysis_df = df[analysis_cols].copy()

print(f"\nAnalysis dataset shape: {analysis_df.shape}")
print(f"Missing values per column:")
missing_counts = analysis_df.isnull().sum()
total_rows = len(analysis_df)
for col in analysis_cols:
    n_missing = missing_counts[col]
    missing_pct = n_missing / total_rows * 100
    print(f"  {col}: {n_missing:,} ({missing_pct:.1f}%)")

In [None]:
# Handle missing values and prepare for modeling.
# Rows without a target value cannot be used for supervised learning.
model_df = analysis_df.dropna(subset=[target_col]).copy()
print(f"After dropping rows with missing target: {model_df.shape}")

# For features, either drop columns with too many missing values or fill the
# gaps with the mode (features are treated as categorical codes).
missing_threshold = 0.3  # Drop columns with >30% missing

cols_to_keep = []
for col in feature_cols:
    missing_pct = model_df[col].isnull().sum() / len(model_df)
    if missing_pct <= missing_threshold:
        cols_to_keep.append(col)
    else:
        print(f"Dropping {col} due to {missing_pct:.1%} missing values")

feature_cols = cols_to_keep
print(f"\nFeatures after missing value filtering: {feature_cols}")

# Remove the rejected columns from the frame as well. Leaving them in made the
# "final shape" and "missing values remaining" lines below report on columns
# that are never modelled.
model_df = model_df[[target_col] + feature_cols].copy()

# Fill remaining missing values with mode (most common value).
# NOTE(review): the mode is taken over all rows, before the train/test split
# performed later in the notebook -- confirm that is acceptable here.
for col in feature_cols:
    if model_df[col].isnull().any():
        modes = model_df[col].mode()
        # mode() is empty only when the column holds no values at all.
        mode_value = modes.iloc[0] if not modes.empty else 0
        model_df[col] = model_df[col].fillna(mode_value)
        print(f"Filled {col} missing values with mode: {mode_value}")

print(f"\nFinal model dataset shape: {model_df.shape}")
print(f"Missing values remaining: {model_df.isnull().sum().sum()}")

## Random Forest Model

In [None]:
# Prepare features and target
X = model_df[feature_cols].copy()
y = model_df[target_col].copy()

# Convert features to numeric: label-encode text columns, coerce the rest.
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
    else:
        X[col] = pd.to_numeric(X[col], errors='coerce')

# errors='coerce' turns unparseable entries into NaN without raising, so
# report any feature column that ended up with NaN after the conversion.
coerced_nan_cols = [col for col in X.columns if X[col].isnull().any()]
if coerced_nan_cols:
    print(f"Warning: NaN values present after numeric conversion in: {coerced_nan_cols}")

# Ensure target is numeric
if y.dtype == 'object':
    le_target = LabelEncoder()
    # fit_transform returns a bare NumPy array; wrap it back into a Series on
    # the original index so y stays row-aligned with X in later cells.
    y = pd.Series(le_target.fit_transform(y.astype(str)), index=y.index, name=y.name)
else:
    y = pd.to_numeric(y, errors='coerce')

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target classes: {np.unique(y)}")
print(f"Class distribution:")
unique, counts = np.unique(y, return_counts=True)
for class_val, count in zip(unique, counts):
    print(f"  Class {class_val}: {count:,} ({count/len(y)*100:.1f}%)")

In [None]:
# Hold out 20% of the rows for testing, stratifying on the target, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

# Fit the Random Forest with the notebook's fixed hyperparameters.
print("\nTraining Random Forest model...")
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
print("Model training completed!")

## Model Evaluation

In [None]:
# Score the fitted forest on the held-out test rows.
y_pred_proba = rf_model.predict_proba(X_test)
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.3f}")

# 5-fold cross-validation on the training rows only; the reported spread is
# two standard deviations of the fold accuracies.
print("\nPerforming 5-fold cross-validation...")
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2
print(f"CV Accuracy: {cv_mean:.3f} (+/- {cv_spread:.3f})")

# Per-class precision / recall / F1 on the test rows.
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Confusion Matrix with proper labels
plt.figure(figsize=(8, 6))

# Fix the class order once and use it for both the matrix rows/columns and
# the tick labels, so the two can never disagree.
unique_classes = sorted(np.unique(np.concatenate([y_test, y_pred])))
cm = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Create proper class labels using GENHLTH metadata
if genhlth_metadata and hasattr(genhlth_metadata, 'value_lookup'):
    # NOTE(review): this lookup assumes the class values are the raw GENHLTH
    # codes, which holds only when the target was not label-encoded -- confirm.
    class_labels = [
        genhlth_metadata.value_lookup.get(int(class_val), f"Code {class_val}")
        for class_val in unique_classes
    ]
else:
    # Fallback to generic labels, named after the actual class values (an
    # index-based label would mislabel classes and could differ in length).
    class_labels = [f'Class {class_val}' for class_val in unique_classes]

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels,
            yticklabels=class_labels)
# A single backslash gives a real line break; '\\n' rendered a literal "\n".
plt.title('Confusion Matrix\nPredicting General Health Status from Demographics')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## Feature Importance Analysis

In [None]:
# Pair each feature with its importance from the fitted forest and rank them
# from most to least important.
importance_columns = {
    'feature': feature_cols,
    'importance': rf_model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_columns).sort_values('importance', ascending=False)

print("Feature Importance Rankings:")
for rank, entry in enumerate(feature_importance.head(10).to_dict('records'), start=1):
    print(f"{rank:2d}. {entry['feature']:15s}: {entry['importance']:.4f}")

# Horizontal bar chart of (at most) the 15 highest-ranked features.
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(10, 8))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Random Forest Feature Importance\n(Predicting General Health Status)')
# Flip the axis so the most important feature is drawn at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## Model Interpretation and Insights

In [None]:
# Describe the five highest-ranked features using their codebook metadata.
print("Top 5 Most Important Features Analysis:")
print("=" * 50)

top_five = feature_importance.head(5)
ranked_pairs = zip(top_five['feature'], top_five['importance'])

for rank, (feature_name, importance) in enumerate(ranked_pairs, start=1):
    print(f"\n{rank}. {feature_name} (Importance: {importance:.4f})")

    # Features without a metadata entry get a short note and nothing else.
    if feature_name not in metadata:
        print(f"   (No metadata available for calculated variable)")
        continue

    meta = metadata[feature_name]
    print(f"   Label: {meta.label}")
    print(f"   Question: {meta.question}")

    if feature_name not in model_df.columns:
        continue

    # Five most frequent values with their description and share of rows.
    value_counts = model_df[feature_name].value_counts().head(5)
    n_rows = len(model_df)
    print(f"   Top values:")
    for val, count in value_counts.items():
        desc = meta.value_lookup.get(val, f"Code {val}")
        pct = count / n_rows * 100
        print(f"     {val}: {desc} ({count:,}, {pct:.1f}%)")

In [None]:
# Print a recap of the data, the model and its headline metrics.
banner = "=" * 60
summary_lines = [
    "\n" + banner,
    "MODEL PERFORMANCE SUMMARY",
    banner,
    "Dataset: BRFSS 2023",
    f"Target Variable: {target_col} (General Health Status)",
    f"Number of Features: {len(feature_cols)}",
    f"Training Samples: {X_train.shape[0]:,}",
    f"Test Samples: {X_test.shape[0]:,}",
    "\nModel: Random Forest Classifier",
    f"Test Accuracy: {accuracy:.3f}",
    # Here the spread is one standard deviation of the fold accuracies.
    f"Cross-Validation Accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}",
    "\nTop 3 Most Important Demographics:",
]
print("\n".join(summary_lines))

top_three = feature_importance.head(3)
for rank, (name, score) in enumerate(zip(top_three['feature'], top_three['importance']), start=1):
    print(f"  {rank}. {name} (importance: {score:.3f})")

closing_lines = [
    "\nConclusion: Demographics show predictive power for general health status,",
    "with the most important factors being the top-ranked features above.",
]
print("\n".join(closing_lines))

## Hyperparameter Tuning (Optional)

In [None]:
# Optional: Hyperparameter tuning with GridSearchCV
# Uncomment and run this cell for better model performance
#
# The whole cell is disabled code. When enabled it searches the grid below on
# a sample of at most 10,000 training rows with 3-fold CV, refits the best
# estimator on the full training set and prints its test accuracy.
#
# NOTE(review): `y_train[X_sample.index]` selects by index label, which only
# lines up with X_sample when y_train is a pandas Series sharing X_train's
# index. If y_train is a NumPy array (for example after label-encoding the
# target), the same expression indexes by position instead -- confirm
# y_train's type before enabling this cell.

# print("Performing hyperparameter tuning...")
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [5, 10, 15, None],
#     'min_samples_split': [5, 10, 20],
#     'min_samples_leaf': [2, 5, 10]
# }

# # Use smaller sample for grid search to speed up
# sample_size = min(10000, len(X_train))
# X_sample = X_train.sample(n=sample_size, random_state=42)
# y_sample = y_train[X_sample.index]

# grid_search = GridSearchCV(
#     RandomForestClassifier(random_state=42, n_jobs=-1),
#     param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=1
# )

# grid_search.fit(X_sample, y_sample)
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best CV score: {grid_search.best_score_:.3f}")

# # Train final model with best parameters
# best_rf = grid_search.best_estimator_
# best_rf.fit(X_train, y_train)
# best_accuracy = accuracy_score(y_test, best_rf.predict(X_test))
# print(f"Tuned model test accuracy: {best_accuracy:.3f}")