In [85]:
import pandas as pd

In [86]:
# Location of the prepared competition dataset.
# The path can be overridden with the CMDA_DATA_PATH environment variable so the
# notebook can run on another machine; when the variable is unset, the original
# local path is used unchanged.
import os

DATA_PATH = os.environ.get(
    "CMDA_DATA_PATH",
    "/Users/rahulramakrishnan/Documents/love_hate_relationship/competitions/cmda_competition/datasets/custom/final_data_2.csv",
)
df = pd.read_csv(DATA_PATH)

In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Random forest regression of total infant deaths on chronic-condition columns.
# Reads the notebook-level DataFrame `df`; prints MAE, R², a feature-importance
# table and the first ten predicted-vs-actual rows.

# Define relevant features and target
chronic_conditions = [
    'asthma_episode/attack_all_ages',
    'cervical_cancer_all_ages',
    'copd__emphysema__chronic_bronchitis_all_ages',
    'coronary_heart_disease_all_ages',
    'diagnosed_diabetes__self-reported_all_ages',
    'hypertension_diagnosis__self-reported_all_ages',
    'obesity__self-reported_all_ages'
]

# Only the chronic-condition columns are used as predictors; the race-specific
# death columns are deliberately left out when predicting total deaths.
# A copy is taken so that later edits to one list cannot silently change the other.
features = list(chronic_conditions)
target = 'all_infant_deaths_all_mothers'

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features)  # Only scale numerical features
    ]
)

# Split data BEFORE fitting the scaler: the scaling statistics must be learned
# from the training rows only, otherwise information about the held-out rows
# leaks into preprocessing.
X = df[features]
y = df[target]
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept for any later cell that reads the fully transformed matrix; it is produced
# with the training-only scaler, so it carries no leakage either.
X_preprocessed = preprocessor.transform(X)

# Train Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and Evaluate on the held-out 20%
y_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, y_pred)
rf_r2 = r2_score(y_test, y_pred)

print(f"Random Forest Mean Absolute Error: {rf_mae:.2f}")
print(f"Random Forest R² Score: {rf_r2:.2f}")

# Feature Importances
feature_names = features  # Feature names come directly from the features list
importances = rf_model.feature_importances_
feature_importance_table = pd.DataFrame({
    'Feature': feature_names,
    'Importance Score': importances
}).sort_values(by='Importance Score', ascending=False)

# Display Feature Importance Table
print("\nFeature Importance Table:")
print(feature_importance_table.to_string(index=False))

# Prediction vs Actual Table
# `y_test` keeps its original row labels from `df`; reset_index makes the table
# start at 0 for display.
pred_actual_table = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Difference': y_test - y_pred
}).reset_index(drop=True)

# Display Predictions vs Actual Table
print("\nPredictions vs Actual Table:")
print(pred_actual_table.head(10))  # Display first 10 rows


Random Forest Mean Absolute Error: 0.08
Random Forest R² Score: 0.66

Feature Importance Table:
                                       Feature  Importance Score
                      cervical_cancer_all_ages          0.235809
               coronary_heart_disease_all_ages          0.210083
hypertension_diagnosis__self-reported_all_ages          0.156142
  copd__emphysema__chronic_bronchitis_all_ages          0.148668
               obesity__self-reported_all_ages          0.115898
    diagnosed_diabetes__self-reported_all_ages          0.072135
                asthma_episode/attack_all_ages          0.061266

Predictions vs Actual Table:
   Actual  Predicted  Difference
0     5.7      5.659       0.041
1     5.4      5.516      -0.116


In [88]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Per-race random forest models: for every racial group, regress that group's
# infant-death column on the chronic-condition columns and print MAE, R² and a
# feature-importance table. Reads the notebook-level DataFrame `df`.
racial_groups = ['white', 'black', 'asian_or_pacific_islander', 'all_races__hispanic']

# Define relevant features and target
chronic_conditions = [
    'asthma_episode/attack_all_ages',
    'cervical_cancer_all_ages',
    'copd__emphysema__chronic_bronchitis_all_ages',
    'coronary_heart_disease_all_ages',
    'current_cigarette_smoking_all_ages',
    'diagnosed_diabetes__self-reported_all_ages',
    'hypertension_diagnosis__self-reported_all_ages',
    'obesity__self-reported_all_ages'
]

# With test_size=0.2 a group needs at least 6 rows for the hold-out set to hold
# two rows; R² is not defined on fewer than two samples, and a single-row group
# cannot be split at all.
MIN_ROWS_PER_GROUP = 6

# Loop through each racial group
for race in racial_groups:
    race_target = f'all_infant_deaths_{race}'

    # Keep only rows with a positive death value for this racial group
    race_df = df[df[race_target] > 0]

    if race_df.empty:
        print(f"No data available for {race} group.")
        continue

    if len(race_df) < MIN_ROWS_PER_GROUP:
        print(f"Not enough data for {race} group ({len(race_df)} rows).")
        continue

    X_race = race_df[chronic_conditions]
    y_race = race_df[race_target]

    # Split first, then scale: the scaler is fitted on the training rows only so
    # that statistics of the held-out rows do not leak into preprocessing.
    X_train_raw, X_test_raw, y_train_race, y_test_race = train_test_split(
        X_race, y_race, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train_race = scaler.fit_transform(X_train_raw)
    X_test_race = scaler.transform(X_test_raw)

    # Train model
    race_model = RandomForestRegressor(n_estimators=100, random_state=42)
    race_model.fit(X_train_race, y_train_race)

    # Evaluate on the held-out 20%
    y_pred_race = race_model.predict(X_test_race)
    print(f"\nRace: {race.capitalize()}")
    print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test_race, y_pred_race):.2f}")
    print(f"R² Score: {r2_score(y_test_race, y_pred_race):.2f}")

    # Feature Importance, highest score first
    importances = race_model.feature_importances_
    feature_importance_table = pd.DataFrame({
        'Feature': chronic_conditions,
        'Importance Score': importances
    }).sort_values(by='Importance Score', ascending=False)

    # Display Feature Importance as a Table
    print(f"\nFeature Importance for {race.capitalize()} Mothers:")
    print(feature_importance_table.to_string(index=False))



Race: White
Mean Absolute Error (MAE): 0.04
R² Score: 0.46

Feature Importance for White Mothers:
                                       Feature  Importance Score
  copd__emphysema__chronic_bronchitis_all_ages          0.343478
               coronary_heart_disease_all_ages          0.118841
    diagnosed_diabetes__self-reported_all_ages          0.117391
                asthma_episode/attack_all_ages          0.108696
                      cervical_cancer_all_ages          0.094203
               obesity__self-reported_all_ages          0.085507
hypertension_diagnosis__self-reported_all_ages          0.068116
            current_cigarette_smoking_all_ages          0.063768

Race: Black
Mean Absolute Error (MAE): 0.04
R² Score: 0.00

Feature Importance for Black Mothers:
                                       Feature  Importance Score
  copd__emphysema__chronic_bronchitis_all_ages          0.308889
            current_cigarette_smoking_all_ages          0.117647
                asthma