In [877]:
# Model 1:
# Outcome Variables (Targets):
# Maternal mortality rate
# Infant mortality rate
# Predictor Variables:
# Income metrics (e.g., average income, income inequality measures)
# Educational attainment (e.g., college attainment rates)
# Interaction term: Income × Educational Attainment


In [878]:
import pandas as pd

In [879]:
df = pd.read_csv("/Users/rahulramakrishnan/Documents/love_hate_relationship/competitions/cmda_competition/datasets/custom/final_data.csv")

In [880]:
columns_list = df.columns.tolist()
print(columns_list)

['year', 'all_infant_deaths_all_mothers', 'all_infant_deaths_all_races__hispanic', 'all_infant_deaths_all_races__hispanic__central_and_south_american', 'all_infant_deaths_all_races__hispanic__cuban', 'all_infant_deaths_all_races__hispanic__mexican', 'all_infant_deaths_all_races__hispanic__other', 'all_infant_deaths_all_races__hispanic__puerto_rican', 'all_infant_deaths_american_indian_and_alaska_native', 'all_infant_deaths_american_indian_and_alaska_native__non-hispanic', 'all_infant_deaths_american_indian_and_alaska_native_only', 'all_infant_deaths_american_indian_and_alaska_native_only__non-hispanic', 'all_infant_deaths_asian_only__non-hispanic', 'all_infant_deaths_asian_or_pacific_islander', 'all_infant_deaths_asian_or_pacific_islander__non-hispanic', 'all_infant_deaths_black', 'all_infant_deaths_black__non-hispanic', 'all_infant_deaths_black_only', 'all_infant_deaths_black_only__non-hispanic', 'all_infant_deaths_native_hawaiian_or_other_pacific_islander_only__non-hispanic', 'all_in

In [881]:
import numpy as np
from scipy.stats import zscore
import pandas as pd

def fill_missing_values(df):
    """
    Fill missing values in the DataFrame.
    - Numeric columns: Use group-based median for better precision.
    - Categorical columns: Use mode.
    """
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']:  # Numeric columns
            # Use group-based median for imputation
            if 'year' in df.columns:  # Check if "year" column exists for grouping
                df[col] = df.groupby('year')[col].transform(lambda x: x.fillna(x.median()))
            else:
                df[col].fillna(df[col].median(), inplace=True)
        else:  # Non-numeric columns
            df[col].fillna(df[col].mode()[0], inplace=True)  # Use mode for categorical columns
    return df

def handle_outliers(df, threshold=3):
    """
    Handle outliers in the DataFrame.
    - Replace extreme outliers with group-based median values for numeric columns.
    """
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns  # Select numeric columns
    for col in numeric_cols:
        # Calculate Z-scores
        if df[col].notnull().sum() > 0:  # Ensure column has non-null values
            z_scores = zscore(df[col].dropna())
            outliers = np.abs(z_scores) > threshold  # Identify outliers beyond threshold
            if outliers.any():
                # Replace outliers with group-based median if 'year' exists, else global median
                if 'year' in df.columns:
                    median = df.groupby('year')[col].transform('median')
                    df.loc[outliers, col] = median
                else:
                    median = df[col].median()
                    df.loc[outliers, col] = median
    return df

# Apply cleaning steps
# Load your DataFrame here

# Handle missing values and outliers
df = fill_missing_values(df)
df = handle_outliers(df)

# Validate the cleaning
print("Missing values after cleaning:")
print(df.isnull().sum())
print("\nSummary statistics after outlier handling:")
print(df.describe())

Missing values after cleaning:
year                                                                 0
all_infant_deaths_all_mothers                                        0
all_infant_deaths_all_races__hispanic                                0
all_infant_deaths_all_races__hispanic__central_and_south_american    0
all_infant_deaths_all_races__hispanic__cuban                         0
                                                                    ..
college_mkt_pacis                                                    0
college_dif_pacis                                                    0
college_col_twora                                                    0
college_mkt_twora                                                    0
college_dif_twora                                                    0
Length: 155, dtype: int64

Summary statistics after outlier handling:
             year  all_infant_deaths_all_mothers  \
count     8.00000                       8.000000   
mean   2020.50

In [882]:
import pandas as pd
import numpy as np

# Assuming `df` is the DataFrame loaded with your dataset

# 1. Income-Education Interaction Terms
for group, col_suffix in [('white', 'white'), ('black', 'black'), ('asian', 'asian'), ('hispanic', 'hispa')]:
    # Income-to-college ratios
    df[f'{group}_income_college_ratio'] = df[f'{group}_weekly_wage'] / (df[f'college_col_{col_suffix}'] + 1e-6)

    # Interaction between income and education levels
    df[f'{group}_income_edu_interaction'] = df[f'{group}_weekly_wage'] * df[f'college_col_{col_suffix}']

# 2. Mortality Metrics
# Neonatal vs. Postneonatal deaths ratio
if 'neonatal_deaths_all_mothers' in df.columns and 'postneonatal_deaths_all_mothers' in df.columns:
    df['neonatal_to_postneonatal_ratio'] = df['neonatal_deaths_all_mothers'] / (df['postneonatal_deaths_all_mothers'] + 1e-6)

# Maternal mortality per live birth
if 'Maternal Deaths' in df.columns and 'Live Births' in df.columns:
    df['maternal_mortality_per_birth'] = df['Maternal Deaths'] / (df['Live Births'] + 1e-6)

# Total infant mortality for all races
infant_death_columns = [col for col in df.columns if 'all_infant_deaths' in col]
df['total_infant_mortality'] = df[infant_death_columns].sum(axis=1)

# 3. Racial Disparity Metrics
# Black vs. White infant mortality ratio
if 'all_infant_deaths_black' in df.columns and 'all_infant_deaths_white' in df.columns:
    df['black_white_infant_mortality_ratio'] = df['all_infant_deaths_black'] / (df['all_infant_deaths_white'] + 1e-6)

# Wage gaps between racial groups
if 'white_weekly_wage' in df.columns and 'black_weekly_wage' in df.columns:
    df['white_black_wage_gap'] = df['white_weekly_wage'] - df['black_weekly_wage']

# 4. Income Inequality Metrics
wage_columns = [f'{group}_weekly_wage' for group in ['white', 'black', 'asian', 'hispanic']]
if all(col in df.columns for col in wage_columns):
    df['income_inequality'] = df[wage_columns].max(axis=1) / (df[wage_columns].min(axis=1) + 1e-6)

# 5. Chronic Risk Aggregates
chronic_columns = [
    'obesity__self-reported_all_ages', 'current_cigarette_smoking_all_ages', 'hypertension_diagnosis__self-reported_all_ages'
]
if all(col in df.columns for col in chronic_columns):
    df['chronic_risk_index'] = df[chronic_columns].sum(axis=1) / (df['Live Births'] + 1e-6)

# 6. Log Transformations (optional for skewed variables)
log_transform_columns = ['total_weekly_wage', 'Maternal Mortality Rate']
for col in log_transform_columns:
    if col in df.columns:
        df[f'log_{col}'] = np.log1p(df[col])


print("Feature Engineering Completed. Processed DataFrame saved.")


Feature Engineering Completed. Processed DataFrame saved.


  df[f'{group}_income_college_ratio'] = df[f'{group}_weekly_wage'] / (df[f'college_col_{col_suffix}'] + 1e-6)
  df[f'{group}_income_edu_interaction'] = df[f'{group}_weekly_wage'] * df[f'college_col_{col_suffix}']
  df[f'{group}_income_college_ratio'] = df[f'{group}_weekly_wage'] / (df[f'college_col_{col_suffix}'] + 1e-6)
  df[f'{group}_income_edu_interaction'] = df[f'{group}_weekly_wage'] * df[f'college_col_{col_suffix}']
  df[f'{group}_income_college_ratio'] = df[f'{group}_weekly_wage'] / (df[f'college_col_{col_suffix}'] + 1e-6)
  df[f'{group}_income_edu_interaction'] = df[f'{group}_weekly_wage'] * df[f'college_col_{col_suffix}']
  df[f'{group}_income_college_ratio'] = df[f'{group}_weekly_wage'] / (df[f'college_col_{col_suffix}'] + 1e-6)
  df[f'{group}_income_edu_interaction'] = df[f'{group}_weekly_wage'] * df[f'college_col_{col_suffix}']
  df['neonatal_to_postneonatal_ratio'] = df['neonatal_deaths_all_mothers'] / (df['postneonatal_deaths_all_mothers'] + 1e-6)
  df['maternal_mortality

In [883]:
print(df.columns.tolist())

['year', 'all_infant_deaths_all_mothers', 'all_infant_deaths_all_races__hispanic', 'all_infant_deaths_all_races__hispanic__central_and_south_american', 'all_infant_deaths_all_races__hispanic__cuban', 'all_infant_deaths_all_races__hispanic__mexican', 'all_infant_deaths_all_races__hispanic__other', 'all_infant_deaths_all_races__hispanic__puerto_rican', 'all_infant_deaths_american_indian_and_alaska_native', 'all_infant_deaths_american_indian_and_alaska_native__non-hispanic', 'all_infant_deaths_american_indian_and_alaska_native_only', 'all_infant_deaths_american_indian_and_alaska_native_only__non-hispanic', 'all_infant_deaths_asian_only__non-hispanic', 'all_infant_deaths_asian_or_pacific_islander', 'all_infant_deaths_asian_or_pacific_islander__non-hispanic', 'all_infant_deaths_black', 'all_infant_deaths_black__non-hispanic', 'all_infant_deaths_black_only', 'all_infant_deaths_black_only__non-hispanic', 'all_infant_deaths_native_hawaiian_or_other_pacific_islander_only__non-hispanic', 'all_in

In [884]:
## scaling

In [885]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset

# Define target variables and features
features = [
    'white_income_edu_interaction', 'black_income_edu_interaction', 'asian_income_edu_interaction', 'hispanic_income_edu_interaction',
    'white_income_college_ratio', 'black_income_college_ratio', 'asian_income_college_ratio', 'hispanic_income_college_ratio',
    'chronic_risk_index', 'income_inequality', 'black_white_infant_mortality_ratio', 'white_black_wage_gap'
]

# Maternal health model
target_maternal = 'maternal_mortality_per_birth'
# Child health model
target_child = 'total_infant_mortality'

# Train-test split
X_train, X_test, y_train_maternal, y_test_maternal = train_test_split(df[features], df[target_maternal], test_size=0.2, random_state=42)
X_train, X_test, y_train_child, y_test_child = train_test_split(df[features], df[target_child], test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear Regression
lr = LinearRegression()

# Child Model
lr.fit(X_train_scaled, y_train_child)
pred_child = lr.predict(X_test_scaled)
print("\nChild Health Model")
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test_child, pred_child)))
print("R^2:", r2_score(y_test_child, pred_child))

# Gradient Boosted Regressor
gbr = GradientBoostingRegressor(random_state=42)


# Child Model
gbr.fit(X_train, y_train_child)
pred_child_gbr = gbr.predict(X_test)
print("\nChild Health Model - Gradient Boosted Regressor")
print("RMSE:", np.sqrt(mean_squared_error(y_test_child, pred_child_gbr)))
print("R^2:", r2_score(y_test_child, pred_child_gbr))

# Feature Importance from Gradient Boosting
importance = gbr.feature_importances_
for i, feature in enumerate(features):
    print(f"Feature: {feature}, Importance: {importance[i]}")



Child Health Model
Linear Regression RMSE: 2.0077810725084944
R^2: 0.5666557554288237

Child Health Model - Gradient Boosted Regressor
RMSE: 3.6062149898339353
R^2: -0.3979883421556565
Feature: white_income_edu_interaction, Importance: 0.28088505984431594
Feature: black_income_edu_interaction, Importance: 0.10490496362588016
Feature: asian_income_edu_interaction, Importance: 0.01851392365718559
Feature: hispanic_income_edu_interaction, Importance: 0.1077051154189464
Feature: white_income_college_ratio, Importance: 0.2125480816452095
Feature: black_income_college_ratio, Importance: 0.1460052698781651
Feature: asian_income_college_ratio, Importance: 0.020519280149627058
Feature: hispanic_income_college_ratio, Importance: 0.011605593463292652
Feature: chronic_risk_index, Importance: 0.02395918129347201
Feature: income_inequality, Importance: 0.03688975504141511
Feature: black_white_infant_mortality_ratio, Importance: 0.03646377598249048
Feature: white_black_wage_gap, Importance: 0.0


In [886]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset

# Define specific mappings for racial groups based on the data dictionary
racial_groups = {
    "white": "all_infant_deaths_white",
    "black": "all_infant_deaths_black",
    "asian": "all_infant_deaths_asian_or_pacific_islander",
    "hispanic": "all_infant_deaths_all_races__hispanic"
}

results = {}

for group, target in racial_groups.items():
    print(f"\nModeling for {group.title()} Infant Mortality")

    # Define features specific to the group
    if group == "asian":
        income_col = f"{group}_income_college_ratio"
        edu_interaction = f"{group}_income_edu_interaction"
    elif group == "hispanic":
        income_col = "hispanic_income_college_ratio"
        edu_interaction = "hispanic_income_edu_interaction"
    else:
        income_col = f"{group}_income_college_ratio"
        edu_interaction = f"{group}_income_edu_interaction"

    features = [
        edu_interaction, income_col, "income_inequality",
        "chronic_risk_index", "black_white_infant_mortality_ratio", "white_black_wage_gap"
    ]

    # Ensure target and features exist
    if target not in df.columns or any(feature not in df.columns for feature in features):
        print(f"Skipping {group} due to missing data.")
        continue

    # Drop rows with missing values in features or target
    data = df[features + [target]].dropna()

    # Split the data
    X = data[features]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Gradient Boosted Regressor
    gbr = GradientBoostingRegressor(random_state=42)
    gbr.fit(X_train, y_train)

    # Make predictions
    y_pred = gbr.predict(X_test)

    # Evaluate the model
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"RMSE: {rmse:.4f}")
    print(f"R^2: {r2:.4f}")

    # Feature Importance
    importance = gbr.feature_importances_
    feature_importance = dict(zip(features, importance))
    sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

    print("Feature Importance:")
    for feature, importance in sorted_importance:
        print(f"  {feature}: {importance:.4f}")

    # Save results
    results[group] = {
        "rmse": rmse,
        "r2": r2,
        "feature_importance": sorted_importance
    }

# Summarize results
print("\nSummary of Results:")
for group, result in results.items():
    print(f"\n{group.title()} Infant Mortality")
    print(f"  RMSE: {result['rmse']:.4f}")
    print(f"  R^2: {result['r2']:.4f}")
    print(f"  Feature Importance:")
    for feature, importance in result['feature_importance']:
        print(f"    {feature}: {importance:.4f}")


Modeling for White Infant Mortality
RMSE: 0.0707
R^2: -0.9998
Feature Importance:
  white_income_edu_interaction: 0.4306
  black_white_infant_mortality_ratio: 0.2580
  income_inequality: 0.2039
  white_income_college_ratio: 0.1075
  chronic_risk_index: 0.0000
  white_black_wage_gap: 0.0000

Modeling for Black Infant Mortality
RMSE: 0.0707
R^2: 0.0000
Feature Importance:
  black_income_edu_interaction: 0.4306
  black_white_infant_mortality_ratio: 0.2580
  income_inequality: 0.2039
  black_income_college_ratio: 0.1075
  chronic_risk_index: 0.0000
  white_black_wage_gap: 0.0000

Modeling for Asian Infant Mortality
RMSE: 0.0707
R^2: 0.8750
Feature Importance:
  asian_income_edu_interaction: 0.4306
  black_white_infant_mortality_ratio: 0.2580
  income_inequality: 0.2039
  asian_income_college_ratio: 0.1075
  chronic_risk_index: 0.0000
  white_black_wage_gap: 0.0000

Modeling for Hispanic Infant Mortality
RMSE: 0.1424
R^2: -7.1078
Feature Importance:
  hispanic_income_edu_interaction: 0.607