Between Linear Regression and Random Forest models, which regression approach provides the most accurate predictions
of heart disease mortality rate based on demographic and geographic factors?

In [3]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the raw CSV export into a DataFrame.
data_file = "Heart_Disease_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County___2015-2017_20250314.csv"
df = pd.read_csv(data_file, sep=",")

print("Preview before cleaning:")
display(df.head())

# Force the target column to float: render every value as text, strip any
# thousands separators (","), then parse the result back as a number.
df["Data_Value"] = (
    df["Data_Value"]
    .astype(str)
    .str.replace(",", "")
    .astype(float)
)

print("\nPreview after removing commas in Data_Value:")
display(df.head())

# Confirm the resulting column dtypes.
print("\nData types after cleaning:")
print(df.dtypes)


Preview before cleaning:


Unnamed: 0,Year,LocationAbbr,LocationDesc,GeographicLevel,DataSource,Class,Topic,Data_Value,Data_Value_Unit,Data_Value_Type,...,Data_Value_Footnote,StratificationCategory1,Stratification1,StratificationCategory2,Stratification2,TopicID,LocationID,Y_lat,X_lon,Georeference Column
0,2016,PA,Delaware County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,295.7,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Sex,Female,Race/Ethnicity,Black,T2,42045,39.916467,-75.406639,POINT (-75.40663919 39.916466986)
1,2016,MN,Minnesota,State,NVSS,Cardiovascular Diseases,Heart Disease Mortality,295.7,"per 100,000 population","Age-adjusted, 3-year Average Rate",...,,Sex,Male,Race/Ethnicity,Overall,T2,27,46.3159,-94.3068,POINT (-94.3068 46.3159)
2,2016,OH,Ohio,State,NVSS,Cardiovascular Diseases,Heart Disease Mortality,288.5,"per 100,000 population","Age-adjusted, 3-year Average Rate",...,,Sex,Female,Race/Ethnicity,Overall,T2,39,40.2882,-82.7866,POINT (-82.7866 40.2882)
3,2016,TX,Bell County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,233.9,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Sex,Male,Race/Ethnicity,Hispanic,T2,48027,31.045598,-97.482273,POINT (-97.48227324 31.045597981000004)
4,2016,DE,Delaware,State,NVSS,Cardiovascular Diseases,Heart Disease Mortality,169.7,"per 100,000 population","Age-adjusted, 3-year Average Rate",...,,Sex,Male,Race/Ethnicity,Hispanic,T2,10,38.9952,-75.5014,POINT (-75.5014 38.9952)



Preview after removing commas in Data_Value:


Unnamed: 0,Year,LocationAbbr,LocationDesc,GeographicLevel,DataSource,Class,Topic,Data_Value,Data_Value_Unit,Data_Value_Type,...,Data_Value_Footnote,StratificationCategory1,Stratification1,StratificationCategory2,Stratification2,TopicID,LocationID,Y_lat,X_lon,Georeference Column
0,2016,PA,Delaware County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,295.7,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Sex,Female,Race/Ethnicity,Black,T2,42045,39.916467,-75.406639,POINT (-75.40663919 39.916466986)
1,2016,MN,Minnesota,State,NVSS,Cardiovascular Diseases,Heart Disease Mortality,295.7,"per 100,000 population","Age-adjusted, 3-year Average Rate",...,,Sex,Male,Race/Ethnicity,Overall,T2,27,46.3159,-94.3068,POINT (-94.3068 46.3159)
2,2016,OH,Ohio,State,NVSS,Cardiovascular Diseases,Heart Disease Mortality,288.5,"per 100,000 population","Age-adjusted, 3-year Average Rate",...,,Sex,Female,Race/Ethnicity,Overall,T2,39,40.2882,-82.7866,POINT (-82.7866 40.2882)
3,2016,TX,Bell County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,233.9,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Sex,Male,Race/Ethnicity,Hispanic,T2,48027,31.045598,-97.482273,POINT (-97.48227324 31.045597981000004)
4,2016,DE,Delaware,State,NVSS,Cardiovascular Diseases,Heart Disease Mortality,169.7,"per 100,000 population","Age-adjusted, 3-year Average Rate",...,,Sex,Male,Race/Ethnicity,Hispanic,T2,10,38.9952,-75.5014,POINT (-75.5014 38.9952)



Data types after cleaning:
Year                            int64
LocationAbbr                   object
LocationDesc                   object
GeographicLevel                object
DataSource                     object
Class                          object
Topic                          object
Data_Value                    float64
Data_Value_Unit                object
Data_Value_Type                object
Data_Value_Footnote_Symbol     object
Data_Value_Footnote            object
StratificationCategory1        object
Stratification1                object
StratificationCategory2        object
Stratification2                object
TopicID                        object
LocationID                      int64
Y_lat                         float64
X_lon                         float64
Georeference Column            object
dtype: object


In [4]:
# Column the models are trained to predict.
target = "Data_Value"

# Predictor columns: year, location descriptors, data-source/topic labels,
# the two stratification pairs, and the latitude/longitude coordinates.
features = [
    "Year",
    "LocationAbbr", "LocationDesc", "GeographicLevel",
    "DataSource", "Class", "Topic",
    "StratificationCategory1", "Stratification1",
    "StratificationCategory2", "Stratification2",
    "Y_lat", "X_lon",
]

# Rows without a target value cannot be used for supervised training.
df = df.dropna(subset=[target])

# Work on copies so later steps never write back into `df`.
X, y = df[features].copy(), df[target].copy()

print(f"Number of rows after dropping missing '{target}': {len(X)}")

Number of rows after dropping missing 'Data_Value': 32230


In [5]:
# Columns passed through the numeric branch of the preprocessor.
numeric_cols = ["Year", "Y_lat", "X_lon"]

# Every remaining feature is handled as categorical; the order of `features`
# is preserved.
categorical_cols = [name for name in features if name not in numeric_cols]

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['Year', 'Y_lat', 'X_lon']
Categorical columns: ['LocationAbbr', 'LocationDesc', 'GeographicLevel', 'DataSource', 'Class', 'Topic', 'StratificationCategory1', 'Stratification1', 'StratificationCategory2', 'Stratification2']


In [6]:
# Numeric branch: fill missing values with the column mean.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
])

# Categorical branch: replace missing values with the literal "missing",
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [7]:
# Build one end-to-end pipeline per model: preprocessing followed by the
# regressor.
#
# Each pipeline receives its own clone of `preprocessor`. Pipeline.fit fits
# its steps in place, so sharing a single ColumnTransformer instance would let
# fitting one pipeline overwrite the fitted state held by the other.
# clone() copies the (unfitted) configuration only, so model results are
# unchanged.
pipeline_lr = Pipeline([
    ("preprocessing", clone(preprocessor)),
    ("regressor", LinearRegression())
])

# random_state is fixed so the forest is reproducible across runs.
pipeline_rf = Pipeline([
    ("preprocessing", clone(preprocessor)),
    ("regressor", RandomForestRegressor(random_state=42))
])

In [8]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (22561, 13)
Test set size: (9669, 13)


In [9]:
# 10-fold cross-validation on the training split only. Folds are shuffled
# with a fixed seed so both models are scored on identical partitions.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Arguments shared by both cross-validation runs. scikit-learn reports the
# negated MSE ("higher is better"), so the sign is flipped back below.
cv_settings = {"cv": kf, "scoring": "neg_mean_squared_error"}

# --- Linear Regression ---
cv_scores_lr = cross_val_score(pipeline_lr, X_train, y_train, **cv_settings)
mse_scores_lr = -cv_scores_lr
avg_mse_lr = np.mean(mse_scores_lr)

print("Linear Regression - CV MSE Scores:", mse_scores_lr)
print("Linear Regression - Average CV MSE:", avg_mse_lr)

# --- Random Forest ---
cv_scores_rf = cross_val_score(pipeline_rf, X_train, y_train, **cv_settings)
mse_scores_rf = -cv_scores_rf
avg_mse_rf = np.mean(mse_scores_rf)

print("\nRandom Forest - CV MSE Scores:", mse_scores_rf)
print("Random Forest - Average CV MSE:", avg_mse_rf)

Linear Regression - CV MSE Scores: [ 6121.03393663  6249.24661115  6184.17401563  5260.59167255
  6191.14190008 11660.52525707  6865.70834153  5899.59987596
  6265.43099945  5805.75734106]
Linear Regression - Average CV MSE: 6650.32099511113

Random Forest - CV MSE Scores: [5280.02583651 4769.19218718 5412.54493091 3640.64996306 6427.54607329
 9394.66254286 5174.93326611 3718.53547662 4956.98324001 3966.92928251]
Random Forest - Average CV MSE: 5274.200279905574


In [10]:
# Refit each pipeline on the full training split, then score it once on the
# held-out test split.

# Linear Regression
y_pred_lr = pipeline_lr.fit(X_train, y_train).predict(X_test)
test_mse_lr = mean_squared_error(y_test, y_pred_lr)

# Random Forest
y_pred_rf = pipeline_rf.fit(X_train, y_train).predict(X_test)
test_mse_rf = mean_squared_error(y_test, y_pred_rf)

print("\n===== Final Test Set MSE =====")
print("Linear Regression:", test_mse_lr)
print("Random Forest:", test_mse_rf)


===== Final Test Set MSE =====
Linear Regression: 5764.133264367884
Random Forest: 4038.3269779670663


In [12]:
# Pool every MSE produced in this notebook: all CV fold scores for both
# models plus the two test-set scores. The pool defines the min/max used to
# rescale the test MSEs onto [0, 1].
# NOTE(review): the scaled value depends on the spread of the fold scores in
# this particular run, so it only expresses a relative position within the
# run and is not comparable across runs or datasets.
all_mse_values = mse_scores_lr.tolist() + mse_scores_rf.tolist() + [test_mse_lr, test_mse_rf]

min_mse = min(all_mse_values)
max_mse = max(all_mse_values)


def normalize(x):
    """Min-max scale an MSE value against the pooled range.

    Parameters
    ----------
    x : float
        MSE value to rescale.

    Returns
    -------
    float
        ``(x - min_mse) / (max_mse - min_mse)``; ``0.0`` when every pooled
        MSE is identical, which would otherwise divide by zero.
    """
    span = max_mse - min_mse
    if span == 0:
        return 0.0
    return (x - min_mse) / span


normalized_test_lr = normalize(test_mse_lr)
normalized_test_rf = normalize(test_mse_rf)

print("\n===== Normalized Test Set MSE (between 0 and 1) =====")
print(f"Linear Regression - Normalized Test MSE: {normalized_test_lr:.3f}")
print(f"Random Forest    - Normalized Test MSE: {normalized_test_rf:.3f}")



===== Normalized Test Set MSE (between 0 and 1) =====
Linear Regression - Normalized Test MSE: 0.265
Random Forest    - Normalized Test MSE: 0.050
