In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, precision_score, recall_score, f1_score, accuracy_score

In [2]:
df = pd.read_csv("death.csv")
df.head()

Unnamed: 0,county,fips,met_objective_of_45_5_1,age_adjusted_death_rate,lower_95_confidence_interval_for_death_rate,upper_95_confidence_interval_for_death_rate,average_deaths_per_year,recent_trend_2,recent_5_year_trend_2_in_death_rates,lower_95_confidence_interval_for_trend,upper_95_confidence_interval_for_trend
0,United States,0,No,46.0,45.9,46.1,157376,falling,-2.4,-2.6,-2.2
1,"Perry County, Kentucky",21193,No,125.6,108.9,144.2,43,stable,-0.6,-2.7,1.6
2,"Powell County, Kentucky",21197,No,125.3,100.2,155.1,18,stable,1.7,0,3.4
3,"North Slope Borough, Alaska",2185,No,124.9,73.0,194.7,5,**,**,**,**
4,"Owsley County, Kentucky",21189,No,118.5,83.1,165.5,8,stable,2.2,-0.4,4.8


In [3]:
df.replace(["**", "*"], np.nan, inplace=True)
df.dropna(inplace=True)

df['met_objective_of_45_5_1'] = LabelEncoder().fit_transform(df['met_objective_of_45_5_1'])
df['recent_trend_2'] = LabelEncoder().fit_transform(df['recent_trend_2'])

numeric_columns = ["age_adjusted_death_rate", "lower_95_confidence_interval_for_death_rate", "upper_95_confidence_interval_for_death_rate", "average_deaths_per_year", "recent_5_year_trend_2_in_death_rates", "lower_95_confidence_interval_for_trend", "upper_95_confidence_interval_for_trend"]
for col in numeric_columns:
    df[col] = df[col].astype(str).str.replace(',', '').astype(float)

X = df.drop(columns=["age_adjusted_death_rate", "county", "fips"])
y = df["age_adjusted_death_rate"]

In [4]:
print("Missing Values:")
print(df.isnull().sum())

Missing Values:
county                                         0
fips                                           0
met_objective_of_45_5_1                        0
age_adjusted_death_rate                        0
lower_95_confidence_interval_for_death_rate    0
upper_95_confidence_interval_for_death_rate    0
average_deaths_per_year                        0
recent_trend_2                                 0
recent_5_year_trend_2_in_death_rates           0
lower_95_confidence_interval_for_trend         0
upper_95_confidence_interval_for_trend         0
dtype: int64


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
models = {
    "LinearRegression": (LinearRegression(), {}),
    "Ridge": (Ridge(), {"alpha": [0.1, 1.0, 10.0]}),
    "Lasso": (Lasso(), {"alpha": [0.01, 0.1, 1.0]}),
    "DecisionTree": (DecisionTreeRegressor(), {"max_depth": [3, 5, 10]}),
    "RandomForest": (RandomForestRegressor(), {"n_estimators": [50, 100, 200], "max_depth": [5, 10]}),
    "SVR": (SVR(), {"C": [0.1, 1, 10], "epsilon": [0.01, 0.1, 1.0]}),
    "GradientBoosting": (GradientBoostingRegressor(), {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2]}),
    "KNN": (KNeighborsRegressor(), {"n_neighbors": [3, 5, 7]})
}

In [9]:
for name, (model, params) in models.items():
    grid = GridSearchCV(model, params, scoring='neg_mean_squared_error', cv=5) if params else model
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_ if params else grid
    
    y_pred = best_model.predict(X_test)
    
    y_pred_class = np.where(y_pred > np.median(y_train), 1, 0)
    y_test_class = np.where(y_test > np.median(y_train), 1, 0)
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    precision = precision_score(y_test_class, y_pred_class, zero_division=0)
    recall = recall_score(y_test_class, y_pred_class, zero_division=0)
    f1 = f1_score(y_test_class, y_pred_class, zero_division=0)
    accuracy = accuracy_score(y_test_class, y_pred_class)
    
    print(f"Model: {name}")
    print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}")
    print("-" * 40)

Model: LinearRegression
MSE: 0.1166, MAE: 0.2560, RMSE: 0.3415, R2: 0.9993
Precision: 0.9893, Recall: 0.9964, F1 Score: 0.9928, Accuracy: 0.9926
----------------------------------------
Model: Ridge
MSE: 0.1200, MAE: 0.2570, RMSE: 0.3464, R2: 0.9993
Precision: 0.9858, Recall: 0.9964, F1 Score: 0.9911, Accuracy: 0.9907
----------------------------------------
Model: Lasso
MSE: 0.1181, MAE: 0.2577, RMSE: 0.3437, R2: 0.9993
Precision: 0.9893, Recall: 0.9964, F1 Score: 0.9928, Accuracy: 0.9926
----------------------------------------
Model: DecisionTree
MSE: 0.8363, MAE: 0.6301, RMSE: 0.9145, R2: 0.9949
Precision: 0.9819, Recall: 0.9784, F1 Score: 0.9802, Accuracy: 0.9796
----------------------------------------
Model: RandomForest
MSE: 0.2273, MAE: 0.2644, RMSE: 0.4768, R2: 0.9986
Precision: 0.9964, Recall: 0.9964, F1 Score: 0.9964, Accuracy: 0.9963
----------------------------------------
Model: SVR
MSE: 0.8038, MAE: 0.1726, RMSE: 0.8966, R2: 0.9951
Precision: 0.9964, Recall: 1.0000, F1 