# Importing the Necessary Libraries:

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Preparing the Dataframe:

In [6]:
# Read the cleaned, all-years county dataset from disk, then preview
# the first five rows to sanity-check the load.
df = pd.read_csv(
    filepath_or_buffer='../data/cleaned/combined_all_years_cleaned_final.csv',
)
df.head(5)

Unnamed: 0,County,State,mean_life_expectancy,disability_rate,total_population,age_60_and_over,poverty_rate,housing_cost_burden,bachelors_and_above_percentage,median_gross_rent,...,median_household_income,per_capita_income,high_school_only_percentage,median_home_value,age_18_and_under_percentage,Year,white_percentage,black_percentage,native_american_percentage,asian_percentage
0,Abbeville County,South Carolina,76.554614,19.7,25387.0,14.7,20.8,221.0,11.7,593.0,...,35067.0,17876.0,34.8,90300.0,10.2,2012,69.953126,28.140387,0.070902,0.059085
1,Acadia Parish,Louisiana,74.133702,17.9,61611.0,10.9,19.1,339.0,9.3,541.0,...,38686.0,19369.0,42.2,88900.0,9.9,2012,78.789502,18.152603,0.22074,0.274302
2,Accomack County,Virginia,75.798941,11.8,33454.0,17.5,20.2,199.0,17.3,741.0,...,40780.0,22909.0,39.3,153800.0,8.0,2012,67.513601,28.755904,0.334788,0.122556
3,Ada County,Idaho,80.699209,9.6,394961.0,9.3,12.2,4338.0,37.8,821.0,...,55499.0,27522.0,20.8,193300.0,9.2,2012,91.925532,1.02719,0.536002,2.593674
4,Adair County,Iowa,78.760544,13.7,7628.0,18.6,9.9,43.0,11.5,511.0,...,47872.0,25147.0,45.5,96700.0,7.1,2012,97.758259,0.144206,0.157315,0.603041


In [8]:
# Print a structural summary of the frame: number of entries, and for each
# column its non-null count and dtype. Useful for spotting missing values
# or unexpected object columns before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24858 entries, 0 to 24857
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   County                          24858 non-null  object 
 1   State                           24858 non-null  object 
 2   mean_life_expectancy            24858 non-null  float64
 3   disability_rate                 24858 non-null  float64
 4   total_population                24858 non-null  float64
 5   age_60_and_over                 24858 non-null  float64
 6   poverty_rate                    24858 non-null  float64
 7   housing_cost_burden             24858 non-null  float64
 8   bachelors_and_above_percentage  24858 non-null  float64
 9   median_gross_rent               24858 non-null  float64
 10  age_15_to_44_percentage         24858 non-null  float64
 11  gini_index                      24858 non-null  float64
 12  less_than_9th_grade_percentage  

In [10]:
# Descriptive statistics for the numeric columns, flipped so that each
# column becomes a row (easier to read for a wide frame).
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean_life_expectancy,24858.0,77.365326,2.560676,66.047153,75.614112,77.491757,79.068485,92.25386
disability_rate,24858.0,15.703242,4.407639,3.7,12.6,15.3,18.4,36.9
total_population,24858.0,101895.308231,324975.256363,607.0,11392.25,26136.5,68324.25,10105720.0
age_60_and_over,24858.0,18.872476,6.893613,2.0,13.6,17.4,23.7,65.4
poverty_rate,24858.0,16.226442,6.488762,2.3,11.6,15.4,19.7,55.5
housing_cost_burden,24858.0,1136.591319,4659.09271,0.0,69.0,196.0,610.0,163104.0
bachelors_and_above_percentage,24858.0,20.421945,9.404331,2.8,13.9,18.2,24.5,80.2
median_gross_rent,24858.0,716.43133,203.878841,99.0,587.0,667.0,791.0,2316.0
age_15_to_44_percentage,24858.0,63.070822,21.253285,16.1,37.7,76.8,79.9,98.5
gini_index,24858.0,0.442139,0.035156,0.3023,0.4181,0.4398,0.4638,0.707


In [17]:
# Display the full list of column labels (the head() preview elides the
# middle columns with "...").
df.columns

Index(['County', 'State', 'mean_life_expectancy', 'disability_rate',
       'total_population', 'age_60_and_over', 'poverty_rate',
       'housing_cost_burden', 'bachelors_and_above_percentage',
       'median_gross_rent', 'age_15_to_44_percentage', 'gini_index',
       'less_than_9th_grade_percentage', 'health_insurance_rate',
       'unemployment_rate', 'median_household_income', 'per_capita_income',
       'high_school_only_percentage', 'median_home_value',
       'age_18_and_under_percentage', 'Year', 'white_percentage',
       'black_percentage', 'native_american_percentage', 'asian_percentage'],
      dtype='object')

# Splitting Data Into Features (X) and Target (y):

In [21]:
# Target: the county-level mean life expectancy column.
y = df['mean_life_expectancy']

# Features: every remaining column once the identifier columns, the year,
# and the target itself have been removed.
X = df.drop(
    ['County', 'State', 'Year', 'mean_life_expectancy'],
    axis='columns',
)

In [23]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffle
# (and therefore the split) reproducible across runs.
# NOTE(review): rows are assigned at random. If the same county appears in
# several years (the file name suggests all years are combined), closely
# related rows may land in both sets - confirm whether a county-grouped
# split is needed for an honest test score.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
    shuffle=True,
)

In [25]:
# Report (rows, columns) for each feature split, one line per split.
print(
    f"Training data shape: {X_train.shape}",
    f"Testing data shape: {X_test.shape}",
    sep="\n",
)

Training data shape: (19886, 21)
Testing data shape: (4972, 21)


# Initializing and Training the Model:

In [42]:
# Random forest with 100 trees; the seed fixes the bootstrap samples and
# feature sub-sampling so repeated fits give the same model.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [44]:
# Fit the forest on the training split only; the test split stays unseen
# until evaluation.
rf_model.fit(X=X_train, y=y_train)

# Making Predictions:

In [59]:
# Predict on both splits with the fitted model: the training predictions
# show in-sample fit, the test predictions show generalization.
train_predictions, test_predictions = (
    rf_model.predict(features) for features in (X_train, X_test)
)

# Evaluating the Model:

In [62]:
# Root-mean-squared error (same units as the target) for each split.
# A large gap between the two values indicates overfitting.
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)

print('Train RMSE:', f'{train_rmse:.2f}')
print('Test RMSE:', f'{test_rmse:.2f}')

Train RMSE: 0.31
Test RMSE: 0.82


In [65]:
from scipy.stats import pearsonr

# Visualizing Model Performance:

In [None]:
# Scatter plot for model performance: predicted vs. true life expectancy
# for both splits, with a 1:1 reference line (perfect predictions fall on it).

# Compute R^2 for each split here so the legend labels do not depend on
# variables that are never defined elsewhere in the notebook.
train_r_sq = r2_score(y_train, train_predictions)
test_r_sq = r2_score(y_test, test_predictions)

plt.figure(figsize=(5, 5))
# Training points: true values on x, the model's training predictions on y.
plt.scatter(y_train, train_predictions, s=10, c='C0', label=f'Train $R^2$={round(train_r_sq, 2)}')
plt.scatter(y_test, test_predictions, marker='x', s=10, c='orange', label=f'Test $R^2$={round(test_r_sq, 2)}')

# Endpoints of the 1:1 line. Take the extremes over BOTH splits (true and
# predicted) so the line spans every plotted point, not just the training set.
p1 = max(np.max(train_predictions), np.max(y_train), np.max(test_predictions), np.max(y_test))
p2 = min(np.min(train_predictions), np.min(y_train), np.min(test_predictions), np.min(y_test))
plt.plot([p1, p2], [p1, p2], 'black', label='1:1 line', linewidth=1)

plt.xlabel('True Life Expectancy')
plt.ylabel('Predicted Life Expectancy')
plt.title("Performance Scatter Plot for Life Expectancy")
plt.legend()
plt.grid(axis='both', linewidth=0.15)
plt.show()