In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score

In [18]:
# Load the life-expectancy table from the notebook's working directory.
csv_path = "LifeExpCleaned_Final.csv"
df = pd.read_csv(csv_path)

In [19]:
def clean_data(df):
    """Return a copy of ``df`` with ``status`` encoded as an ``int8`` flag.

    Every occurrence of the text "Developing" in ``status`` is replaced by
    "0" and every occurrence of "Developed" by "1" (literal, case-insensitive
    matching), after which the column is cast to ``int8``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string-valued ``status`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order; only ``status``
        differs. The frame passed in is left untouched, so callers do not
        need to pass a defensive copy.

    Raises
    ------
    Whatever ``astype`` raises (typically ``ValueError``) when a ``status``
    value is not an integer literal after the two replacements.
    """
    # Build the encoded column on the side instead of assigning into ``df``,
    # which would silently modify the caller's frame.
    encoded_status = (
        df['status']
        .str.replace("Developing", "0", case=False, regex=False)
        .str.replace("Developed", "1", case=False, regex=False)
    )
    # ``assign`` returns a new frame and keeps ``status`` in its original position.
    return df.assign(status=encoded_status).astype({'status': 'int8'})

# Clean a copy so the frame loaded from disk stays available unchanged,
# then preview the first five rows of the result.
raw_copy = df.copy()
df_clean = clean_data(raw_copy)
df_clean.head(5)

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,measles,bmi,under-five_deaths,polio,diphtheria,hiv_aids,schooling
0,Afghanistan,2015,0,65.0,263.0,62,0.01,71.279624,1154,19.1,83,6.0,65.0,0.1,10.1
1,Afghanistan,2014,0,59.9,271.0,64,0.01,73.523582,492,18.6,86,58.0,62.0,0.1,10.0
2,Afghanistan,2013,0,59.9,268.0,66,0.01,73.219243,430,18.1,89,62.0,64.0,0.1,9.9
3,Afghanistan,2012,0,59.5,272.0,69,0.01,78.184215,2787,17.6,93,67.0,67.0,0.1,9.8
4,Afghanistan,2011,0,59.2,275.0,71,0.01,7.097109,3013,17.2,97,68.0,68.0,0.1,9.5


In [26]:
# From here on ``df`` refers to the cleaned frame (same object as ``df_clean``);
# the bare expression on the last line makes the notebook display it.
df = df_clean
df

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,measles,bmi,under-five_deaths,polio,diphtheria,hiv_aids,schooling
0,Afghanistan,2015,0,65.0,263.0,62,0.01,71.279624,1154,19.1,83,6.0,65.0,0.1,10.1
1,Afghanistan,2014,0,59.9,271.0,64,0.01,73.523582,492,18.6,86,58.0,62.0,0.1,10.0
2,Afghanistan,2013,0,59.9,268.0,66,0.01,73.219243,430,18.1,89,62.0,64.0,0.1,9.9
3,Afghanistan,2012,0,59.5,272.0,69,0.01,78.184215,2787,17.6,93,67.0,67.0,0.1,9.8
4,Afghanistan,2011,0,59.2,275.0,71,0.01,7.097109,3013,17.2,97,68.0,68.0,0.1,9.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2522,Zimbabwe,2004,0,44.3,723.0,27,4.36,0.000000,31,27.1,42,67.0,65.0,33.6,9.2
2523,Zimbabwe,2003,0,44.5,715.0,26,4.06,0.000000,998,26.7,41,7.0,68.0,36.7,9.5
2524,Zimbabwe,2002,0,44.8,73.0,25,4.43,0.000000,304,26.3,40,73.0,71.0,39.8,10.0
2525,Zimbabwe,2001,0,45.3,686.0,25,1.72,0.000000,529,25.9,39,76.0,75.0,42.1,9.8


In [27]:
# Remove the identifier columns that are not used as model inputs.
# Derive the result from ``df_clean`` rather than from ``df`` itself: the
# self-referential form fails with a KeyError when the cell is run a second
# time, because the columns have already been dropped.
df = df_clean.drop(columns=["country", "year"])

In [28]:
# Separate the regression target from the predictor columns.
target_col = "life_expectancy"
y = df[target_col]
x = df.drop(columns=[target_col])

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): this is a purely row-wise random split and the table holds
# several yearly rows per country, so the same country can end up in both
# splits. Consider a grouped split if generalisation to unseen countries is
# what the score should measure.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=42, test_size=0.2)

In [30]:
# Random-forest regressor: 200 trees with no depth limit, a fixed seed for
# repeatable fits, and n_jobs=-1 to train on all available cores.
rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=200,
    max_depth=None,
)

In [31]:
# Train on the training split. ``fit`` returns the estimator, so leaving it as
# the cell's last expression renders the fitted model's parameter summary.
rf.fit(X=xtrain, y=ytrain)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
# Predict life expectancy for the held-out rows.
ypred = rf.predict(X=xtest)

In [33]:
# Score the held-out predictions against the true targets.
r2 = r2_score(ytest, ypred)
mse = mean_squared_error(ytest, ypred)
rmse = mse ** 0.5  # square root of the MSE, i.e. error in the target's own units

# One entry per output line, printed in order.
report_lines = (
    "Random Forest Results:",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.2f}",
)
print(*report_lines, sep="\n")

Random Forest Results:
RMSE: 1.94
R² Score: 0.96


In [37]:
# Display the column labels left after dropping "country" and "year"
# (the target "life_expectancy" is still part of ``df`` here).
df.columns

Index(['status', 'life_expectancy', 'adult_mortality', 'infant_deaths',
       'alcohol', 'percentage_expenditure', 'measles', 'bmi',
       'under-five_deaths', 'polio', 'diphtheria', 'hiv_aids', 'schooling'],
      dtype='object')

In [45]:
# Feature dictionary template for prediction: one value per model input.
# NOTE(review): the unit descriptions in the comments below are the original
# author's and have not been checked against the dataset documentation;
# confirm them before relying on the predicted value.
sample = {
    "status": 0,                    # 0 = Developing, 1 = Developed
    "adult_mortality": 130,         # Example: deaths per 1000 adults
    "infant_deaths": 50,            # Number of infant deaths per 1000
    "alcohol": 0.08,                # Alcohol consumption (liters per capita)
    "percentage_expenditure": 2.9,  # Health expenditure (% of GDP)
    "measles": 84,                  # Measles cases per 1000 population
    "bmi": 20.6,                    # Average Body Mass Index
    "under-five_deaths": 58.5,      # Deaths under age 5 per 1000
    "polio": 61,                    # % of children immunized for polio
    "diphtheria": 87,               # % of children immunized for diphtheria
    "hiv_aids": 0.04,               # Deaths due to HIV/AIDS (per 1000)
    "schooling": 5,                 # Average years of schooling
}

# Build a single-row frame and put its columns in exactly the order used for
# training. This makes the prediction independent of the dict's key order and
# turns a missing feature into an immediate KeyError instead of a confusing
# failure inside the model.
X_new = pd.DataFrame([sample])[list(xtrain.columns)]
y_pred = rf.predict(X_new)
print("Predicted Life Expectancy:", y_pred[0])


Predicted Life Expectancy: 67.44349999999999
