In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
# Read the feature-engineered CSV into a DataFrame and preview its first rows
path = "../EDA/COPD_feature_engineered.csv"
data = pd.read_csv(filepath_or_buffer=path)
data.head(n=5)

Unnamed: 0,Diseases & Risk Factors,Country Name,Gender,Numeric,Confidence_Range,Year_Bin,Numeric_log,Gender_Disease,Numeric_scaled
0,Noncommunicable diseases,Nepal,Males,19.28516,16.59685,1990s,3.00989,Males_Noncommunicable diseases,-0.340427
1,Noncommunicable diseases,Nepal,Females,19.23258,17.56212,1990s,3.007294,Females_Noncommunicable diseases,-0.340432
2,Noncommunicable diseases,Nepal,Males,18.83932,16.34251,2000s,2.987666,Males_Noncommunicable diseases,-0.340469
3,Noncommunicable diseases,Nepal,Females,18.76374,17.01147,2000s,2.983849,Females_Noncommunicable diseases,-0.340476
4,Noncommunicable diseases,Nepal,Males,17.34758,15.46373,2000s,2.909498,Males_Noncommunicable diseases,-0.340607


In [3]:
# Remove columns that must not reach the model as features.
# 'Unnamed: 0' is the name pandas assigns to a header-less CSV column (typically
# a row index written out by an earlier to_csv); the preview above lists it, and
# if it is really present it would otherwise be fed to the models as a feature.
# NOTE(review): confirm against the CSV — this entry is a no-op when the column is absent.
columns_to_drop = [
    'Unnamed: 0',
    'Confidence_Range',
    'Gender_Disease',
    'Numeric_log',      # preview shows this equals log(1 + Numeric), i.e. a transform of the target
    'Numeric_scaled',
]
# errors='ignore' skips any listed column that does not exist, so the cell
# stays safe to re-run after the columns have already been removed.
data = data.drop(columns=columns_to_drop, errors='ignore')

In [4]:
# One-hot encode every categorical column ('Country Name' included).
# drop_first=True leaves out the first level of each column.
categorical_columns = ['Diseases & Risk Factors', 'Gender', 'Year_Bin', 'Country Name']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

In [5]:
# 'Numeric' is the prediction target; every remaining column is a feature
y = data['Numeric']
X = data.drop('Numeric', axis=1)

In [6]:
# Print each feature's dtype so any non-numeric column can be spotted
print(X.dtypes)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Diseases & Risk Factors_Cancer                          bool
Diseases & Risk Factors_Cardiovascular diseases         bool
Diseases & Risk Factors_Chronic respiratory diseases    bool
Diseases & Risk Factors_Diabetes                        bool
Diseases & Risk Factors_Harmful Alcohol Use             bool
Diseases & Risk Factors_Noncommunicable diseases        bool
Diseases & Risk Factors_Obesity/unhealthy diet          bool
Diseases & Risk Factors_Physical inactivity             bool
Diseases & Risk Factors_Tobacco Use                     bool
Gender_Males                                            bool
Year_Bin_2000s                                          bool
Year_Bin_2010s                                          bool
dtype: object


In [7]:
# Candidate regressors, keyed by the display name that is used both in the
# printed log lines and (with spaces replaced by underscores) in the pickle
# filename.
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Linear Regression': LinearRegression(),
    'Support Vector Regression': SVR(),
    # Seeded like the random forest and the train/test split: without a
    # random_state the boosted trees can differ between runs, so the reported
    # MSE and the saved model would not be reproducible.
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Fit each model on the training split, score it on the held-out split,
# and persist the fitted estimator to the current working directory.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name} Mean Squared Error: {mse}")

    # Save the fitted model as '<Name_With_Underscores>.pkl'.
    # Pickle files execute code when loaded — only unpickle files you trust.
    with open(f'{name.replace(" ", "_")}.pkl', 'wb') as f:
        pickle.dump(model, f)
    print(f"{name} model trained and saved.")
print("Model training completed")

Random Forest Mean Squared Error: 85993154.05609915
Random Forest model trained and saved.
Linear Regression Mean Squared Error: 76967607.43116347
Linear Regression model trained and saved.
Support Vector Regression Mean Squared Error: 107237484.10158174
Support Vector Regression model trained and saved.
Gradient Boosting Mean Squared Error: 80659765.35430664
Gradient Boosting model trained and saved.
Model training completed
