In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the feature-engineered dataset
df = pd.read_csv("../EDA/COPD_feature_engineered.csv")

In [3]:
df.columns

Index(['Age', 'Gender', 'Income Level', 'Biomass Fuel', 'Tobacco Smoking',
       'Outdoor Air Poll', 'Occupational Expos', 'Family History',
       'Respiratory Infection', 'Health Insurance', 'COPD Diagnosis',
       'COPD Severity', 'Age_Smoking_Interaction',
       'Income_Biomass_Interaction', 'Location_Urban',
       'Education Level_Primary', 'Education Level_Secondary',
       'Occupation_Farmer', 'Occupation_Office', 'Occupation_Other',
       'Age Group_50-59', 'Age Group_60-69', 'Age Group_70-79'],
      dtype='object')

In [4]:
df_encoded = pd.get_dummies(df, drop_first=True)

In [5]:
df_encoded.columns

Index(['Age', 'Biomass Fuel', 'Tobacco Smoking', 'Outdoor Air Poll',
       'Occupational Expos', 'Family History', 'Respiratory Infection',
       'Age_Smoking_Interaction', 'Income_Biomass_Interaction',
       'Location_Urban', 'Education Level_Primary',
       'Education Level_Secondary', 'Occupation_Farmer', 'Occupation_Office',
       'Occupation_Other', 'Age Group_50-59', 'Age Group_60-69',
       'Age Group_70-79', 'Gender_Male', 'Income Level_Low',
       'Income Level_Middle', 'Health Insurance_Yes',
       'COPD Diagnosis_Undiagnosed', 'COPD Severity_Moderate',
       'COPD Severity_Severe'],
      dtype='object')

In [6]:
# Assuming df_encoded is your one-hot encoded DataFrame
X = df_encoded.drop(['COPD Diagnosis_Undiagnosed', 'COPD Severity_Moderate', 'COPD Severity_Severe'], axis=1)
y = df_encoded['COPD Diagnosis_Undiagnosed']


In [7]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Fit the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.61
Classification Report:
               precision    recall  f1-score   support

       False       0.00      0.00      0.00      3939
        True       0.61      1.00      0.75      6061

    accuracy                           0.61     10000
   macro avg       0.30      0.50      0.38     10000
weighted avg       0.37      0.61      0.46     10000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
from sklearn.model_selection import GridSearchCV

# Define the model and parameters
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000, 2000, 3000]
}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)

# Fit grid search
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)

Best parameters: {'C': 0.01, 'max_iter': 1000, 'solver': 'liblinear'}


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and fit the model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")

Random Forest Accuracy: 0.56


In [11]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-Validation Accuracy: {cv_scores.mean():.2f} +/- {cv_scores.std():.2f}")

Cross-Validation Accuracy: 0.60 +/- 0.00


In [12]:
import pickle

# Save the Logistic Regression model
with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the Random Forest model
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

print("Models saved successfully!")

Models saved successfully!
