In [1]:
# Import Library
import pandas as pd

In [2]:
# Read all the csv Data
data = pd.read_csv('./COPD_Data_Engineered.csv')

In [3]:
# Show the first few rows
data.head()

Unnamed: 0,Age,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Gender_Male,Smoking_Status_Former,...,Location_Kathmandu,Location_Lalitpur,Location_Nepalgunj,Location_Pokhara,Gender_Smoking,Air_Pollution_Level_log,BMI_log,Age_Bin_Adult,Age_Bin_Middle_Aged,Age_Bin_Elderly
0,-1.602656,1.012073,0.966559,0.992032,0.182927,-0.784912,-0.972381,0,True,True,...,False,True,False,False,False,-1.536706,0.167992,False,False,False
1,0.331228,1.012073,-1.034598,-1.008032,0.754353,-0.077974,1.028403,0,True,False,...,False,False,False,True,False,-0.081182,0.5621,False,False,False
2,-1.469285,-0.988071,-1.034598,0.992032,0.368536,-0.198304,1.028403,0,True,True,...,False,False,False,True,False,-0.221025,0.313742,False,False,False
3,-1.269228,1.012073,-1.034598,-1.008032,0.168329,1.757056,-0.972381,1,False,False,...,True,False,False,False,False,1.014164,0.155574,False,False,False
4,0.197856,-0.988071,-1.034598,-1.008032,-0.24877,-0.288551,1.028403,0,True,False,...,False,False,False,True,False,-0.340451,-0.286044,False,False,False


In [4]:
# find missing values
data.isnull().sum()

Age                                   0
Biomass_Fuel_Exposure                 0
Occupational_Exposure                 0
Family_History_COPD                   0
BMI                                   0
Air_Pollution_Level                   0
Respiratory_Infections_Childhood      0
COPD_Diagnosis                        0
Gender_Male                           0
Smoking_Status_Former                 0
Smoking_Status_Never                  0
Location_Biratnagar                   0
Location_Butwal                       0
Location_Chitwan                      0
Location_Dharan                       0
Location_Hetauda                      0
Location_Kathmandu                    0
Location_Lalitpur                     0
Location_Nepalgunj                    0
Location_Pokhara                      0
Gender_Smoking                        0
Air_Pollution_Level_log             149
BMI_log                             201
Age_Bin_Adult                         0
Age_Bin_Middle_Aged                   0


In [5]:
data_cleaned = data.dropna()

In [6]:
# Splitting the data into features (X) and target (y)
X = data_cleaned.drop(columns='COPD_Diagnosis')
y = data_cleaned['COPD_Diagnosis']

In [7]:
# Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle

In [9]:
# Innitialize the models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier()
}

In [10]:
# Train the models
for name, model in models.items():
    model.fit(X_train, y_train)

    # Save teh models
    with open(f'{name.replace(" ", "_")}.pkl', 'wb') as f:
        pickle.dump(model, f)
    print(f"{name} model trained and saved.")

print("Model training completed")

Logistic Regression model trained and saved.
Decision Tree model trained and saved.
Random Forest model trained and saved.
Model training completed


In [11]:
# Evaluate the Models
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} Evaluation: ")
    print(classification_report(y_test, y_pred))


Logistic Regression Evaluation: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        93
           1       0.98      0.98      0.98        43

    accuracy                           0.99       136
   macro avg       0.98      0.98      0.98       136
weighted avg       0.99      0.99      0.99       136


Decision Tree Evaluation: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        93
           1       0.98      0.98      0.98        43

    accuracy                           0.99       136
   macro avg       0.98      0.98      0.98       136
weighted avg       0.99      0.99      0.99       136


Random Forest Evaluation: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        93
           1       1.00      1.00      1.00        43

    accuracy                           1.00       136
   macro avg       1.00      1.00     

# Model Refinement

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Define the parameter grid for Random forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

In [14]:
# Innitiliaze the GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid= param_grid, cv = 5, n_jobs = -1, scoring='accuracy')

In [15]:
# Fit the gridSeach cv
grid_search.fit(X_train, y_train)

In [16]:
# Best parameters
print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}


In [17]:
# Save the best model
with open('Best_Random_Forest_Model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("Model refinement completed and best model saved")

Model refinement completed and best model saved
