# Split the data into training and testing

In [65]:
import pandas as  pd
from sklearn.model_selection import train_test_split

In [66]:
path = r'/Users/roshanthapa/Desktop/Omdena-Capacity_Building/copd/COPD_Prediction/EDA/engineered_COPD_DATA.csv'
df = pd.read_csv(path)

In [67]:
df.head()

Unnamed: 0,Age,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Smoking_Status_Encoding,Gender_Encoding
0,31,1,1,1,27.56,84,0,0,0.5,1
1,60,1,0,0,30.3,131,1,0,0.0,1
2,33,0,0,1,28.45,123,1,0,0.5,1
3,36,1,0,0,27.49,253,0,1,1.0,0
4,58,0,0,0,25.49,117,1,0,0.0,1


In [68]:
# Define the features and the target variable
X = df.drop(columns=['COPD_Diagnosis'])
y = df['COPD_Diagnosis']

In [69]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training

- Based on the data, it is a binary classification problem set as we are predicting if one has COPD or not, hence we will use the following models:
    - Logistic Regression
    - Decision Trees
    - Random Forest

In [70]:
# Import the neccesaray libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle

In [71]:
# Initialize the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

In [72]:
# Train the models
for name, model in models.items():
    model.fit(X_train, y_train)

    # Save the models
    with open(f'{name.replace(" ", "_")}.pkl', 'wb') as file:
        pickle.dump(model, file)
    print((f'{name} trained & saved successfully!'))

Logistic Regression trained & saved successfully!
Decision Tree trained & saved successfully!
Random Forest trained & saved successfully!


# Evaluate the models
- Accuracy, Precision, Recall, F1_Score

In [73]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f'\n{name} Evaluation:')
    print(classification_report(y_test, y_pred))
    print('\n')


Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       134
           1       0.86      0.94      0.90        66

    accuracy                           0.93       200
   macro avg       0.91      0.93      0.92       200
weighted avg       0.93      0.93      0.93       200




Decision Tree Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200




Random Forest Evaluation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00    

# Model Refinement

In [74]:
from sklearn.model_selection import GridSearchCV

In [75]:
# Define the parameters for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

In [76]:
# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy') 

In [77]:
# Fit the GridSearhCV
grid_search.fit(X_train, y_train)

In [78]:
# Best parameters
print(f'Best parameters: {grid_search.best_params_}')

Best parameters: {'max_depth': None, 'n_estimators': 100}


In [79]:
best_model = grid_search.best_estimator_

In [80]:
with open('Best_Random_Forest.pkl', 'wb') as file:
    pickle.dump(best_model, file)
    
print('Best Random Forest model trained & saved successfully!')

Best Random Forest model trained & saved successfully!


In [81]:
df

Unnamed: 0,Age,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Smoking_Status_Encoding,Gender_Encoding
0,31,1,1,1,27.56,84,0,0,0.5,1
1,60,1,0,0,30.30,131,1,0,0.0,1
2,33,0,0,1,28.45,123,1,0,0.5,1
3,36,1,0,0,27.49,253,0,1,1.0,0
4,58,0,0,0,25.49,117,1,0,0.0,1
...,...,...,...,...,...,...,...,...,...,...
995,62,1,1,1,20.11,127,0,1,1.0,1
996,52,0,1,0,27.00,118,0,0,0.0,1
997,40,0,0,1,33.98,134,1,1,1.0,0
998,76,0,1,1,22.27,281,0,0,0.5,0
