In [1]:
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load dataset
data = pd.read_csv('framingham.csv')

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB


In [3]:
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# Check for missing values
data.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [None]:
# Replace missiing values with mean values
data.fillna(data.mean(), inplace=True)
# Define the outcomes
outcomes = ['TenYearCHD', 'prevalentStroke', 'prevalentHyp', 'diabetes']
# Save the best prediction models in a dictionary
best_models = {}

In [2]:
# Iterate over list of outcomes
for outcome in outcomes:
    print(f"Predicting {outcome}...")
    # Split the X features and Y targets
    X = data.drop(columns=[outcome])
    y = data[outcome]
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Define a grid of hyperparameters to search over during hyperparameter tuning.
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    # Perform hyperparameter tuning using grid search cross-validation
    # Search for the best combination of hyperparameters from the parameter grid and evaluate models based on accuracy.

    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')

    grid_search.fit(X_train, y_train)
        # Select and evaluate the best model
    best_estimator = grid_search.best_estimator_
    
    y_pred_best = best_estimator.predict(X_test)

    accuracy_best = accuracy_score(y_test, y_pred_best)
    classification_rep_best = classification_report(y_test, y_pred_best)
    # Store the model
    best_models[outcome] = best_estimator
    # Print results
    print("Best Accuracy:", accuracy_best)
    print("Tuned Classification Report:\n", classification_rep_best)

Predicting TenYearCHD...
Best Accuracy: 0.8502358490566038
Tuned Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.99      0.92       725
           1       0.38      0.05      0.09       123

    accuracy                           0.85       848
   macro avg       0.62      0.52      0.50       848
weighted avg       0.79      0.85      0.80       848

Predicting prevalentStroke...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Accuracy: 0.9929245283018868
Tuned Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       842
           1       0.00      0.00      0.00         6

    accuracy                           0.99       848
   macro avg       0.50      0.50      0.50       848
weighted avg       0.99      0.99      0.99       848

Predicting prevalentHyp...
Best Accuracy: 0.902122641509434
Tuned Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.91      0.93       595
           1       0.81      0.88      0.84       253

    accuracy                           0.90       848
   macro avg       0.88      0.90      0.89       848
weighted avg       0.91      0.90      0.90       848

Predicting diabetes...
Best Accuracy: 0.9893867924528302
Tuned Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       

In [3]:
from joblib import dump, load
# Iterate over the best_models dictionary
for outcome, model in best_models.items():
    # Save the model in a .joblib file
    model_filename = f"best_model_{outcome}.joblib"
    dump(model, model_filename)
    print(f"Best model for {outcome} saved as {model_filename}")
# Load model
loaded_model = load("best_model_TenYearCHD.joblib")
# Sample data for making predictions
new_data = pd.DataFrame({
    'male': [1], 'age': [45], 'education': [1], 'currentSmoker': [1], 'cigsPerDay': [20],
    'BPMeds': [0], 'prevalentStroke': [0], 'prevalentHyp': [0], 'diabetes': [0],
    'totChol': [210], 'sysBP': [120], 'diaBP': [80], 'BMI': [25], 'heartRate': [75],
    'glucose': [80]
})
# Making a prediction
predictions = loaded_model.predict(new_data)
# Displaying the predictions
print(predictions)


Best model for TenYearCHD saved as best_model_TenYearCHD.joblib
Best model for prevalentStroke saved as best_model_prevalentStroke.joblib
Best model for prevalentHyp saved as best_model_prevalentHyp.joblib
Best model for diabetes saved as best_model_diabetes.joblib
[0]
