In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from google.colab import files
# Load the dataset.
# The upload dialog is opened only when the CSV is not already on disk, so
# re-running this cell (or Restart & Run All on a runtime that still has the
# file) does not block on an interactive prompt. It also avoids re-uploading a
# file whose name already exists: Colab typically stores such a re-upload under
# a new name (e.g. "CVD_cleaned (1).csv"), which the fixed path below would not
# read.
DATA_FILE = "CVD_cleaned.csv"
uploaded = {}  # remains empty when the file was already present
try:
    data = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    # Interactive step: the user is expected to select DATA_FILE in the dialog.
    uploaded = files.upload()
    data = pd.read_csv(DATA_FILE)

Saving CVD_cleaned.csv to CVD_cleaned.csv


In [2]:
# Handle Age_Category ranges (convert to numeric midpoints)
age_map = {
    '18-24': 21,
    '25-29': 27,
    '30-34': 32,
    '35-39': 37,
    '40-44': 42,
    '45-49': 47,
    '50-54': 52,
    '55-59': 57,
    '60-64': 62,
    '65-69': 67,
    '70-74': 72,
    '75-79': 77,
    '80': 80,
    '80+': 81
}

# Apply mapping.
# Each label is looked up in age_map; values that are not keys pass through
# unchanged. This gives the same result as Series.replace(age_map) but avoids
# the pandas FutureWarning about silent downcasting that replace raises on an
# object column.
age_labels = data['Age_Category']
age_numeric = pd.to_numeric(
    age_labels.map(lambda label: age_map.get(label, label)),
    errors='coerce'
)

# A non-null label that is neither a key of age_map nor numeric is coerced to
# NaN above. Fail here with the offending labels instead of letting NaN flow
# silently into the feature matrix.
unmapped_labels = age_labels[age_labels.notna() & age_numeric.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unmapped Age_Category values: {sorted(map(str, unmapped_labels))}"
    )

# Ensure the column is numeric after replacement
data['Age_Category'] = age_numeric

## Encode categorical variables
from sklearn.preprocessing import LabelEncoder

# Each listed column is replaced by integer codes from its own LabelEncoder.
# NOTE(review): LabelEncoder assigns codes in sorted label order. That is fine
# for two-level columns, but General_Health is presumably a multi-level
# ordinal scale; if so its codes will not follow the health ordering. Confirm
# against the data and consider an explicit ordinal mapping.
binary_cols = ['General_Health', 'Exercise', 'Heart_Disease',
               'Skin_Cancer', 'Other_Cancer', 'Depression', 'Arthritis', 'Sex',
               'Smoking_History']
for col in binary_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

## One-hot encoding for Checkup and diabetes column
# drop_first=False keeps one indicator column per category; dtype=int makes the
# indicators 0/1 integers rather than booleans.
data = pd.get_dummies(
    data,
    columns=['Checkup', 'Diabetes'],
    drop_first=False,
    dtype=int
)

# Print Checkup one-hot columns only
print("\nCHECKUP COLUMNS")
print(data.filter(like="Checkup").head())

# Print Diabetes one-hot columns only
print("\nDIABETES COLUMNS")
print(data.filter(like="Diabetes").head())

# Print all other columns (exclude Checkup and Diabetes)
other_cols = data.drop(columns=data.filter(regex="Checkup|Diabetes").columns)

print("\nALL OTHER COLUMNS")
print(other_cols.head())

print("\nNew shape after encoding:", data.shape)

print("Preprocessing complete.")

  data['Age_Category'] = data['Age_Category'].replace(age_map)



CHECKUP COLUMNS
   Checkup_5 or more years ago  Checkup_Never  \
0                            0              0   
1                            0              0   
2                            0              0   
3                            0              0   
4                            0              0   

   Checkup_Within the past 2 years  Checkup_Within the past 5 years  \
0                                1                                0   
1                                0                                0   
2                                0                                0   
3                                0                                0   
4                                0                                0   

   Checkup_Within the past year  
0                             0  
1                             1  
2                             1  
3                             1  
4                             1  

DIABETES COLUMNS
   Diabetes_No  Diabetes_No, pre-diabet

In [3]:
# Step 3: separate the target column from the feature columns.
y = data['Heart_Disease']
X = data.drop('Heart_Disease', axis=1)

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=7,
)

print("Training samples:", len(X_train))
print("Validation samples:", len(X_val))

# Step 5: feature scaling.
# The scaler is fitted on the training partition only; the fitted scaler is
# then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

Training samples: 247083
Validation samples: 61771


In [4]:
## Initial model architecture
# Baseline: L2-penalised logistic regression, all other settings left at
# their defaults.
initial_model = LogisticRegression(penalty='l2', random_state=7)
initial_model.fit(X_train_scaled, y_train)

# Hard class predictions on the validation partition.
y_pred_initial = initial_model.predict(X_val_scaled)

# Score the predictions against the validation labels.
init_cm = confusion_matrix(y_val, y_pred_initial)
init_accuracy, init_precision, init_recall, init_f1 = (
    metric(y_val, y_pred_initial)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Emit the report in one write; the text matches line-by-line printing.
initial_report = [
    "\nInitial Logistic Regression Results",
    f"Accuracy: {init_accuracy:.4f}",
    f"Precision: {init_precision:.4f}",
    f"Recall: {init_recall:.4f}",
    f"F1 Score: {init_f1:.4f}",
    "Confusion Matrix:",
]
print("\n".join(initial_report))
print(init_cm)


Initial Logistic Regression Results
Accuracy: 0.9188
Precision: 0.4623
Recall: 0.0282
F1 Score: 0.0532
Confusion Matrix:
[[56613   164]
 [ 4853   141]]


In [5]:
## Final Model Architecture
# L2-penalised logistic regression with class_weight='balanced', which
# reweights the classes during fitting.
final_model = LogisticRegression(
    random_state = 7,
    penalty = 'l2',
    C = 1.0,
    class_weight = 'balanced'
)
final_model.fit(X_train_scaled, y_train)

# Predicted probability for the class at index 1 of final_model.classes_.
y_prob = final_model.predict_proba(X_val_scaled)[:, 1]

## threshold tuning
# Predictions come solely from thresholding y_prob: a sample is labelled 1
# when its probability is at least `threshold`. No separate predict() call is
# made, because its result would be overwritten here before being used.
# NOTE(review): the threshold is applied to, and scored on, the same validation
# split. If 0.40 was chosen by looking at these metrics they are optimistic;
# confirm on a held-out test set.
threshold = 0.40
y_pred_final = (y_prob >= threshold).astype(int)

# Score the thresholded predictions against the validation labels.
final_accuracy = accuracy_score(y_val, y_pred_final)
final_precision = precision_score(y_val, y_pred_final)
final_recall = recall_score(y_val, y_pred_final)
final_f1 = f1_score(y_val, y_pred_final)
final_cm = confusion_matrix(y_val, y_pred_final)

print("\nFinal Logistic Regression Results")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"F1 Score: {final_f1:.4f}")
print("Confusion Matrix:")
print(final_cm)




Final Logistic Regression Results
Accuracy: 0.6259
Precision: 0.1622
Recall: 0.8704
F1 Score: 0.2734
Confusion Matrix:
[[34317 22460]
 [  647  4347]]
