In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load the Framingham dataset from the working directory.
df = pd.read_csv('framingham.csv')

# Show the column names followed by each column's dtype.
print(df.columns, df.dtypes, sep='\n')


Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')
male                 int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object


In [3]:
# Per-column missing-value counts before imputation. Only the last expression
# of a cell is displayed, so the counts are kept in a variable rather than
# being computed and thrown away.
missing_before = df.isnull().sum()

# Replace missing values with each column's median. Rebinding `df` (instead of
# `inplace=True`) leaves the previously loaded object untouched, and
# `numeric_only=True` keeps the median well-defined should a non-numeric
# column ever be present.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split performed later in the notebook, so held-out rows
# influence the imputed values.
df = df.fillna(df.median(numeric_only=True))

# Per-column missing-value counts after imputation (displayed as cell output).
df.isnull().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Separate the target column from the predictor columns.
y = df['TenYearCHD']
X = df.drop(columns=['TenYearCHD'])

In [6]:

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)
y_train

1767    0
2947    0
888     0
2026    0
2696    0
       ..
3518    0
3040    0
2561    0
3824    0
2273    0
Name: TenYearCHD, Length: 3390, dtype: int64

In [8]:
# Fit the scaler on the training features only, then apply the same
# transformation to both partitions.
# NOTE(review): X_train / X_test are overwritten with their scaled versions,
# so running this cell a second time re-fits the scaler on already-scaled data.
sc = StandardScaler().fit(X_train)

X_test = sc.transform(X_test)
X_train = sc.transform(X_train)
print(X_train)

[[-0.87237547  0.16042193  0.02064907 ... -0.24406401 -0.49272528
   0.06054382]
 [-0.87237547  1.09443712  0.02064907 ...  0.16146534  0.17311306
   0.2779168 ]
 [ 1.14629541  0.86093332 -0.96527151 ...  0.52571325 -1.07533383
  -0.54810051]
 ...
 [-0.87237547 -0.54008946  1.99249024 ... -0.35576671  0.17311306
   0.10401842]
 [-0.87237547 -1.24060085  1.00656966 ... -0.59617033 -0.99210403
   0.10401842]
 [-0.87237547 -0.54008946 -0.96527151 ...  0.54513981 -0.15980611
  -0.15682915]]


In [9]:
# Fit a logistic regression with default settings on the scaled training data
# and predict the held-out labels.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report overall accuracy followed by the per-class precision/recall table.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       719
           1       0.50      0.07      0.12       129

    accuracy                           0.85       848
   macro avg       0.68      0.53      0.52       848
weighted avg       0.80      0.85      0.80       848



In [14]:
import statsmodels.api as sm
import numpy as np

# The scaled training matrix carries no column labels, which makes the
# regression summary list coefficients as x1, x2, ... Re-attach the original
# feature names (and the training row index, so it aligns with y_train) before
# adding the intercept column.
X_train_sm = sm.add_constant(
    pd.DataFrame(X_train, columns=X.columns, index=y_train.index)
)

# Fit the logistic regression by maximum likelihood.
logit_model = sm.Logit(y_train, X_train_sm).fit()

print(logit_model.summary())

# Log-likelihood of the fitted model and of the intercept-only model.
llf = logit_model.llf
llnull = logit_model.llnull

# McFadden's pseudo R-squared: 1 - (fitted log-likelihood / null log-likelihood).
mcfadden_r2 = 1 - (llf / llnull)
print(f"McFadden's R²: {mcfadden_r2:.4f}")


Optimization terminated successfully.
         Current function value: 0.374228
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             TenYearCHD   No. Observations:                 3390
Model:                          Logit   Df Residuals:                     3374
Method:                           MLE   Df Model:                           15
Date:                Thu, 27 Mar 2025   Pseudo R-squ.:                  0.1216
Time:                        21:15:25   Log-Likelihood:                -1268.6
converged:                       True   LL-Null:                       -1444.2
Covariance Type:            nonrobust   LLR p-value:                 1.205e-65
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0012      0.060    -33.631      0.000      -2.118      -1.885
x1             0.2264      0.