# Midterm Activity 3 Logistic Regression

### Binary Logistic Regression Model Development

#### Data Preparation and Exploration
Import necessary libraries, load the dataset, and perform initial checks

In [107]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv('heart_disease_prediction.csv')

print("Missing Values Before Handling:")
print(df.isnull().sum(), "\n")

# Show basic info
print("\n",df.info())
print(df.head(), "\n")

# Check the distribution of the target variable
print("Target variable distribution:")
print(df['tenYearCHD'].value_counts())

# Compute Degrees of Freedom
def compute_dof(df):
    return df.shape[0] - 1
print("\nDegrees of Freedom:", compute_dof(df))

# Descriptive statistics
print("\nDescriptive Statistics:")
df.describe()

Missing Values Before Handling:
gender               0
age                  0
educationLevel     105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
tenYearCHD           0
dtype: int64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gender           4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   educationLevel   4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes     

Unnamed: 0,gender,age,educationLevel,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,tenYearCHD
count,4238.0,4238.0,4133.0,4238.0,4209.0,4185.0,4238.0,4238.0,4238.0,4188.0,4238.0,4238.0,4219.0,4237.0,3850.0,4238.0
mean,0.429212,49.584946,1.97895,0.494101,9.003089,0.02963,0.005899,0.310524,0.02572,236.721585,132.352407,82.893464,25.802008,75.878924,81.966753,0.151958
std,0.495022,8.57216,1.019791,0.500024,11.920094,0.169584,0.076587,0.462763,0.158316,44.590334,22.038097,11.91085,4.080111,12.026596,23.959998,0.359023
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,89.875,28.04,83.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


#### Data Preprocessing

In [108]:
# Handle missing values
for col in df.columns:
    if df[col].dtype in ['int64', 'float64']:
        df[col] = df[col].fillna(df[col].median())

# Handle outliers using IQR
def handle_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] > upper_bound, upper_bound,
                      np.where(df[col] < lower_bound, lower_bound, df[col]))

# Apply to numerical columns
num_cols = ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
for col in num_cols:
    handle_outliers(df, col)

# Feature selection using p-values
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']
X_const = add_constant(X)
model = sm.Logit(y, X_const).fit()
print(model.summary())

# Keep features with p < 0.05
significant_features = model.pvalues[model.pvalues < 0.05].index.tolist()
if 'const' in significant_features:
    significant_features.remove('const')
X = X[significant_features]

# Check class imbalance
print("\nClass Distribution:")
print(y.value_counts(normalize=True))

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("\nClass Distribution After SMOTE:")
print(y_resampled.value_counts(normalize=True))

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)
X_scaled = scaler.transform(X_resampled)

Optimization terminated successfully.
         Current function value: 0.380106
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             tenYearCHD   No. Observations:                 4238
Model:                          Logit   Df Residuals:                     4222
Method:                           MLE   Df Model:                           15
Date:                Fri, 20 Jun 2025   Pseudo R-squ.:                  0.1079
Time:                        22:22:14   Log-Likelihood:                -1610.9
converged:                       True   LL-Null:                       -1805.8
Covariance Type:            nonrobust   LLR p-value:                 9.731e-74
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -8.0138      0.730    -10.981      0.000      -9.444      -6.583
gender        

#### Model Development and Evaluation

In [110]:
# Data Splitting
unseen, seen = train_test_split(df, test_size=0.9, random_state=42)
X_seen = seen[significant_features]
y_seen = seen['tenYearCHD']
X_train, X_test, y_train, y_test = train_test_split(X_seen, y_seen, test_size=0.2, random_state=42)


model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


# Model Coefficients
print("\nModel Coefficients:")
for feature, coef in zip(significant_features, model.coef_[0]):
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {model.intercept_[0]:.4f}")

              precision    recall  f1-score   support

           0       0.85      0.99      0.92       640
           1       0.71      0.10      0.17       123

    accuracy                           0.85       763
   macro avg       0.78      0.54      0.54       763
weighted avg       0.83      0.85      0.80       763


Model Coefficients:
gender: 0.5136
age: 0.0633
cigsPerDay: 0.0186
prevalentStroke: 0.7992
diabetes: 0.6998
sysBP: 0.0190
Intercept: -8.0232
