# COMP-5011 Course Project

## Heart Failure Prediction Using Machine Learning: A Binary Classification Approach


**Sainzolboo Anujin** - 1311002

**Syed Abdul Rahman** – 1260544

**Tao Xue** - 1316845


In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu
from scipy.stats import mannwhitneyu

In [32]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [49]:
# Basic info
print(f"\nTotal Records: {len(df)}")
print(f"Total Features: {len(df.columns)} (11 features + 1 target)")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

# Missing values
print("Missing values")
missing_values = pd.DataFrame({
    'missing count': df.isnull().sum(),
    'percent': (df.isnull().sum() / len(df) * 100).round(2)
})
display(missing_values)

# Zero values
print("Zero values")
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
zero_values = pd.DataFrame({
    'Zero Count': (df[numerical_cols] == 0).sum(),
    'Percentage (%)': ((df[numerical_cols] == 0).sum() / len(df) * 100).round(2)
})
display(zero_values)

# Target variable
target_counts = df['HeartDisease'].value_counts().sort_index()
target_percent = (df['HeartDisease'].value_counts(normalize=True).sort_index() * 100).round(2)
target_df = pd.DataFrame({
    'Count': target_counts,
    'Percentage (%)': target_percent
})
target_df.index = ['No Heart Disease (0)', 'Heart Disease (1)']
display(target_df)
print(f"\nClass Imbalance Ratio: {target_counts[1]/target_counts[0]:.2f}:1 (Disease:Normal)")


Total Records: 918
Total Features: 12 (11 features + 1 target)
Memory Usage: 317.21 KB
Missing values


Unnamed: 0,missing count,percent
Age,0,0.0
Sex,0,0.0
ChestPainType,0,0.0
RestingBP,0,0.0
Cholesterol,0,0.0
FastingBS,0,0.0
RestingECG,0,0.0
MaxHR,0,0.0
ExerciseAngina,0,0.0
Oldpeak,0,0.0


Zero values


Unnamed: 0,Zero Count,Percentage (%)
Age,0,0.0
RestingBP,1,0.11
Cholesterol,172,18.74
MaxHR,0,0.0
Oldpeak,368,40.09


Unnamed: 0,Count,Percentage (%)
No Heart Disease (0),410,44.66
Heart Disease (1),508,55.34



Class Imbalance Ratio: 1.24:1 (Disease:Normal)


In [61]:
# Categorical values
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'FastingBS']

for feature in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
    value_counts = df[feature].value_counts()
    value_percent = (df[feature].value_counts(normalize=True) * 100).round(2)
    feat_df = pd.DataFrame({
        'count': value_counts,
        'percent': value_percent
    })
    display(feat_df)

for feature in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
    contingency_table = pd.crosstab(df[feature], df['HeartDisease'])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    
    # Disease prevalence by category
    disease_percent = (contingency_table[1] / (contingency_table[0] + contingency_table[1]) * 100).round(2)
    
    print(f"\n{feature}:")
    print(f"Chi-square: {chi2:.4f}, p-value: {p_value:.4f} {'***' if p_value < 0.001 else '**' if p_value < 0.01 else '*' if p_value < 0.05 else 'ns'}")
    print(f"Disease prevalence by category:")
    for cat, percent in disease_percent.items():
        count_disease = contingency_table.loc[cat, 1]
        count_total = contingency_table.loc[cat].sum()
        print(f"    {cat}: {percent:.1f}% ({count_disease}/{count_total})")

Unnamed: 0_level_0,count,percent
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
M,725,78.98
F,193,21.02


Unnamed: 0_level_0,count,percent
ChestPainType,Unnamed: 1_level_1,Unnamed: 2_level_1
ASY,496,54.03
NAP,203,22.11
ATA,173,18.85
TA,46,5.01


Unnamed: 0_level_0,count,percent
RestingECG,Unnamed: 1_level_1,Unnamed: 2_level_1
Normal,552,60.13
LVH,188,20.48
ST,178,19.39


Unnamed: 0_level_0,count,percent
ExerciseAngina,Unnamed: 1_level_1,Unnamed: 2_level_1
N,547,59.59
Y,371,40.41


Unnamed: 0_level_0,count,percent
ST_Slope,Unnamed: 1_level_1,Unnamed: 2_level_1
Flat,460,50.11
Up,395,43.03
Down,63,6.86



Sex:
Chi-square: 84.1451, p-value: 0.0000 ***
Disease prevalence by category:
    F: 25.9% (50/193)
    M: 63.2% (458/725)

ChestPainType:
Chi-square: 268.0672, p-value: 0.0000 ***
Disease prevalence by category:
    ASY: 79.0% (392/496)
    ATA: 13.9% (24/173)
    NAP: 35.5% (72/203)
    TA: 43.5% (20/46)

RestingECG:
Chi-square: 10.9315, p-value: 0.0042 **
Disease prevalence by category:
    LVH: 56.4% (106/188)
    Normal: 51.6% (285/552)
    ST: 65.7% (117/178)

ExerciseAngina:
Chi-square: 222.2594, p-value: 0.0000 ***
Disease prevalence by category:
    N: 35.1% (192/547)
    Y: 85.2% (316/371)

ST_Slope:
Chi-square: 355.9184, p-value: 0.0000 ***
Disease prevalence by category:
    Down: 77.8% (49/63)
    Flat: 82.8% (381/460)
    Up: 19.8% (78/395)


In [None]:
# Handle Missing Data
chol_median = df.loc[df['Cholesterol'] != 0, 'Cholesterol'].median()
bp_median = df.loc[df['RestingBP'] != 0, 'RestingBP'].median()

df.loc[df['Cholesterol'] == 0, 'Cholesterol'] = chol_median
df.loc[df['RestingBP'] == 0, 'RestingBP'] = bp_median

df.to_csv('heart_cleaned.csv', index=False)
