In [1]:
import altair as alt
from deepchecks.tabular.checks import FeatureLabelCorrelation, FeatureFeatureCorrelation
from deepchecks.tabular import Dataset
import numpy as np
import pandas as pd
import pandera as pa
import requests
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
import warnings
import zipfile

warnings.filterwarnings("ignore", category=FutureWarning, module="deepchecks")


pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.



In [2]:
colnames = [
    'patient_id', 
    'age', 
    'gender', 
    'chest_pain', 
    'resting_bp',
    'serum_cholesterol', 
    'fasting_blood_sugar', 
    'resting_electro',
    'max_heart_rate', 
    'exercise_angia', 
    'old_peak', 
    'slope', 
    'num_major_vessels',
    'target'
]

heart = pd.read_csv("../data/raw/Cardiovascular_Disease_Dataset/Cardiovascular_Disease_Dataset.csv", names=colnames, header=0)

In [3]:
# Change values of 1 and 0 to 'Heart Disease' and 'No Heart Disease' in target
heart['target'] = heart['target'].replace({
    1 : 'Heart Disease',
    0 : 'No Heart Disease'
})

# Create train test split
train_heart, test_heart = train_test_split(heart, test_size = 0.2,random_state=123)

## uncomment in final ipynb
# train_heart.to_csv("./data/processed/train_heart.csv") # changed heart_train to train_heart.csv for consistency
# test_heart.to_csv("./data/processed/test_heart.csv") # changed heart_test to test_heart.csv for consistency

In [4]:
heart

Unnamed: 0,patient_id,age,gender,chest_pain,resting_bp,serum_cholesterol,fasting_blood_sugar,resting_electro,max_heart_rate,exercise_angia,old_peak,slope,num_major_vessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,Heart Disease
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,No Heart Disease
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,No Heart Disease
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,Heart Disease
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,Heart Disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,9949544,48,1,2,139,349,0,2,183,1,5.6,2,2,Heart Disease
996,9953423,47,1,3,143,258,1,1,98,1,5.7,1,0,No Heart Disease
997,9965859,69,1,0,156,434,1,0,196,0,1.4,3,1,Heart Disease
998,9988507,45,1,1,186,417,0,1,117,1,5.9,3,2,Heart Disease


In [5]:
# data validation code chunk from Mantram

In [5]:
# replaced 'label' with target

# Compute counts and percentages grouped by target
counts = train_heart.groupby('target').size().reset_index(name='count')

# Bar chart of counts
bar = alt.Chart(counts).mark_bar(stroke='black', strokeWidth=1).encode(
    x=alt.X('target:N', title='Heart Disease', sort=['No Heart Disease', 'Heart Disease']),
    y=alt.Y('count:Q', title='Count'),
    color=alt.Color(
        'target:N',
        title='Heart Disease',
        sort=['No Heart Disease', 'Heart Disease']
    ),
    tooltip=[
        alt.Tooltip('count:Q', title='Count')
    ]
).properties(
    title='Cases of Heart Disease',
    width=300,
    height=300
)

bar_labels = bar.mark_text(
    dy=-5,
    size=14,
).encode(
    text='count:Q'
)

bar_final = bar + bar_labels
bar_final

In [6]:
# got rid of label since target col is heart no heart now
# updated col names
num_cols = ['age', 'resting_bp', 'serum_cholesterol', 'max_heart_rate', 'old_peak']

charts = []

for col in num_cols:
    chart = alt.Chart(train_heart).mark_bar().encode(
        x=alt.X(f'{col}:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', title='Count'),
        tooltip=[alt.Tooltip(f'{col}:Q', title=col), alt.Tooltip('count()', title='Count')]
    ).properties(
        title=f'Distribution of {col}',
        width=300,
        height=250
    )
    charts.append(chart)

# rows of 2 charts each
rows = []
for i in range(0, len(charts), 2):
    row_charts = charts[i:i+2]
    row = alt.hconcat(*row_charts)
    rows.append(row)

final_chart = alt.vconcat(*rows).configure_legend(
    orient='top'
)

final_chart

In [7]:
# got rid of 'label' since target col is heart no heart now
charts = []

for col in num_cols:
    chart = alt.Chart(train_heart).mark_boxplot(size=20).encode(
        x=alt.X(f'{col}:Q', title=col),
        y=alt.Y('target:N', title='Heart Disease'),
        color=alt.Color('target:N', title='Heart Disease')
    ).properties(
        title=f'{col} vs Heart Disease',
        width=300,
        height=250)
    charts.append(chart)

rows = []
for i in range(0, len(charts), 2):
    row_charts = charts[i:i+2]
    row = alt.hconcat(*row_charts)
    rows.append(row)

final_chart = alt.vconcat(*rows).configure_legend(
    orient='top')

final_chart

In [8]:
# updated col names
cat_cols = ['gender','chest_pain','fasting_blood_sugar','resting_electro','exercise_angia','slope','num_major_vessels']

axis_titles = {
    'gender': 'Gender (0 = Female, 1 = Male)',
    'chest_pain': 'Chest Pain Type (0=Typical, 1=Atypical, 2=Non-anginal, 3=Asymptomatic)',
    'fasting_blood_sugar': 'Fasting Blood Sugar (0 = â‰¤120 mg/dl, 1 = >120 mg/dl)',
    'resting_electro': 'Resting ECG (0=Normal, 1=ST-T Abnormality, 2=LVH)',
    'exercise_angia': 'Exercise-Induced Angina (0 = No, 1 = Yes)',
    'slope': 'Slope of ST Segment (1=Upsl, 2=Flat, 3=Downsl)',
    'num_major_vessels': 'No. of Major Vessels (0-3)'}

charts = []

for col in axis_titles.keys():
    chart = alt.Chart(train_heart).mark_bar(size=30).encode(
        x=alt.X(
            f'{col}:N',
            title=axis_titles[col],     
            scale=alt.Scale(paddingInner=0.5, paddingOuter=0.5)),
        xOffset='target:N',
        y=alt.Y('count()', title='Count'),
        color=alt.Color('target:N', title='Heart Disease'),
        tooltip=[alt.Tooltip('count()', title='Count')]).properties(
        title=f'{col} vs Heart Disease',
        width=300,
        height=250)
    charts.append(chart)


rows = []
for i in range(0, len(charts), 2):
    rows.append(alt.hconcat(*charts[i:i+2]))

final_chart = alt.vconcat(*rows).configure_legend(
    orient='top')
final_chart

In [9]:
# corr only works for numerical col, since we changes 'target' values to 'heart disease' and 'no heart disease', 
# I created a copy of heart dataframe and change the target col back to 0 and 1 for viz purposes only.
train_heart_corr = train_heart.copy()
train_heart_corr['target'] = train_heart_corr['target'].replace({
    'Heart Disease': 1,
    'No Heart Disease': 0
})

corr_matrix = train_heart_corr[num_cols + cat_cols + ['target']].corr()

corr_long = corr_matrix.reset_index().melt(id_vars='index')
corr_long.columns = ['feature_x', 'feature_y', 'correlation']

base = alt.Chart(corr_long).encode(
    x=alt.X('feature_x:N', title='Feature'),
    y=alt.Y('feature_y:N', title='Feature'))

heatmap = base.mark_rect().encode(
    color=alt.Color(
        'correlation:Q',
        scale=alt.Scale(scheme='redblue', domain=[-1, 1])),
    tooltip=['feature_x', 'feature_y'])

text = base.mark_text(
    fontSize=12,
    color='black').encode(
    text=alt.Text('correlation:Q', format='.2f'))

final_chart = (heatmap + text).properties(
    title='Correlation Heatmap of All Features with Target',
    width=600,
    height=600)

final_chart


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



| Feature | Transformation | Explanation |
| --- | ----------- | ----- |
| patient_id | drop | Unique identifier for each patient. Not predictive, so removed from modeling. |
| age | scaling with `StandardScaler` |  A numeric feature with no missing values, ranging from 20 to 80. Scaling is recommended due to its distinct range compared to other numeric features.|
| chest_pain | one-hot encoding |  categorical column with no missing values |
| exercise_angia | Passthrough (binary) |  Categorical column kept as it is and will be handled by the model directly|
| fasting_blood_sugar | Passthrough (binary) | Categorical column kept as it is and will be handled by the model directly|
| gender | Passthrough (binary) | Categorical column kept as it is and will be handled by the model directly|
| max_heart_rate | scaling with `StandardScaler`  | A numeric feature with no missing values, ranging from 20 to 80. Scaling is recommended due to its distinct range compared to other numeric features.|
| num_major_vessels | scaling with`StandardScaler` | A numeric feature with no missing values, ranging from 0 to 3. Scaling is recommended due to its distinct range compared to other numeric features.|
| old_peak | scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 0 to 3. Scaling is recommended due to its distinct range compared to other numeric features.|
| resting_bp | scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 94 to 200. Scaling is recommended due to its distinct range compared to other numeric features. |
| resting_electro | one-hot encoding| categorical column with no missing values|
| serum_cholesterol |  scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 0 to 602. Scaling is recommended due to its distinct range compared to other numeric features.| 
| slope |  ordinal encoding with `OrdinalEncoder` | categorical column with an ordinal relation among the features |

In [10]:
X_train = train_heart.drop(columns = ['target'])
y_train = train_heart['target']
X_test = test_heart.drop(columns = ['target'])
y_test = test_heart['target']

## uncomment in final ipynb
# X_train.to_csv('./data/processed/X_train_heart.csv') # updated csv file name
# X_test.to_csv('./data/processed/X_test_heart.csv') # updated csv file name

In [11]:
# updated col names
binary = ['gender','fasting_blood_sugar','exercise_angia']
ohe = ['chest_pain','resting_electro']
numerical = ['age','resting_bp','serum_cholesterol','max_heart_rate','old_peak','num_major_vessels']
ordinal = ['slope']
drop = ['patient_id']

# preprocessor = make_column_transformer(
#  (StandardScaler(), numerical),
#  (OneHotEncoder(), ohe),
#  (OrdinalEncoder(), ordinal),
#  ('passthrough', binary),
#  ('drop', drop)
# )

In [None]:
# no change only for sequence
X_train_preprocessed = preprocessor.fit_transform(X_train)
column_names = (
 numerical
 + ordinal
 + binary
 + preprocessor.named_transformers_['onehotencoder'].get_feature_names_out(ohe).tolist())
X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns = column_names)
X_train_preprocessed.head(5)

In [None]:
## uncomment in final ipynb
# X_train_preprocessed.to_csv('./data/processed/X_train_preprocessed.csv') # added X to file name

X_test_preprocessed = preprocessor.transform(X_test)
column_names = (
 numerical
 + ordinal
 + binary
 + preprocessor.named_transformers_['onehotencoder'].get_feature_names_out(ohe).tolist())
X_test_preprocessed = pd.DataFrame(X_test_preprocessed, columns = column_names)
# X_test_preprocessed.to_csv('./data/processed/X_test_preprocessed.csv') # added X to file name