In [1]:
import warnings
import zipfile
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
import pandera as pa
import requests

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from deepchecks.tabular.checks import FeatureLabelCorrelation, FeatureFeatureCorrelation
from deepchecks.tabular import Dataset

# Ignore FutureWarning messages whose originating module name matches "deepchecks".
# NOTE(review): this filter is registered after the import statements in this cell
# have already executed, so it cannot silence warnings emitted while importing.
warnings.filterwarnings("ignore", category=FutureWarning, module="deepchecks")


pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.



In [2]:
# Download the raw dataset archive and unpack it into the raw-data directory.
url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/dzz48mvjht-1.zip"
raw_dir = Path("../data/raw")
zip_path = raw_dir / "Cardiovascular_Disease_Dataset_original.zip"

# Create the destination up front so the cell also works on a fresh checkout
# where the directory has not been created yet.
raw_dir.mkdir(parents=True, exist_ok=True)

# The timeout keeps the cell from hanging forever on a stalled connection.
request = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of saving an error page that would only
# surface later as a corrupt zip file.
request.raise_for_status()

with open(zip_path, 'wb') as f:
    f.write(request.content)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(raw_dir)

In [3]:
# Column labels assigned to the raw CSV, listed in file order.
colnames = [
    'patient_id', 'age', 'gender', 'chest_pain', 'resting_bp',
    'serum_cholesterol', 'fasting_blood_sugar', 'resting_electro',
    'max_heart_rate', 'exercise_angia', 'old_peak', 'slope',
    'num_major_vessels', 'target',
]

# header=0 treats the first row of the file as its header row; passing `names`
# as well replaces those original labels with the ones defined above.
heart = pd.read_csv(
    "../data/raw/Cardiovascular_Disease_Dataset/Cardiovascular_Disease_Dataset.csv",
    header=0,
    names=colnames,
)

# Preview the first rows of the loaded frame.
heart.head()

Unnamed: 0,patient_id,age,gender,chest_pain,resting_bp,serum_cholesterol,fasting_blood_sugar,resting_electro,max_heart_rate,exercise_angia,old_peak,slope,num_major_vessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1


In [4]:
# Recode the numeric target (1 / 0) into readable class labels.
target_labels = {
    1: 'Heart Disease',
    0: 'No Heart Disease',
}
heart['target'] = heart['target'].replace(target_labels)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_heart, test_heart = train_test_split(
    heart,
    test_size=0.2,
    random_state=123,
)

## uncomment in final ipynb
# train_heart.to_csv("./data/processed/train_heart.csv") # changed heart_train to train_heart.csv for consistency
# test_heart.to_csv("./data/processed/test_heart.csv") # changed heart_test to test_heart.csv for consistency

### Data Validation

##### Correct Data File Format
The dataset was successfully read into a DataFrame and validated against a predefined pandera schema to ensure the structure and format matched what the analysis required. No format-related issues were detected.

##### Correct Column Names
All column names were checked against the schema, ensuring consistency with the expected variables such as patient_id, age, serum_cholesterol, and target. This prevents downstream errors caused by misspellings or naming mismatches.

##### No Empty Observations
A schema-level check was included to detect and prevent any rows that contain only missing values. The validation confirmed that no fully empty observations were present.

##### Missingness Within Expected Thresholds
Columns that may contain missing data were declared with nullable=True. Some missing values are encoded as zeros (e.g., slope = 0, serum_cholesterol = 0). These are flagged with warnings during validation and will be handled through imputation during preprocessing to avoid losing data.

##### Correct Data Types
Each variable was validated against the expected data type (integer, float, or string). This ensures numerical columns are not read in as text and that categorical columns are properly formatted.

##### No Duplicate Observations
A validation rule was added to detect duplicate rows. The check confirmed that the dataset does not contain repeated observations.

##### No Outlier or Anomalous Values
Range checks were applied to identify values outside typical clinical ranges. For certain columns, such as slope and serum_cholesterol, raise_warning=True was used to flag unusual values without stopping execution. Extreme values are handled through imputation or scaling rather than being removed.

##### Correct Category Levels
The target has two labels: Heart Disease and No Heart Disease. Validation confirmed these levels were used consistently after correcting earlier naming differences.

##### Target Variable Distribution
After validation, the distribution of the target classes was inspected to ensure there were no unexpected patterns or imbalances that might affect modeling.

##### No Anomalous Correlations Between Target and Features
Exploratory checks were performed to confirm that no impossible relationships existed between the target and other features. No issues were identified.

##### No Anomalous Correlations Between Features
Relationships between features were examined to ensure there are no anomalous correlations. No anomalies were found that required correction.

In [5]:
# validate data
# validate data
# Pandera schema for the heart-disease frame: one Column per variable giving its
# dtype and allowed values, plus frame-wide checks for duplicate and empty rows.
schema = pa.DataFrameSchema(
    {
        'patient_id': pa.Column(int, pa.Check.greater_than(0)),
        'age': pa.Column(int, pa.Check.between(0, 90), nullable=True),
        'gender': pa.Column(int, pa.Check.between(0, 1), nullable=True),
        'chest_pain': pa.Column(int, pa.Check.between(0, 3), nullable=True),
        'resting_bp': pa.Column(int, pa.Check.between(94, 200), nullable=True),
        'serum_cholesterol': pa.Column(
            int,
            checks=[
                # Attributed to pandera documentation:
                # https://pandera.readthedocs.io/en/stable/checks.html#raise-warning-instead-of-error-on-check-failure
                # raise_warning=True reports out-of-range values as a warning
                # instead of failing validation. Bounds are inclusive.
                pa.Check.between(
                    126,
                    564,
                    raise_warning=True,
                    error="There are outliers in the data values",
                ),
            ],
            nullable=True),
        'fasting_blood_sugar': pa.Column(int, pa.Check.between(0, 1), nullable=True),
        'resting_electro': pa.Column(int, pa.Check.between(0, 2), nullable=True),
        'max_heart_rate': pa.Column(int, pa.Check.between(71, 202), nullable=True),
        'exercise_angia': pa.Column(int, pa.Check.between(0, 1), nullable=True),
        'old_peak': pa.Column(float, pa.Check.between(0.0, 6.2), nullable=True),
        'slope': pa.Column(
            int,
            checks=[
                # Warn (rather than fail) on slope values outside 1..3 inclusive.
                pa.Check.between(
                    1,
                    3,
                    raise_warning=True,
                    error="Certain slope values are out of range",
                ),
            ],
            nullable=True),
        'num_major_vessels': pa.Column(int, pa.Check.between(0, 3), nullable=True),
        'target': pa.Column(str, pa.Check.isin(['Heart Disease', 'No Heart Disease'])),
    },
    checks=[
        # `not` is used instead of `~`: on a plain Python bool, `~True` is -2 and
        # `~False` is -1 (both truthy), so `~` only works if .any() happens to
        # return a NumPy bool. `not` gives a real bool in either case.
        pa.Check(lambda df: not df.duplicated().any(), error="Duplicate rows found."),
        pa.Check(lambda df: not df.isna().all(axis=1).any(), error="Empty rows found.")
    ]
)


Importing pandas-specific classes and functions from the
top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```




In [6]:
# Validate the full frame against the schema. With lazy=True pandera evaluates
# every check and reports the failures together instead of stopping at the first.
schema.validate(check_obj=heart, lazy=True)


Column 'serum_cholesterol' failed element-wise validator number 0: <Check <lambda>: There are outliers in the data values> failure cases: 0, 0, 0, 0, 0, 86, 0, 0, 87, 0, 0, 86, 0, 0, 0, 0, 602, 87, 86, 0, 0, 0, 0, 0, 0, 0, 602, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 601, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85, 0, 0, 601, 86, 601, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0


Column 'slope' failed element-wise validator number 0: <Check <lambda>: Certain slope values are out of range> failure cases: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Unnamed: 0,patient_id,age,gender,chest_pain,resting_bp,serum_cholesterol,fasting_blood_sugar,resting_electro,max_heart_rate,exercise_angia,old_peak,slope,num_major_vessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,Heart Disease
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,No Heart Disease
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,No Heart Disease
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,Heart Disease
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,Heart Disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,9949544,48,1,2,139,349,0,2,183,1,5.6,2,2,Heart Disease
996,9953423,47,1,3,143,258,1,1,98,1,5.7,1,0,No Heart Disease
997,9965859,69,1,0,156,434,1,0,196,0,1.4,3,1,Heart Disease
998,9988507,45,1,1,186,417,0,1,117,1,5.9,3,2,Heart Disease
