In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import altair as alt

In [3]:
# Load the raw cardiovascular disease records from the CSV file.
DATA_PATH = 'Cardiovascular_Disease_Dataset.csv'
df = pd.read_csv(DATA_PATH)


In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1


In [5]:
# Shape as (rows, columns), followed by the per-column dtype / non-null report.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare: wrapping it in print() would append a stray "None" line.
df.info()

(1000, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patientid          1000 non-null   int64  
 1   age                1000 non-null   int64  
 2   gender             1000 non-null   int64  
 3   chestpain          1000 non-null   int64  
 4   restingBP          1000 non-null   int64  
 5   serumcholestrol    1000 non-null   int64  
 6   fastingbloodsugar  1000 non-null   int64  
 7   restingrelectro    1000 non-null   int64  
 8   maxheartrate       1000 non-null   int64  
 9   exerciseangia      1000 non-null   int64  
 10  oldpeak            1000 non-null   float64
 11  slope              1000 non-null   int64  
 12  noofmajorvessels   1000 non-null   int64  
 13  target             1000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 109.5 KB
None


In [15]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=123,
)

In [16]:
# Descriptive statistics for every column, computed on the training split only.
summary = train_df.describe(
    include="all",
)
summary

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,4976398.0,48.818571,0.757143,0.941429,151.724286,311.631429,0.297143,0.744286,145.348571,0.495714,2.637857,1.521429,1.224286,0.568571
std,2860855.0,17.839451,0.429116,0.949813,29.837466,131.55456,0.457327,0.772588,35.053218,0.500339,1.717905,1.019606,0.990504,0.49563
min,103368.0,20.0,0.0,0.0,94.0,0.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0
25%,2460444.0,33.0,1.0,0.0,129.0,234.0,0.0,0.0,118.0,0.0,1.2,1.0,0.0,0.0
50%,4947916.0,48.0,1.0,1.0,148.5,318.0,0.0,1.0,146.5,0.0,2.4,2.0,1.0,1.0
75%,7432281.0,64.0,1.0,2.0,180.25,404.0,1.0,1.0,176.0,1.0,3.9,2.0,2.0,1.0
max,9990855.0,80.0,1.0,3.0,200.0,602.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,1.0


In [28]:
# Class balance of the target in the training split, one bar per class.
(
    alt.Chart(train_df)
    .mark_bar()
    .encode(
        x=alt.X('target:N', title='Heart Disease'),
        y=alt.Y('count()', title='Count'),
    )
    .properties(title='Distribution of Heart Disease', width=100, height=300)
)

In [21]:
# Distribution of the numerical features (training split only).
num_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']

# One shared bin definition so the x axis and the tooltip describe the same bins.
hist_bins = alt.Bin(maxbins=30)

for col in num_cols:
    chart = alt.Chart(train_df).mark_bar().encode(
        x=alt.X(f'{col}:Q', bin=hist_bins),
        y='count()',
        # The tooltip field must be binned like the x channel. A raw (un-binned,
        # un-aggregated) field here becomes an extra group-by key, which splits
        # every bar into one segment per distinct value and makes the tooltip
        # report a per-value count instead of the bin count.
        tooltip=[alt.Tooltip(f'{col}:Q', bin=hist_bins, title=col), 'count()']
    ).properties(title=f'Distribution of {col}')
    chart.display()

In [39]:
# Box plot of each continuous feature, one box per target class (training split).
for feature in num_cols:
    boxplot = (
        alt.Chart(train_df)
        .mark_boxplot(size=50)
        .encode(
            x=f'{feature}:Q',
            y='target:N',
            color='target:N',
        )
        .properties(title=f'{feature} vs Heart Disease', width=300, height=200)
    )
    boxplot.display()


In [43]:
# Categorical features vs target (training split only).
cat_cols = ['gender', 'chestpain', 'fastingbloodsugar',
            'restingrelectro', 'exerciseangia', 'slope', 'noofmajorvessels']

for col in cat_cols:
    # Plot from train_df, like every other EDA cell, so the held-out test rows
    # do not inform exploration decisions.
    chart = alt.Chart(train_df).mark_bar(size=40).encode(
        x=alt.X(f'{col}:N', title=col),
        y=alt.Y('count()', title='Count'),
        color='target:N',
        # Declare the feature as nominal in the tooltip too, matching the x channel.
        tooltip=[f'{col}:N', 'target:N', 'count()']
    ).properties(
        title=f'{col} vs Heart Disease',
        width=200,
        height=300
    )

    chart.display()


In [46]:
# Pairwise correlations between all selected features and the target (training split).
corr_matrix = train_df[num_cols + cat_cols + ['target']].corr()

# Reshape the square matrix into one row per (feature_x, feature_y) pair for Altair.
corr_long = (
    corr_matrix
    .rename_axis(index='feature_x')
    .reset_index()
    .melt(id_vars='feature_x', var_name='feature_y', value_name='correlation')
)

# Diverging colour scale pinned to [-1, 1] so that zero correlation sits at the midpoint.
heatmap = (
    alt.Chart(corr_long)
    .mark_rect()
    .encode(
        x=alt.X('feature_x:N', title='Feature'),
        y=alt.Y('feature_y:N', title='Feature'),
        color=alt.Color(
            'correlation:Q',
            scale=alt.Scale(scheme='redblue', domain=[-1, 1]),
        ),
        tooltip=['feature_x', 'feature_y', 'correlation'],
    )
    .properties(
        title='Correlation Heatmap of All Features with Target',
        width=600,
        height=600,
    )
)

heatmap


| Feature | Transformation | Explanation |
| --- | ----------- | ----- |
| age | scaling with `StandardScaler` |  A numeric feature with no missing values, ranging from 20 to 80 in the training data. Scaling is recommended due to its distinct range compared to other numeric features.|
| sex | one-hot encoding | |
| cp | one-hot encoding |  categorical column with no missing values|
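| trestbps | scaling with `StandardScaler` | A numeric feature (resting blood pressure) with no missing values, ranging from 94 to 200 in the training data. Scaling is recommended due to its distinct range compared to other numeric features.|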
| chol | scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 0 to 602 in the training data. Scaling is recommended due to its distinct range compared to other numeric features. |
| fbs | one-hot encoding  | |
| restecg | one-hot encoding  | categorical column with no missing values |
| thalach |  scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 71 to 202. Scaling is recommended due to its distinct range compared to other numeric features.|
| exang | drop  |  |
| oldpeak | scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 0.0 to 6.2. Scaling is recommended due to its distinct range compared to other numeric features.|
| slope |  one-hot encoding | categorical column with no missing values | 
| ca |  one-hot encoding | categorical column with no missing values |
| thal |  one-hot encoding | categorical column with no missing values|