In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import altair as alt

In [2]:
# Load the cardiovascular disease dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv('Cardiovascular_Disease_Dataset.csv')


In [3]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1


In [4]:
# Report the dataset dimensions, then per-column dtypes and non-null counts.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits an extra "None" line.
df.info()

(1000, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patientid          1000 non-null   int64  
 1   age                1000 non-null   int64  
 2   gender             1000 non-null   int64  
 3   chestpain          1000 non-null   int64  
 4   restingBP          1000 non-null   int64  
 5   serumcholestrol    1000 non-null   int64  
 6   fastingbloodsugar  1000 non-null   int64  
 7   restingrelectro    1000 non-null   int64  
 8   maxheartrate       1000 non-null   int64  
 9   exerciseangia      1000 non-null   int64  
 10  oldpeak            1000 non-null   float64
 11  slope              1000 non-null   int64  
 12  noofmajorvessels   1000 non-null   int64  
 13  target             1000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 109.5 KB
None


In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=123,
)

In [6]:
# Summary statistics for every column of the training split
summary = train_df.describe(include="all")
summary

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,4976398.0,48.818571,0.757143,0.941429,151.724286,311.631429,0.297143,0.744286,145.348571,0.495714,2.637857,1.521429,1.224286,0.568571
std,2860855.0,17.839451,0.429116,0.949813,29.837466,131.55456,0.457327,0.772588,35.053218,0.500339,1.717905,1.019606,0.990504,0.49563
min,103368.0,20.0,0.0,0.0,94.0,0.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0
25%,2460444.0,33.0,1.0,0.0,129.0,234.0,0.0,0.0,118.0,0.0,1.2,1.0,0.0,0.0
50%,4947916.0,48.0,1.0,1.0,148.5,318.0,0.0,1.0,146.5,0.0,2.4,2.0,1.0,1.0
75%,7432281.0,64.0,1.0,2.0,180.25,404.0,1.0,1.0,176.0,1.0,3.9,2.0,2.0,1.0
max,9990855.0,80.0,1.0,3.0,200.0,602.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,1.0


In [7]:
# Class balance of the training split: count, share of the total and a
# readable name for each target value.
counts = train_df.groupby('target').size().reset_index(name='count')
counts['percentage'] = (counts['count'] / counts['count'].sum() * 100).round(1)
counts['label'] = counts['target'].map({0: 'No Heart Disease', 1: 'Heart Disease'})
# Pre-formatted slice text so the number drawn on the chart carries its unit
counts['percentage_label'] = counts['percentage'].map('{:.1f}%'.format)

# Base pie chart
pie = alt.Chart(counts).mark_arc(innerRadius=0, stroke='black', strokeWidth=1).encode(
    theta=alt.Theta(field='count', type='quantitative'),
    color=alt.Color(field='label', type='nominal', title='Heart Disease'),
    tooltip=[
        alt.Tooltip('label:N', title='Heart Disease'),
        alt.Tooltip('count:Q', title='Count'),
        alt.Tooltip('percentage:Q', title='Percentage')
    ]
).properties(
    title='Heart Disease %',
    width=300,
    height=300
)

# Place percentage labels at slice centers.
# The detail channel groups the text layer by the same field that colours the
# slices, so both layers stack on the same key instead of the labels depending
# on the row order of `counts`.
pie_labels = alt.Chart(counts).mark_text(radius=100, size=14, color='black').encode(
    theta=alt.Theta(field='count', type='quantitative', stack='zero'),
    detail=alt.Detail('label:N'),
    text=alt.Text('percentage_label:N')
)

pie_final = pie + pie_labels

# Bar chart
bar = alt.Chart(counts).mark_bar(stroke='black', strokeWidth=1).encode(
    x=alt.X('label:N', title='Heart Disease'),
    y=alt.Y('count:Q', title='Count'),
    color=alt.Color('label:N', title='Heart Disease'),
    tooltip=[
        alt.Tooltip('count:Q', title='Count')
    ]
).properties(
    title='Cases of Heart Disease',
    width=300,
    height=300
)

# Add text labels on bars (dy shifts the text just above each bar top)
bar_labels = bar.mark_text(
    dy=-5,
    size=14
).encode(
    text='count:Q'
)

bar_final = bar + bar_labels

# Combine pie and bar side by side
final_chart = alt.hconcat(pie_final, bar_final)

final_chart


In [8]:
num_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak', 'noofmajorvessels']

charts = []

# One histogram per numeric feature
for col in num_cols:
    # Share one bin definition between the axis and the tooltip. An unbinned,
    # unaggregated tooltip field would become an extra group-by key for
    # count(), so counts would be computed per distinct value rather than per bin.
    bins = alt.Bin(maxbins=30)
    chart = alt.Chart(train_df).mark_bar().encode(
        x=alt.X(f'{col}:Q', bin=bins),
        y=alt.Y('count()', title='Count'),
        tooltip=[
            alt.Tooltip(f'{col}:Q', bin=bins, title=col),
            alt.Tooltip('count()', title='Count')
        ]
    ).properties(
        title=f'Distribution of {col}',
        width=300,
        height=250
    )
    charts.append(chart)

# rows of 2 charts each (the last row holds whatever remains)
rows = [alt.hconcat(*charts[i:i+2]) for i in range(0, len(charts), 2)]

# No legend configuration here: these charts have no colour encoding, hence no legend
final_chart = alt.vconcat(*rows)

final_chart


In [9]:
# Map target to descriptive labels.
# assign() builds a new frame rather than writing a column into the object
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run.
train_df = train_df.assign(
    label=train_df['target'].map({0: 'No Heart Disease', 1: 'Heart Disease'})
)

charts = []

# Create one horizontal boxplot per feature in num_cols, split by class label
for col in num_cols:
    chart = alt.Chart(train_df).mark_boxplot(size=20).encode(
        x=alt.X(f'{col}:Q', title=col),
        y=alt.Y('label:N', title='Heart Disease'),  # use descriptive label
        color=alt.Color('label:N', title='Heart Disease')
    ).properties(
        title=f'{col} vs Heart Disease',
        width=300,
        height=250
    )
    charts.append(chart)

# Arrange 2 charts per row
rows = []
for i in range(0, len(charts), 2):
    row_charts = charts[i:i+2]
    row = alt.hconcat(*row_charts)
    rows.append(row)

# Stack all rows vertically
final_chart = alt.vconcat(*rows).configure_legend(
    orient='top'
)

final_chart


In [10]:
# Categorical features to compare against the target
cat_cols = ['gender','chestpain','fastingbloodsugar','restingrelectro','exerciseangia','slope']

# Descriptive class labels.
# assign() builds a new frame rather than writing a column into the object
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run on its own.
train_df = train_df.assign(
    label=train_df['target'].map({0: 'No Heart Disease', 1: 'Heart Disease'})
)

charts = []

# Create grouped bar charts for each categorical feature
for col in cat_cols:
    chart = alt.Chart(train_df).mark_bar(size=30).encode(
        x=alt.X(
            f'{col}:N',
            title=col,
            scale=alt.Scale(paddingInner=0.5, paddingOuter=0.5)
        ),
        xOffset='label:N',           # use descriptive label for grouping
        y=alt.Y('count()', title='Count'),
        color=alt.Color('label:N', title='Heart Disease'),
        # The category and class are already grouping fields (x / color), so
        # showing them in the tooltip does not change how count() is aggregated.
        tooltip=[
            alt.Tooltip(f'{col}:N', title=col),
            alt.Tooltip('label:N', title='Heart Disease'),
            alt.Tooltip('count()', title='Count')
        ]
    ).properties(
        title=f'{col} vs Heart Disease',
        width=300,
        height=250
    )

    charts.append(chart)

# Arrange 2 charts per row
rows = []
for i in range(0, len(charts), 2):
    row_charts = charts[i:i+2]
    row = alt.hconcat(*row_charts)
    rows.append(row)

# Stack all rows vertically
final_chart = alt.vconcat(*rows).configure_legend(
    orient='top'
)

final_chart


In [11]:
# Pairwise correlations between all listed features and the target
corr_matrix = train_df[num_cols + cat_cols + ['target']].corr()

# Reshape to long form: one row per (feature_x, feature_y) pair with its correlation
corr_long = (
    corr_matrix
    .rename_axis('feature_x')
    .reset_index()
    .melt(id_vars='feature_x', var_name='feature_y', value_name='correlation')
)

# Shared x/y encoding for both layers
base = alt.Chart(corr_long).encode(
    x=alt.X(field='feature_x', type='nominal', title='Feature'),
    y=alt.Y(field='feature_y', type='nominal', title='Feature'),
)

# Coloured cells; the fixed [-1, 1] domain keeps the diverging scheme centred on 0
heatmap = base.mark_rect().encode(
    color=alt.Color(
        field='correlation',
        type='quantitative',
        scale=alt.Scale(scheme='redblue', domain=[-1, 1]),
    ),
    tooltip=['feature_x', 'feature_y', 'correlation'],
)

# Value printed in each cell, rounded to 2 decimal places
text = base.mark_text(fontSize=12, color='black').encode(
    text=alt.Text(field='correlation', type='quantitative', format='.2f'),
)

# Overlay the text on the heatmap
final_chart = alt.layer(heatmap, text).properties(
    title='Correlation Heatmap of All Features with Target',
    width=600,
    height=600,
)

final_chart


| Feature | Transformation | Explanation |
| --- | ----------- | ----- |
| patientid | drop | Unique identifier for each patient; not predictive, so removed from modeling. |
| age | scaling with `StandardScaler` |  A numeric feature with no missing values, ranging from 20 to 80. Scaling is recommended due to its distinct range compared to other numeric features.|
| chestpain | one-hot encoding |  categorical column with no missing values |
| exerciseangia | one-hot encoding |  categorical column with no missing values|
| fastingbloodsugar | one-hot encoding | categorical column with no missing values|
| gender | one-hot encoding | categorical column with no missing values|
| maxheartrate | scaling with `StandardScaler`  | A numeric feature with no missing values, ranging from 71 to 202. Scaling is recommended due to its distinct range compared to other numeric features.|
| noofmajorvessels | scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 0 to 3. Scaling is recommended due to its distinct range compared to other numeric features.|
| oldpeak | scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 0 to 6.2. Scaling is recommended due to its distinct range compared to other numeric features.|
| restingBP | scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 94 to 200. Scaling is recommended due to its distinct range compared to other numeric features. |
| restingrelectro | one-hot encoding| categorical column with no missing values|
| serumcholestrol |  scaling with `StandardScaler` | A numeric feature with no missing values, ranging from 0 to 602. Scaling is recommended due to its distinct range compared to other numeric features.| 
| slope |  one-hot encoding | categorical column with no missing values |