In [69]:

# Make the dark Plotly theme the default for every figure
import plotly.io as pio
pio.templates.default = "plotly_dark"

# Default text style of that theme: 14 pt, monospace
default_template = pio.templates[pio.templates.default]
default_template.layout.font.size = 14
default_template.layout.font.family = "monospace"

In [70]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# Read the heart-disease CSV into a DataFrame and display it
DATA_PATH = 'data/HeartDiseaseTrain-Test.csv'
heart_disease_df = pd.read_csv(DATA_PATH)
heart_disease_df

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,Male,Typical angina,125,212,Lower than 120 mg/ml,ST-T wave abnormality,168,No,1.0,Downsloping,Two,Reversable Defect,0
1,53,Male,Typical angina,140,203,Greater than 120 mg/ml,Normal,155,Yes,3.1,Upsloping,Zero,Reversable Defect,0
2,70,Male,Typical angina,145,174,Lower than 120 mg/ml,ST-T wave abnormality,125,Yes,2.6,Upsloping,Zero,Reversable Defect,0
3,61,Male,Typical angina,148,203,Lower than 120 mg/ml,ST-T wave abnormality,161,No,0.0,Downsloping,One,Reversable Defect,0
4,62,Female,Typical angina,138,294,Greater than 120 mg/ml,ST-T wave abnormality,106,No,1.9,Flat,Three,Fixed Defect,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,Male,Atypical angina,140,221,Lower than 120 mg/ml,ST-T wave abnormality,164,Yes,0.0,Downsloping,Zero,Fixed Defect,1
1021,60,Male,Typical angina,125,258,Lower than 120 mg/ml,Normal,141,Yes,2.8,Flat,One,Reversable Defect,0
1022,47,Male,Typical angina,110,275,Lower than 120 mg/ml,Normal,118,Yes,1.0,Flat,One,Fixed Defect,0
1023,50,Female,Typical angina,110,254,Lower than 120 mg/ml,Normal,159,No,0.0,Downsloping,Zero,Fixed Defect,1


In [71]:
# Raw dataset column name -> human-readable label (used for plot titles and axis labels)
column_mapping = {
    'age': 'Age',
    'sex': 'Gender',
    'chest_pain_type': 'Chest Pain Type',
    'resting_blood_pressure': 'Resting Blood Pressure',
    'cholestoral': 'Cholesterol',
    'fasting_blood_sugar': 'Fasting Blood Sugar',
    'rest_ecg': 'Rest ECG',
    'Max_heart_rate': 'Max Heart Rate',
    'exercise_induced_angina': 'Exercise Induced Angina',
    'oldpeak': 'Oldpeak',
    'slope': 'Slope',
    'vessels_colored_by_flourosopy': 'Vessels Colored by Fluoroscopy',
    'thalassemia': 'Thalassemia',
    'target': 'Heart Disease'
}

# Build {raw name: readable label} for the columns present in the DataFrame
# (nothing is printed; a column missing from column_mapping raises KeyError).
# NOTE(review): the name `map` shadows Python's builtin `map` for the rest of the
# session and is not referenced elsewhere in the visible notebook - consider renaming.
map = {col: column_mapping[col] for col in heart_disease_df.columns}


## Checking for Missing Values

In [72]:
# Count missing entries per column
missing_values = heart_disease_df.isna().sum()

missing_values


age                              0
sex                              0
chest_pain_type                  0
resting_blood_pressure           0
cholestoral                      0
fasting_blood_sugar              0
rest_ecg                         0
Max_heart_rate                   0
exercise_induced_angina          0
oldpeak                          0
slope                            0
vessels_colored_by_flourosopy    0
thalassemia                      0
target                           0
dtype: int64

## Checking for duplicates

In [73]:
# Count rows that repeat an earlier row exactly
duplicate_mask = heart_disease_df.duplicated()
duplicates = duplicate_mask.sum()

duplicates

723

### remove duplicates

In [74]:
# Keep only the first occurrence of each repeated row
heart_disease_clean_df = heart_disease_df.drop_duplicates(keep='first')

# (rows, columns) of the de-duplicated frame
cleaned_dataset_shape = heart_disease_clean_df.shape

cleaned_dataset_shape


(302, 14)

In [75]:
# From here on `heart_disease_df` refers to the de-duplicated frame.
# NOTE(review): this rebinds the name that held the raw CSV data, so re-running the
# duplicate-count cell after this one operates on the cleaned data instead.
heart_disease_df = heart_disease_clean_df

## Exploratory Data Analysis (EDA)
* Data Types
* Shape of the Dataset
* Descriptive Statistics
* Histograms of all Variables

In [76]:
# Per-column dtypes: shows which features are numeric and which are stored as object (text)
data_types = heart_disease_df.dtypes

data_types


age                                int64
sex                               object
chest_pain_type                   object
resting_blood_pressure             int64
cholestoral                        int64
fasting_blood_sugar               object
rest_ecg                          object
Max_heart_rate                     int64
exercise_induced_angina           object
oldpeak                          float64
slope                             object
vessels_colored_by_flourosopy     object
thalassemia                       object
target                             int64
dtype: object

In [77]:
# (rows, columns) of the dataset after duplicate removal
dataset_shape = heart_disease_df.shape

dataset_shape


(302, 14)

### Descriptive Statistics


In [78]:
# Summary statistics (count, mean, std, min, quartiles, max); called without
# arguments, describe() reports only the numeric columns of this frame
numeric_stats = heart_disease_df.describe()

numeric_stats


Unnamed: 0,age,resting_blood_pressure,cholestoral,Max_heart_rate,oldpeak,target
count,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,131.602649,246.5,149.569536,1.043046,0.543046
std,9.04797,17.563394,51.753489,22.903527,1.161452,0.49897
min,29.0,94.0,126.0,71.0,0.0,0.0
25%,48.0,120.0,211.0,133.25,0.0,0.0
50%,55.5,130.0,240.5,152.5,0.8,1.0
75%,61.0,140.0,274.75,166.0,1.6,1.0
max,77.0,200.0,564.0,202.0,6.2,1.0


### Histograms

#### Numerical variables

#### Categorical variables

* sex
* chest_pain_type
* fasting_blood_sugar
* rest_ecg
* exercise_induced_angina
* slope
* vessels_colored_by_flourosopy
* thalassemia

In [80]:
# !pip install palettable

In [81]:
import palettable

In [97]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Plot title for each categorical variable, keyed by raw column name
titles = {
    'sex': 'Distribution of Gender (sex)',
    'chest_pain_type': 'Distribution of Chest Pain Type',
    'fasting_blood_sugar': 'Distribution of Fasting Blood Sugar',
    'rest_ecg': 'Distribution of Resting ECG Results',
    'exercise_induced_angina': 'Distribution of Exercise-Induced Angina',
    'slope': 'Distribution of Slope of Peak Exercise ST Segment',
    'vessels_colored_by_flourosopy': 'Distribution of Vessels Colored by Fluoroscopy',
    'thalassemia': 'Distribution of Thalassemia',
}

# The categorical variables are exactly the titled ones, in the same order
categorical_columns = list(titles)

# Colours shared by the plots below (hex codes)
color_palettes = [
    "#ffd8ec",  # Light pinkish-beige
    "#ffa8cb",  # Bright pink
    "#e5000c",  # Deep red
    "#784283",  # Dark purple
    "#ddd690",  # Light brown
    "#ffefd5",  # Ivory
    "#ff7f50",  # Coral
    "#8b4513",  # Saddle brown
]


In [103]:


# Histogram grid with subplot titles taken from column_mapping
def plot_histograms_plotly(df, columns, color_palette):
    """Draw one histogram per column on a two-column subplot grid and show it.

    Args:
        df: DataFrame holding the data to plot.
        columns: Column names to plot, one subplot each, filled row by row.
        color_palette: Sequence of colours; reused cyclically when there are
            more columns than colours.

    Subplot titles come from ``column_mapping``; a column without an entry
    there falls back to its raw name.
    """
    grid_cols = 2
    # Round up so an odd number of columns still gets a row for the last plot
    # (floor division left that plot without a grid cell), and keep at least
    # one row because make_subplots rejects rows=0.
    grid_rows = max(1, (len(columns) + grid_cols - 1) // grid_cols)

    fig = make_subplots(
        rows=grid_rows,
        cols=grid_cols,
        subplot_titles=[column_mapping.get(col, col) for col in columns]
    )

    for idx, col in enumerate(columns):
        fig.add_trace(
            go.Histogram(
                x=df[col],
                marker_color=color_palette[idx % len(color_palette)],
                nbinsx=20
            ),
            row=idx // grid_cols + 1,
            col=idx % grid_cols + 1
        )

    fig.update_layout(
        title='Histograms of All Variables',
        template='plotly_dark',
        font=dict(
            family='Courier New, monospace',
            size=14,
            color='white'
        ),
        height=800,
        showlegend=False
    )

    # Show only the x and y values in the hover tooltip
    fig.update_traces(hoverinfo='x+y')
    fig.show()

# Numeric columns to plot; 'target' (the 0/1 label) is included alongside the features
numeric_columns = ['age', 'resting_blood_pressure', 'cholestoral', 'Max_heart_rate', 'oldpeak', 'target']

# Draw the histogram grid for the de-duplicated data
plot_histograms_plotly(heart_disease_clean_df, numeric_columns, color_palettes)



In [105]:



# Bar chart of category frequencies, drawn with plotly
def plot_categorical_variable_plotly(heart_disease_df, column, title, color):
    """Show a bar chart of how often each value of `column` occurs.

    Args:
        heart_disease_df: DataFrame containing `column`.
        column: Name of the categorical column to count; also used as x-axis title.
        title: Figure title.
        color: Colour applied to all bars.
    """
    counts = heart_disease_df[column].value_counts()

    bars = go.Bar(
        x=counts.index,
        y=counts.values,
        text=counts.values,  # print the count on each bar
        textposition='auto',
        marker_color=color,
        hovertemplate='x: %{x}<br>y: %{y}<extra></extra>',
    )
    text_font = dict(family='Courier New, monospace', size=14, color='white')

    fig = go.Figure(data=[bars])
    fig.update_layout(
        title=title,
        xaxis_title=column,
        yaxis_title='Count',
        template='plotly_dark',
        font=text_font,
    )
    fig.show()

# create color palette using palettable
# color_palettes = palettable.tableau.TableauMedium_10.hex_colors

# Plot each categorical variable using Plotly.
# The colour index wraps around, so the loop keeps working when there are
# more categorical columns than colours in the palette.
for idx, column in enumerate(categorical_columns):
    bar_color = color_palettes[idx % len(color_palettes)]
    plot_categorical_variable_plotly(heart_disease_clean_df, column, titles[column], bar_color)

# Histograms using Plotly
# NOTE(review): a function with this name is also defined in an earlier cell; once
# this cell runs, this definition replaces it. Keep a single definition.
def plot_histograms_plotly(df, columns, color_palette):
    """Draw one histogram per column on a two-column subplot grid and show it.

    Args:
        df: DataFrame holding the data to plot.
        columns: Column names to plot, one subplot each, filled row by row.
        color_palette: Sequence of colours; reused cyclically when there are
            more columns than colours.

    Subplot titles come from ``column_mapping``; a column without an entry
    there falls back to its raw name.
    """
    grid_cols = 2
    # Round up so an odd number of columns still gets a row for the last plot
    # (floor division left that plot without a grid cell), and keep at least
    # one row because make_subplots rejects rows=0.
    grid_rows = max(1, (len(columns) + grid_cols - 1) // grid_cols)

    fig = make_subplots(
        rows=grid_rows,
        cols=grid_cols,
        subplot_titles=[column_mapping.get(col, col) for col in columns]
    )

    for idx, col in enumerate(columns):
        fig.add_trace(
            go.Histogram(
                x=df[col],
                marker_color=color_palette[idx % len(color_palette)],
                nbinsx=20
            ),
            row=idx // grid_cols + 1,
            col=idx % grid_cols + 1
        )

    fig.update_layout(
        title='Histograms of All Variables',
        template='plotly_dark',
        font=dict(
            family='Courier New, monospace',
            size=14,
            color='white'
        ),
        height=800,
        showlegend=False
    )

    # Show only the x and y values in the hover tooltip
    fig.update_traces(hoverinfo='x+y')
    fig.show()




##### Analysis of Categorical Variable Distribution

- **sex**:
    
    - Majority of patients are male.
- **chest\_pain\_type**:
    
    - Most patients have `Asymptomatic` or `Non-anginal` chest pain.
- **fasting\_blood\_sugar**:
    
    - Majority have fasting blood sugar less than 120 mg/dl.
- **rest\_ecg**:
    
    - Majority of patients have `Normal` or `ST-T wave abnormality` results.
- **exercise\_induced\_angina**:
    
    - Most patients do not have exercise-induced angina.
- **slope**:
    
    - Most patients have `Upsloping` or `Flat` slope.
- **vessels\_colored\_by\_flourosopy**:
    
    - Majority have no major vessels colored by fluoroscopy.
- **thalassemia**:
    
    - Most patients have `Reversible defect` or `Fixed defect`.

### Violin Plots

In [83]:
def plot_violin_plotly(df, x_col, y_col, title, color_palette):
    """Build a violin plot of `y_col` split by the categories of `x_col`.

    Each violin has an inner box plot and shows all individual points.

    Args:
        df: DataFrame containing both columns.
        x_col: Categorical column; one violin per value, also used for colouring.
        y_col: Numeric column whose distribution is drawn.
        title: Figure title.
        color_palette: Sequence of colours handed to plotly express.

    Returns:
        The plotly figure (it is not shown here).
    """
    violin_fig = px.violin(
        df,
        x=x_col,
        y=y_col,
        color=x_col,
        box=True,
        points="all",
        color_discrete_sequence=color_palette,
        template='plotly_dark',
    )

    text_font = dict(family='Courier New, monospace', size=14, color='white')
    violin_fig.update_layout(
        title=title,
        xaxis_title=x_col,
        yaxis_title=y_col,
        font=text_font,
        showlegend=False,  # the x axis already names each category
    )
    return violin_fig

In [84]:
# One entry per violin plot: (categorical x column, numeric y column, plot title).
# The title is also used as the file name when the plot is saved.
violin_vars = [
    ('sex', 'age', 'Age Distribution by Sex'),
    ('chest_pain_type', 'resting_blood_pressure', 'Resting Blood Pressure by Chest Pain Type'),
    ('fasting_blood_sugar', 'cholestoral', 'Cholesterol Levels by Fasting Blood Sugar Levels'),
    ('rest_ecg', 'Max_heart_rate', 'Maximum Heart Rate by Rest ECG Results'),
    ('exercise_induced_angina', 'oldpeak', 'ST Depression (Oldpeak) by Exercise-Induced Angina'),
    ('slope', 'age', 'Age Distribution by Slope of Peak Exercise ST Segment')
]

In [85]:
# Show the current working directory; the relative 'data/...' and 'eda/...' paths
# used in this notebook are resolved against it.
# NOTE(review): debugging aid - the saved output exposes a local absolute path;
# consider clearing it before sharing the notebook.
import os
os.getcwd()

'/home/gr00stl/Nextcloud/UNI/wdwa/heart_disease'

In [86]:
import os

# Generate violin plots for each pair
violin_dir = "eda/violin_plots"
# Create the output folder if it is missing so the image export cannot fail on it
os.makedirs(violin_dir, exist_ok=True)

for idx, (x_col, y_col, title) in enumerate(violin_vars):
    fig = plot_violin_plotly(heart_disease_clean_df, x_col, y_col, title, [color_palettes[idx % len(color_palettes)]])
    # save the plot - title is used as the filename
    fig.write_image(f"{violin_dir}/{title}.png")
    fig.show()



#### 1 Violin Plot of ST Depression (Oldpeak) by Exercise-Induced Angina

ST Depression (Oldpeak) by Exercise-Induced Angina

- **No Exercise-Induced Angina**: The distribution is centered around a median Oldpeak value of approximately 0.5, with a wide spread of values up to 6. There are several outliers above 4.
- **Yes Exercise-Induced Angina**: The distribution is centered around a median Oldpeak value of approximately 1.5, with a narrower spread compared to the "No" group. There are fewer outliers.
- **Observation**: Individuals with exercise-induced angina tend to have higher Oldpeak values, indicating more significant ST depression during exercise.

#### 2 Violin Plot of Resting Blood Pressure by Chest Pain Type

Resting Blood Pressure by Chest Pain Type

- **Typical Angina**: The median resting blood pressure is around 130 mmHg, with a wide spread of values.
- **Atypical Angina**: The median resting blood pressure is slightly lower, around 125 mmHg, with a narrower spread.
- **Non-Anginal Pain**: The median resting blood pressure is around 130 mmHg, similar to typical angina, but with a wider spread.
- **Asymptomatic**: The median resting blood pressure is around 140 mmHg, with a wide spread of values.
- **Observation**: Individuals with asymptomatic chest pain tend to have higher resting blood pressure compared to other chest pain types.

#### 3 Violin Plot of Age Distribution by Slope of Peak Exercise ST Segment

Age Distribution by Slope of Peak Exercise ST Segment

- **Downsloping**: The median age is around 60, with a wide spread of ages from 30 to 77.
- **Upsloping**: The median age is around 55, with a narrower spread compared to downsloping.
- **Flat**: The median age is around 58, with a wide spread of ages.
- **Observation**: The age distribution varies slightly with the slope of the peak exercise ST segment, with downsloping and flat slopes showing a wider age range compared to upsloping.

#### 4 Violin Plot of Maximum Heart Rate by Rest ECG Results

Maximum Heart Rate by Rest ECG Results
- **ST-T Wave Abnormality**: The distribution is centered around a median maximum heart rate of approximately 140 bpm, with a wide spread of values from 60 to 200 bpm.
- **Normal**: The distribution is centered around a median maximum heart rate of approximately 160 bpm, with a similar wide spread of values.
- **Left Ventricular Hypertrophy**: The distribution is centered around a median maximum heart rate of approximately 120 bpm, with a narrower spread compared to the other groups.
- **Observation**: Individuals with normal ECG results tend to have higher maximum heart rates, while those with left ventricular hypertrophy tend to have lower maximum heart rates.

#### 5 Violin Plot of Cholesterol Levels by Fasting Blood Sugar Levels

Cholesterol Levels by Fasting Blood Sugar Levels
- **Lower than 120 mg/ml**: The distribution is centered around a median cholesterol level of approximately 240 mg/dL, with a wide spread of values from 100 to 600 mg/dL.
- **Greater than 120 mg/ml**: The distribution is centered around a median cholesterol level of approximately 260 mg/dL, with a narrower spread compared to the "Lower" group.
- **Observation**: Individuals with fasting blood sugar levels greater than 120 mg/ml tend to have slightly higher cholesterol levels compared to those with lower fasting blood sugar levels.

#### 6 Violin Plot of Age Distribution by Sex

Age Distribution by Sex
- **Male**: The distribution is centered around a median age of approximately 55 years, with a wide spread of ages from 30 to 80 years.
- **Female**: The distribution is centered around a median age of approximately 60 years, with a similar wide spread of ages.
- **Observation**: The age distribution is slightly higher for females compared to males, with both groups showing a wide range of ages.

#### 7 Summary

- **Oldpeak by Exercise-Induced Angina**: Higher Oldpeak values in individuals with exercise-induced angina.
- **Resting Blood Pressure by Chest Pain Type**: Higher resting blood pressure in individuals with asymptomatic chest pain.
- **Age by Slope of Peak Exercise ST Segment**: Downsloping and flat slopes show a wider age range compared to upsloping.
- **Maximum Heart Rate by Rest ECG Results**: Higher maximum heart rates are associated with normal ECG results, while lower maximum heart rates are associated with left ventricular hypertrophy.
- **Cholesterol Levels by Fasting Blood Sugar Levels**: Higher cholesterol levels are associated with fasting blood sugar levels greater than 120 mg/ml.
- **Age Distribution by Sex**: Females tend to be slightly older on average compared to males.

In [87]:
# Display the de-duplicated frame (its index still carries the original row labels)
heart_disease_df

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholestoral,fasting_blood_sugar,rest_ecg,Max_heart_rate,exercise_induced_angina,oldpeak,slope,vessels_colored_by_flourosopy,thalassemia,target
0,52,Male,Typical angina,125,212,Lower than 120 mg/ml,ST-T wave abnormality,168,No,1.0,Downsloping,Two,Reversable Defect,0
1,53,Male,Typical angina,140,203,Greater than 120 mg/ml,Normal,155,Yes,3.1,Upsloping,Zero,Reversable Defect,0
2,70,Male,Typical angina,145,174,Lower than 120 mg/ml,ST-T wave abnormality,125,Yes,2.6,Upsloping,Zero,Reversable Defect,0
3,61,Male,Typical angina,148,203,Lower than 120 mg/ml,ST-T wave abnormality,161,No,0.0,Downsloping,One,Reversable Defect,0
4,62,Female,Typical angina,138,294,Greater than 120 mg/ml,ST-T wave abnormality,106,No,1.9,Flat,Three,Fixed Defect,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,68,Female,Non-anginal pain,120,211,Lower than 120 mg/ml,Normal,115,No,1.5,Flat,Zero,Fixed Defect,1
733,44,Female,Non-anginal pain,108,141,Lower than 120 mg/ml,ST-T wave abnormality,175,No,0.6,Flat,Zero,Fixed Defect,1
739,52,Male,Typical angina,128,255,Lower than 120 mg/ml,ST-T wave abnormality,161,Yes,0.0,Downsloping,One,Reversable Defect,0
843,59,Male,Asymptomatic,160,273,Lower than 120 mg/ml,Normal,125,No,0.0,Downsloping,Zero,Fixed Defect,0


## Parallel coordinates plot

Up to a maximum heart rate of about 150, the majority of the coordinates (lines) are black.

In [88]:
# compute [average, 1/4 quartile, 2/4 quartile median, 3/4 quartile] for columns


# change the function to return a dataframe
def compute_quantiles(df):
    """Summarise every numeric column of `df` by its mean and quartiles.

    Args:
        df: DataFrame to summarise.

    Returns:
        DataFrame with one row per integer or float column and the columns
        'Column', 'Average', 'Q1', 'Median', 'Q3'. All other columns
        (text, boolean, ...) are skipped. The result is empty, with the same
        column names, when `df` has no numeric columns.
    """
    is_integer = pd.api.types.is_integer_dtype
    is_float = pd.api.types.is_float_dtype

    rows = []
    for col in df.columns:
        series = df[col]

        # Accept integer/float columns of any width (int32, float32, ...),
        # not only the exact dtypes 'int64' and 'float64'
        if not (is_integer(series) or is_float(series)):
            continue

        # One call yields the three quartiles in the requested order
        q1, median, q3 = series.quantile([0.25, 0.5, 0.75])
        rows.append([col, series.mean(), q1, median, q3])

    return pd.DataFrame(rows, columns=['Column', 'Average', 'Q1', 'Median', 'Q3'])

# Mean and quartiles of the numeric columns of the de-duplicated data

statistics = compute_quantiles(heart_disease_df)
statistics



Unnamed: 0,Column,Average,Q1,Median,Q3
0,age,54.42053,48.0,55.5,61.0
1,resting_blood_pressure,131.602649,120.0,130.0,140.0
2,cholestoral,246.5,211.0,240.5,274.75
3,Max_heart_rate,149.569536,133.25,152.5,166.0
4,oldpeak,1.043046,0.0,0.8,1.6
5,target,0.543046,0.0,1.0,1.0


In [89]:
# Numeric feature columns (label 'target' left out) passed to the box-plot cell below
columns_to_include = ['age', 'resting_blood_pressure', 'cholestoral', 'Max_heart_rate', 'oldpeak']

In [162]:
# Age vs. cholesterol scatter, coloured by heart-disease status
def plot_parallel_coordinates(df, quantiles, column_mapping):
    """Scatter cholesterol against age, coloured by heart-disease status; save and show it.

    Despite its name, this draws a scatter plot, not a parallel-coordinates plot.

    Args:
        df: DataFrame with 'age', 'cholestoral' and 'target' columns; 'target'
            values 0 and 1 are shown as 'no' and 'yes'.
        quantiles: Not used; kept so existing calls keep working.
        column_mapping: Not used; kept so existing calls keep working.
    """
    import os

    status_labels = {0: 'no', 1: 'yes'}
    # color_discrete_map is keyed by the values of the colour column (the labels
    # above) and maps them to colours: green = no heart disease, red = heart disease
    status_colors = {'no': 'green', 'yes': 'red'}

    fig = px.scatter(
        df,
        x='age',
        y='cholestoral',
        color=df['target'].map(status_labels),
        color_discrete_map=status_colors,
        title='Cholesterol Levels by Age and Heart Disease Status'
    )

    # Large, mostly transparent markers so overlapping points stay visible
    fig.update_traces(
        marker=dict(
            size=9,
            opacity=0.2,
        )
    )

    fig.update_layout(
        legend_title_text='Heart Disease',
        xaxis_title='Age',
        yaxis_title='Cholesterol',
        legend_traceorder='normal'
    )

    # Create the output folder if it is missing, then save
    os.makedirs("eda", exist_ok=True)
    fig.write_image("eda/parallel_coordinates_plot.png")

    fig.show()

# Draw the age vs. cholesterol scatter; `statistics` and `column_mapping` are passed
# but the function body does not read them
plot_parallel_coordinates(heart_disease_df, statistics, column_mapping)

1. **Age**:
- The age range spans from 29 to 77 years.
- There is no clear distinction in age distribution between individuals with and without heart disease.

2. **Resting Blood Pressure**:
- Values range from 94 to 200 mmHg.
- Individuals with heart disease (red lines) tend to have higher resting blood pressure compared to those without heart disease (blue lines).

3. **Cholesterol**:
- Cholesterol levels range from 126 to 564 mg/dL.
- There is a wide spread of cholesterol levels for both groups, but individuals with heart disease show a tendency towards higher cholesterol levels.

4. **Max Heart Rate**:
- The maximum heart rate achieved ranges from 71 to 202 bpm.
- Individuals with heart disease generally have lower maximum heart rates compared to those without heart disease.

5. **Oldpeak**:
- Oldpeak values range from 0 to 6.2.
- Individuals with heart disease tend to have higher Oldpeak values, indicating more significant ST depression during exercise.

In [91]:
# show boxplots for selected columns, separate plots
def plot_boxplots(df, columns, column_mapping):
    """Draw, save and show one box plot per column, split by the 'target' column.

    Args:
        df: DataFrame containing 'target' and every column in `columns`.
        columns: Names of the columns to plot; each gets its own figure,
            saved as 'eda/boxplots/<column>.png'.
        column_mapping: Raw column name -> readable label, used for the
            figure title and the y-axis title.

    Raises:
        KeyError: if a column in `columns` has no entry in `column_mapping`.
    """
    import os

    output_dir = "eda/boxplots"
    # Create the output folder if it is missing so the image export cannot fail on it
    os.makedirs(output_dir, exist_ok=True)

    for col in columns:
        readable_name = column_mapping[col]

        fig = px.box(
            df,
            x='target',
            y=col,
            color='target',
            title=f'Boxplot of {readable_name} by Target',
            labels=column_mapping
        )

        # update layout
        fig.update_layout(
            xaxis_title='Heart Disease',
            yaxis_title=readable_name,
        )

        # save the plot
        fig.write_image(f"{output_dir}/{col}.png")

        fig.show()

# One box plot per selected numeric column, split by heart-disease label
plot_boxplots(heart_disease_df, columns_to_include, column_mapping)


### 1. Boxplot of Age by Heart Disease

- **Heart Disease (0)**: The median age is around 55, with the interquartile range (IQR) extending from approximately 48 to 60. There is one outlier below 40.
- **Heart Disease (1)**: The median age is slightly higher, around 58, with the IQR extending from approximately 52 to 65. The age range is broader, extending from around 30 to 77.
- **Observation**: Individuals with heart disease tend to be slightly older on average compared to those without heart disease.

### 2. Oldpeak by Heart Disease
- **Oldpeak** measures ST depression induced by exercise relative to rest.
- **Heart Disease (0)**: The median Oldpeak value is around 1, with the interquartile range (IQR) extending from approximately 0.5 to 2. There are a few outliers above 4.
- **Heart Disease (1)**: The median Oldpeak value is lower, around 0.5, with the IQR extending from approximately 0 to 1.5. There are several outliers above 2.
- **Observation**: Individuals without heart disease tend to have higher Oldpeak values compared to those with heart disease.

### 3. Max Heart Rate by Heart Disease
- **Max Heart Rate** measures the maximum heart rate achieved during exercise.
- **Heart Disease (0)**: The median max heart rate is around 150, with the IQR extending from approximately 130 to 170. There are a few outliers below 100.
- **Heart Disease (1)**: The median max heart rate is lower, around 140, with the IQR extending from approximately 120 to 160. There are several outliers below 120.
- **Observation**: Individuals with heart disease tend to have lower maximum heart rates compared to those without heart disease.

### 4. Resting Blood Pressure by Heart Disease
- **Resting Blood Pressure** measures the blood pressure at rest.
- **Heart Disease (0)**: The median resting blood pressure is around 130, with the IQR extending from approximately 120 to 140. There are a few outliers above 180.
- **Heart Disease (1)**: The median resting blood pressure is higher, around 140, with the IQR extending from approximately 130 to 160. There are several outliers above 180.
- **Observation**: Individuals with heart disease tend to have higher resting blood pressure compared to those without heart disease.

### 5. Cholesterol by Heart Disease
- **Cholesterol** measures the cholesterol level in the blood.
- **Heart Disease (0)**: The median cholesterol level is around 240, with the IQR extending from approximately 210 to 270. There are a few outliers above 400.
- **Heart Disease (1)**: The median cholesterol level is slightly lower, around 230, with the IQR extending from approximately 200 to 260. There are several outliers above 300.
- **Observation**: Individuals with heart disease tend to have slightly lower cholesterol levels compared to those without heart disease, but the difference is not as pronounced as in other metrics.

### Summary
- **Oldpeak** and **Max Heart Rate** show a clear distinction between individuals with and without heart disease, with lower Oldpeak and lower max heart rates associated with heart disease.
- **Resting Blood Pressure** is higher in individuals with heart disease.
- **Cholesterol** levels are slightly lower in individuals with heart disease, but the difference is less significant.


## Pairplot

In [92]:
# Short alias for the de-duplicated frame, used by the pair-plot cell below
df = heart_disease_df

In [176]:
# create subplots for scatter plots
import plotly.express as px
from plotly.subplots import make_subplots


# create function to add traces
def add_trace(fig, df, x_col, y_col, row, col):
    """Add a marker scatter of `y_col` against `x_col` to one subplot cell of `fig`.

    Args:
        fig: Subplot figure to draw into (modified in place).
        df: DataFrame containing both columns.
        x_col: Column plotted on the x axis; also used as that axis' title.
        y_col: Column plotted on the y axis; also used as that axis' title.
        row: 1-based subplot row.
        col: 1-based subplot column.

    Note: the hover setting is applied to every trace currently in `fig`,
    not only to the one added here.
    """
    fig.add_trace(go.Scatter(x=df[x_col], y=df[y_col], mode='markers'), row=row, col=col)
    # update axes titles
    fig.update_xaxes(title_text=x_col, row=row, col=col)
    fig.update_yaxes(title_text=y_col, row=row, col=col)
    # remove 'trace 0' from tooltip
    fig.update_traces(hoverinfo='x+y')


#  Columns to include in the pairplot
columns_to_include = ['age', 'resting_blood_pressure', 'cholestoral', 'Max_heart_rate', 'oldpeak']

# Create subplots: one row and one column per variable
num_cols = len(columns_to_include)
fig = make_subplots(rows=num_cols, cols=num_cols,
                    shared_xaxes=True, shared_yaxes=False,
                    vertical_spacing=0.02, horizontal_spacing=0.02)

# Add scatter plots and histograms to the subplots
for i, col1 in enumerate(columns_to_include):
    for j, col2 in enumerate(columns_to_include):
        if i == j:
            # Histogram of the variable on the diagonal
            trace = go.Histogram(x=df[col1], name=col1, showlegend=False)
        else:
            # Scatter of the row variable (y) against the column variable (x),
            # coloured by the heart-disease label
            trace = go.Scatter(x=df[col2], y=df[col1], mode='markers',
                               marker=dict(color=df['target'], colorscale='Tealrose', showscale=False),
                               showlegend=False)
        fig.add_trace(trace, row=i+1, col=j+1)

        # Axis labels (full column names): x titles on the bottom row only,
        # y titles on the first column only
        if i == num_cols - 1:
            fig.update_xaxes(title_text=column_mapping[col2], row=i+1, col=j+1)
        if j == 0:
            fig.update_yaxes(title_text=column_mapping[col1], row=i+1, col=j+1)

# Figure-wide settings, applied once after all traces exist instead of on
# every pass through the loop
fig.update_traces(hoverinfo='x+y')        # tooltip shows only x and y
fig.update_xaxes(showticklabels=True)     # show tick labels on every x axis

# Update layout
fig.update_layout(height=1500, width=1500, title_text="Pairplot of Selected Variables")

# Show the plot
fig.show()


In [177]:
import os

# save the plot to html, use cdn
# fig.write_html("eda/pairplot.html", include_plotlyjs='cdn')
# save to png; create the output folder first if it is missing
os.makedirs("eda", exist_ok=True)
fig.write_image("eda/pairplot.png")
