In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
# Raw heart-disease data, addressed relative to this notebook's directory.
RAW_DATA_PATH = '../data/raw/heart.csv'

df = pd.read_csv(RAW_DATA_PATH)
df.head()

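From reading the Kaggle details on the dataset, these columns are not continuous:
- sex = binary
- cp (chest pain type) = categorical <--- one-hot
- fbs (fasting blood sugar > 120 mg/dl) = binary
- restecg (resting ECG results) = categorical <--- one-hot
- exang (exercise-induced angina) = binary
- thal (0 = normal; 1 = fixed defect; 2 = reversible defect) = categorical <--- one-hot

Note: the encoded data also contains a `thal_3` column (it is used in the correlation check at the end of this notebook), so the value coding listed above for `thal` does not cover every level present in the file.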



In [None]:
# One-hot encode the categorical columns, emitting the dummies as ints rather than bools.
# pd.get_dummies returns a new frame and leaves `df` untouched, so no defensive copy is needed.
one_hot_cols = ['cp', 'restecg', 'thal']
df_encoded = pd.get_dummies(df, columns=one_hot_cols, dtype=int)
df_encoded.head()

In [None]:
def normalise(col: pd.Series) -> pd.Series:
    """Standardise a column to zero mean and unit (sample) standard deviation.

    Parameters
    ----------
    col : pd.Series
        Numeric column to standardise.

    Returns
    -------
    pd.Series
        ``(col - mean) / std``. A constant column (std of exactly 0) is
        returned as its centred values, i.e. zeros, instead of the NaNs
        that 0/0 would produce.
    """
    centred = col - col.mean()
    spread = col.std()
    if spread == 0:
        # No variation to scale by; dividing would turn the whole column into NaN.
        return centred
    return centred / spread

df_encoded_normalised = df_encoded.copy()

# A column is treated as binary when all of its non-null values are 0 or 1.
# That covers both the one-hot dummies and the columns that were binary to begin with;
# everything else is standardised.
columns_to_scale = [
    name
    for name in df_encoded_normalised.columns
    if not np.isin(df_encoded_normalised[name].dropna().unique(), [0, 1]).all()
]

for name in columns_to_scale:
    df_encoded_normalised[name] = normalise(df_encoded_normalised[name])

df_encoded_normalised.info()


In [None]:
# Persist the preprocessed frame for the modelling notebooks.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# confirm that downstream readers expect it (e.g. read with index_col=0).
preprocessed_path = '../data/preprocessed/heart_preprocessed.csv'
df_encoded_normalised.to_csv(preprocessed_path)

### VIF Analysis
After fitting a basic linear regression, I found the model to be a poor fit to the data ($R^2=0.42$). I am therefore doing further analysis to understand how to improve its performance, starting with a variance inflation factor (VIF) analysis to check for multicollinearity between the features.

In [None]:
# Plain alias rather than the `type` statement, which only parses on Python 3.12+.
DF = pd.DataFrame


def checking_vif(inputDataFrame: DF) -> DF:
    """Compute the variance inflation factor (VIF) of every column.

    Parameters
    ----------
    inputDataFrame : pd.DataFrame
        Frame whose columns can all be converted to float (numeric or boolean).

    Returns
    -------
    pd.DataFrame
        One row per input column, in input order, with the columns
        ``"feature"`` (the column name) and ``"VIF"`` (its VIF).

    Notes
    -----
    NOTE(review): the columns are passed to statsmodels exactly as given and
    no intercept column is added here - confirm whether a constant should be
    included before interpreting the values.
    """
    # Build the matrix once instead of on every iteration, and force a float
    # dtype so boolean dummy columns do not produce an object-dtype matrix.
    design_matrix = inputDataFrame.to_numpy(dtype=float)

    return pd.DataFrame({
        "feature": inputDataFrame.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, idx)
            for idx in range(design_matrix.shape[1])
        ],
    })

In [None]:
# The dummies of one category are linearly dependent when all are kept, so the
# '<name>_0' dummy of every one-hot encoded column is dropped.
reference_dummies = [f'{prefix}_0' for prefix in one_hot_cols]
df_cols_dropped_for_vif = df_encoded_normalised.drop(columns=reference_dummies)
df_cols_dropped_for_vif.info()

In [None]:
# VIF analysis over every column that is left after dropping the reference dummies.
# NOTE(review): if the label column is still part of this frame it is included in
# the computation - confirm that only predictors are meant to be analysed.
vif_report = checking_vif(df_cols_dropped_for_vif)
vif_report

In [None]:
# Pairwise correlation of the two thal dummies; left as the cell's last
# expression so the notebook renders it as a table instead of printed text.
df_cols_dropped_for_vif[['thal_2', 'thal_3']].corr()

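Observations: There is a fairly strong negative correlation between the `thal_2` and `thal_3` columns. Some negative correlation is expected, because both are dummies of the same categorical variable and a row can be 1 in at most one of them.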