In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the heart-disease CSV from the Kaggle input directory and preview the first rows
heart_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-disease-data/heart_disease_uci.csv'
)
heart_data.head(5)

# Exploratory Data Analysis

In [None]:
# Drop the id and dataset columns.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
heart_data = heart_data.drop(columns=['id', 'dataset'], errors='ignore')
heart_data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the frame
numeric_summary = heart_data.describe()
numeric_summary

In [None]:
# Group the feature names by kind so that each group can be visualized
# with a plot type that suits it
CATEGORICAL_COLS = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']
NUMERICAL_COLS = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']

heart_cat, heart_num = heart_data[CATEGORICAL_COLS], heart_data[NUMERICAL_COLS]

# Number of distinct levels observed for each categorical feature
heart_cat.nunique()

In [None]:
# Count plot for every categorical variable, laid out on a 2x4 grid.
# The dict preserves insertion order, so panels are filled row by row.
categorical_titles = {
    'sex': 'Gender Distribution',
    'cp': 'Chest Pain Types',
    'fbs': 'Fasting Blood Sugar > 120 mg/dl',
    'restecg': 'Resting Electrocardiographic Results',
    'exang': 'Exercise Induced Angina',
    'slope': 'Slope of the Peak Exercise ST Segment',
    'thal': 'Defects',
    'ca': 'Number of Major Vessels colored by Fluoroscopy',
}

fig, axes = plt.subplots(2, 4, figsize=(20,10))

for panel, (column, title) in zip(axes.flat, categorical_titles.items()):
    sns.countplot(x=column, data=heart_cat, ax=panel)
    panel.set_title(title)

# Only the chest-pain panel gets its tick labels rotated
axes[0,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Scatter each numeric measurement against age on a 2x2 grid
age_scatter_titles = {
    'chol': 'Age Against Cholesterol Levels',
    'trestbps': 'Age Against Resting Blood Pressure',
    'thalch': 'Age Against Maximum Heart Rate Achieved',
    'oldpeak': 'Age Against ST Depression',
}

fig, axes = plt.subplots(2, 2, figsize=(10,10))

for panel, (measurement, title) in zip(axes.flat, age_scatter_titles.items()):
    heart_num.plot.scatter(x='age', y=measurement, ax=panel)
    panel.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Maximum heart rate against cholesterol, drawn three times with a different
# hue each time: target class, sex and resting ECG result.
fig, axes = plt.subplots(3, figsize=(7,10))

sns.scatterplot(x='chol', y='thalch', hue='num', data=heart_data, ax=axes[0])
# Title wording fixed: "Effect" (noun) rather than "Affect"
axes[0].set_title('Effect of Cholesterol on Maximum Heart Rate')

sns.scatterplot(x='chol', y='thalch', hue='sex', data=heart_data, ax=axes[1])

sns.scatterplot(x='chol', y='thalch', hue='restecg', data=heart_data, ax=axes[2])

# Keep the stacked panels' titles and axis labels from overlapping, as the
# other multi-panel figures in this notebook already do
plt.tight_layout()
plt.show()

In [None]:
# Resting blood pressure against maximum heart rate, colored by resting ECG result
sns.scatterplot(
    data=heart_data,
    x='trestbps',
    y='thalch',
    hue='restecg',
    ax=plt.gca(),
)
plt.show()

In [None]:
# Resting blood pressure against cholesterol, drawn three times with a
# different hue each time: target class, sex and resting ECG result.
fig, axes = plt.subplots(3, figsize=(7,10))

# Title wording fixed: "Effect" (noun) rather than "Affect"
axes[0].set_title('Effect of Cholesterol on Resting Blood Pressure')
sns.scatterplot(x='chol', y='trestbps', hue='num', data=heart_data, ax=axes[0])
sns.scatterplot(x='chol', y='trestbps', hue='sex', data=heart_data, ax=axes[1])
sns.scatterplot(x='chol', y='trestbps', hue='restecg', data=heart_data, ax=axes[2])

plt.tight_layout()
plt.show()

In [None]:
# Per-class means of the numeric features.
# numeric_only=True is required on pandas >= 2.0, where averaging the
# non-numeric (string/object) columns raises a TypeError instead of silently
# dropping them.
heart_data.groupby('num').mean(numeric_only=True)

In [None]:
# Mean cholesterol per target class, broken down by three categorical features.
# aggfunc is passed as the string 'mean': handing pandas the numpy callable
# (np.mean) is deprecated in pandas 2.x and emits a FutureWarning.
chol_breakdowns = [
    ('Average Cholesterol Level Based on Target Variable and Chest Pain Type', 'cp'),
    ('Average Cholesterol Level Based on Target Variable and Patient Gender', 'sex'),
    ('Average Cholesterol Level Based on Target Variable and Cardiographic Results', 'restecg'),
]

for position, (heading, column) in enumerate(chol_breakdowns):
    # Blank lines separate consecutive tables (none before the first one)
    if position:
        print('\n')
    print(heading)
    print(pd.crosstab(index=heart_data['num'], columns=heart_data[column], values=heart_data['chol'], aggfunc='mean'))

In [None]:
# Display correlation matrix and heatmap.
# numeric_only=True restricts the computation to numeric columns; on
# pandas >= 2.0 a bare corr() raises on the string-valued columns
# (the keyword is available from pandas 1.5 onwards).
corr = heart_data.corr(numeric_only=True)
print(corr)

sns.heatmap(corr)
plt.show()

In [None]:
# Box-and-whisker plot of the frame's columns to make outliers visible
boxplot_axes = heart_data.boxplot()
plt.show()

In [None]:
# Rows whose recorded cholesterol is exactly zero
heart_data[heart_data['chol'].eq(0)]

# Data Cleaning

In this section we will drop columns with a considerable amount of missing data as well as impute data where necessary. We do not wish to drop any outliers beyond logical reason. For example, we may not drop or impute values for patients with cholesterol levels ~500 since such high values make sense for the given data set. However, patients with cholesterol levels at 0 may be erroneous entries.

In [None]:
# Inspect missing-value counts before cleaning.
# heart_df is only created by the imputation cells further down, so on a fresh
# kernel this cell must look at heart_data to avoid a NameError.
heart_data.info()

In [None]:
# Cholesterol Levels
# Missing and zero readings are both replaced by the median of the remaining values.

chol_is_zero = heart_data['chol'].eq(0)
median_chol = heart_data.loc[~chol_is_zero, 'chol'].median()
heart_df = heart_data.assign(
    chol=heart_data['chol'].mask(chol_is_zero).fillna(median_chol)
)

In [None]:
# Resting Blood Pressure
# Missing and zero readings are both replaced by the mean of the remaining values.

bp_is_zero = heart_df['trestbps'].eq(0)
mean_bp = heart_df.loc[~bp_is_zero, 'trestbps'].mean()
heart_df = heart_df.assign(
    trestbps=heart_df['trestbps'].mask(bp_is_zero).fillna(mean_bp)
)

In [None]:
# Maximum Heart Rate
# Missing and zero readings are both replaced by the mean of the remaining values.

hr_is_zero = heart_df['thalch'].eq(0)
mean_hr = heart_df.loc[~hr_is_zero, 'thalch'].mean()
heart_df = heart_df.assign(
    thalch=heart_df['thalch'].mask(hr_is_zero).fillna(mean_hr)
)

In [None]:
# Old Peak (ST depression)
# Only missing values are imputed here. Unlike cholesterol or blood pressure,
# an oldpeak of 0 is a plausible measurement (no ST depression), and the mean
# below is computed with those zeros included, so overwriting them with the
# mean would distort valid readings rather than repair bad ones.

mean_peak = heart_df['oldpeak'].mean()
heart_df = heart_df.fillna(value={'oldpeak': mean_peak})

In [None]:
# Drop columns with a great number of missing values, drop the remaining rows
# with missing values, and only then reassign datatypes.
#
# Order matters: casting to 'bool' before dropna() would turn any NaN in
# fbs/exang into True (NaN is truthy), so those rows would silently survive
# dropna() carrying a fabricated value.
# errors='ignore' plus rebinding keeps the cell re-runnable.
heart_df = (
    heart_df
    .drop(columns=['ca', 'thal', 'slope'], errors='ignore')
    .dropna()
    .astype({'sex':'category', 'cp':'category', 'fbs':'bool', 'restecg':'category', 'exang':'bool'})
)

# Display distribution for target variables.
# x must be passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so countplot('num', data=...) raises a TypeError there.
sns.countplot(x='num', data=heart_df)
plt.show()

As we can see, our dataset is greatly imbalanced. We must be mindful that the accuracy of the models we implement will be misleading. To tackle this issue, we will explore several methods for dealing with imbalanced data, including SMOTE oversampling and adjusting class weights.

## Preparing the Data for Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# One hot encode the categorical variables and split the target and independent variables
heart_onehot = pd.get_dummies(heart_df, columns=['sex','cp', 'fbs', 'restecg', 'exang'])

X = heart_onehot.drop(columns='num')
y = heart_onehot['num']

# stratify=y keeps the class proportions of the imbalanced target the same in
# the train and test partitions, so rare classes cannot vanish from either side.
# NOTE(review): stratification needs at least two samples per class - confirm
# this still holds after the cleaning steps.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

y_train.value_counts()

In [None]:
# Print the structure summary of the one-hot encoded frame
heart_onehot.info()

## Decision Tree Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline Decision Tree without any class weighting.
# (The previously defined `weights` dict was never passed to the model and has
# been removed.) random_state is fixed so the fitted tree, and therefore the
# report below, is reproducible across runs.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
# Perform Decision Tree model with class weighting.
# class_weight='balanced' lets scikit-learn derive the weights from the class
# frequencies in y_train. The hand-written `weights` dict that used to sit here
# was never passed to the classifier, so it has been removed to avoid
# suggesting that it influenced the model.
# random_state is fixed for reproducible results.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, class_weight='balanced', random_state=42)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

## Gradient Boosting

In [None]:
# Gradient boosted trees with a small learning rate.
# random_state is fixed so that repeated runs produce the same model and report.
gradient_booster = GradientBoostingClassifier(learning_rate=0.02, max_depth=3, n_estimators=150, random_state=42)
gradient_booster.fit(X_train, y_train)
y_pred = gradient_booster.predict(X_test)

print(classification_report(y_test, y_pred))

## Random Forest Classifier

In [None]:
# Random forest without class weighting.
# Bootstrap sampling and feature sub-sampling are random, so random_state is
# fixed to make the reported metrics reproducible.
clf = RandomForestClassifier(n_estimators=150, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
# Random forest with class weights recomputed on each tree's bootstrap sample.
# random_state is fixed so the comparison with the unweighted forest is not
# blurred by run-to-run randomness.
clf = RandomForestClassifier(n_estimators=150, class_weight='balanced_subsample', random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

## Decision Tree With SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample every class except the majority one on the training split only.
# SMOTE synthesizes samples using randomness, so random_state is fixed to make
# the resampled training set (and every model trained on it) reproducible.
smt = SMOTE(sampling_strategy='not majority', random_state=42)

print('Before', y_train.value_counts())

X_train_SM, y_train_SM = smt.fit_resample(X_train, y_train)

# Class labels and their counts after resampling
val, counter = np.unique(y_train_SM, return_counts=True)
print('After', (val, counter))

In [None]:
# Decision Tree trained on the SMOTE-resampled training data and evaluated on
# the untouched test split. random_state is fixed for reproducible results.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=6, random_state=42)
clf.fit(X_train_SM, y_train_SM)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

# Conclusion
Imbalanced multi-class classification problems can prove to be a challenge for data scientists looking to provide accurate models for real-world problems. Using accuracy is misleading in this case as it does not describe the performance of our model across each of the target classes. The f1 score is a more reliable metric as it is the harmonic mean of precision (the percentage of positive predictions that are correct) and recall (the percentage of actual positives that are correctly identified), computed for each target class.

The two techniques used for our models (SMOTE and Class-Weighting) did not show major improvements to our results. We note that the Decision Tree with SMOTE performed worse than the original model. Class-Weighting did not raise the accuracy of our model, however we notice a slight improvement in the f1 score of the minority classes. 

***Moving Forward***\
After running these models, there are several steps we can take to improve our performance. I would love to use hyperparameter tuning methods such as GridSearch or Bayesian Optimization to fine-tune the models. I believe that class weighting is a powerful tool and I plan on conducting a survey centered around classification models for various weights. While the results for SMOTE were disappointing, I believe it was due to the lack of original data for the minority classes (there were roughly 20 patients identified with a type 4 heart disease). Lack of data on such a scale would surely diminish the effectiveness of SMOTE. I am curious to see how SMOTE performs on a slightly larger dataset. I would also like to see how the model would perform with oversampling + cross validation.

As a parting notion, I believe that reworking this dataset into a binary classification problem would yield more promising results and is an exercise that I wish to tackle in the near future. If you've made it to this point, thanks for joining me on my data science journey. Feel free to leave any criticisms or suggestions as all feedback is welcome!