# CARDIOVASCULAR DISEASE - CLASSIFICATION TECHNIQUES

This dataset describes patients using 11 explanatory variables plus one binary target variable (`cardio`).
The dataset is available at: https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset

Here I will compare several classification techniques:

- Random Forest model
- SVC model (default parameters, and tuned with GridSearchCV)
- KNN model


Data features:

Age | Objective Feature | age | int (days)

Height | Objective Feature | height | int (cm) |

Weight | Objective Feature | weight | float (kg) |

Gender | Objective Feature | gender | categorical code |

Systolic blood pressure | Examination Feature | ap_hi | int |

Diastolic blood pressure | Examination Feature | ap_lo | int |

Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |

Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |

Smoking | Subjective Feature | smoke | binary |

Alcohol intake | Subjective Feature | alco | binary |

Physical activity | Subjective Feature | active | binary |

Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the semicolon-separated export, using the `id` column as the row index,
# then preview the first ten rows with an orange gradient.
train = pd.read_csv('cardio_train.csv', sep=';', index_col='id')
train.head(10).style.background_gradient(cmap="Oranges")

In [None]:
# Column dtypes and non-null counts: a first check for missing values.
train.info()

In [None]:
# Visual missing-value check: the heatmap colours each cell by whether it is null.
plt.figure(figsize=(15, 15))
null_mask = train.isna()
sns.heatmap(null_mask, cmap='viridis')

Adding two useful columns: body mass index and blood pressure category

In [None]:
# Body mass index = weight (kg) / height (m)^2; height is stored in cm, hence the /100.
height_m = train['height'] / 100
bmi = (train['weight'] / height_m ** 2).round(2)
train.insert(8, 'bmi', bmi)

In [None]:
def _bp_grade(value, bands, top):
    """Return the grade of the first band whose upper bound is >= value, else `top`."""
    for upper, grade in bands:
        if value <= upper:
            return grade
    return top


def BPCategorize(x,y):
    """Map a blood-pressure reading to an ordinal severity category (0-4).

    Each reading is graded on its own scale and the more severe grade is
    returned, so a normal value on one reading can never mask a high value
    on the other (e.g. 150/70 is category 3, 200/85 is category 4).

    NOTE(review): the thresholds look like the AHA blood-pressure categories
    (normal / elevated / stage 1 / stage 2 / crisis) - confirm before relabelling.

    Parameters
    ----------
    x : number
        Systolic pressure (the `ap_hi` column).
    y : number
        Diastolic pressure (the `ap_lo` column).

    Returns
    -------
    int or None
        0: x <= 120 and y <= 80
        1: x in (120, 129] and y <= 80
        2: x in (129, 139] or y in (80, 89], with neither reading higher
        3: x in (139, 180] or y in (89, 120], with neither reading higher
        4: x > 180 or y > 120
        None when either reading is NaN.
    """
    # NaN compares False against every threshold; report it as "unknown" explicitly.
    if x != x or y != y:
        return None

    systolic_grade = _bp_grade(x, ((120, 0), (129, 1), (139, 2), (180, 3)), 4)
    # Diastolic has no separate grade-1 band: anything up to 80 is grade 0.
    diastolic_grade = _bp_grade(y, ((80, 0), (89, 2), (120, 3)), 4)

    return max(systolic_grade, diastolic_grade)
    
# Categorise every row from its systolic/diastolic pair, store the result as a
# new column at position 8, then show how many rows fall into each category.
bp_categories = train.apply(
    lambda patient: BPCategorize(patient['ap_hi'], patient['ap_lo']),
    axis=1,
)
train.insert(8, "bp_cat", bp_categories)
train['bp_cat'].value_counts()

In [None]:
# Display the frame to confirm the new bmi and bp_cat columns are in place.
train

In [None]:
# Number of rows that repeat an earlier row in every column.
train.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row and discard the rest.
train = train.drop_duplicates()

In [None]:
# Summary statistics, transposed so each column of `train` becomes one row.
train.describe().T.style.background_gradient(cmap = "magma")

In [None]:
# Pairwise correlation table (pandas default method), shaded by value.
train.corr().style.background_gradient(cmap = "magma")

In [None]:
# Same correlation matrix as an annotated heatmap; the trailing `;` hides the Axes repr.
plt.figure(figsize = [20, 10], clear = True, facecolor = "white")
sns.heatmap(train.corr(), annot = True, square = False, linewidths = 3,linecolor = "white", cmap = "Set2");

In [None]:
# Age is recorded in days; express it in years instead.
# NOTE: not idempotent - re-running this cell divides the column again.
train['age'] = train['age'].div(365)

In [None]:
# Sanity-check the converted ages (now in years).
train['age'].describe()

In [None]:
# Horizontal age histogram (20 bins) with side-by-side bars per gender code.
plt.figure(figsize = [10, 10], clear = True)
sns.histplot(y='age',data=train,bins=20,hue='gender',multiple='dodge',shrink=.6)

In [None]:
# Summary statistics for height (cm).
train['height'].describe()

In [None]:
# Summary statistics for weight (kg); the height summary is already shown in the previous cell.
train['weight'].describe()

In [None]:
# Summary statistics for systolic pressure, inspected before the range filter below.
train['ap_hi'].describe()

In [None]:
# Keep only rows whose systolic pressure lies in the interval (0, 220].
ap_hi_in_range = train['ap_hi'].gt(0) & train['ap_hi'].le(220)
train = train[ap_hi_in_range]

In [None]:
# Horizontal histogram of the filtered systolic pressure (10 bins), split by gender code.
plt.figure(figsize = [8, 4], clear = True)
sns.histplot(y='ap_hi',data=train,bins=10,hue='gender',multiple='dodge',shrink=.6)

In [None]:
# Summary statistics for diastolic pressure, inspected before the range filter below.
train['ap_lo'].describe()

In [None]:
# Keep only rows whose diastolic pressure lies in the interval (0, 200].
ap_lo_in_range = train['ap_lo'].gt(0) & train['ap_lo'].le(200)
train = train[ap_lo_in_range]

In [None]:
# Horizontal histogram of the filtered diastolic pressure (10 bins), split by gender code.
plt.figure(figsize = [8, 4], clear = True)
sns.histplot(y='ap_lo',data=train,bins=10,hue='gender',multiple='dodge',shrink=.6)

In [None]:
# Pairwise scatter plots for every column pair (corner=True draws only the lower
# triangle) with histograms on the diagonal. Draws one panel per column pair on the full frame.
sns.pairplot(train, diag_kind = "hist", height = 4, aspect = 1, corner = True);

In [None]:
# One 12-bin histogram per column.
train.hist(figsize = (20, 20), bins = 12, legend = False);

In [None]:
# Histogram over age with the `smoke` flag aggregated per bin, coloured by gender
# code, with a box-plot marginal.
# No plt.figure() here: Plotly builds its own figure, so opening a Matplotlib
# canvas first would only emit an extra, empty plot.
fig = px.histogram(train, x = "age",
                   y = "smoke",
                   marginal = "box",
                   color = "gender", hover_data  = train.columns)
fig.show()


In [None]:
# Density heatmap of `cardio` aggregated over (active, age) bins, with the value printed in each cell.
# No plt.figure() here: Plotly builds its own figure, so opening a Matplotlib
# canvas first would only emit an extra, empty plot.
fig = px.density_heatmap(train, x = "active", y = "age", z = "cardio",
                        color_continuous_scale = "picnic", text_auto = True)
fig.show()

In [None]:
# Boxen plot of age for each gender code, split by the cardio flag.
sns.catplot(x = "gender",
            y = "age",
            hue = "cardio",
            kind = "boxen",
            color = '#B3EE22',
            data = train, saturation = 1, height = 4, aspect = 1.3,
            margin_titles = True).set(title = "cardio by gender and age");

In [None]:
# Boxen plot of age for each gender code, split by the smoke flag.
sns.catplot(x = "gender",
            y = "age",
            hue = "smoke",
            kind = "boxen",
            color = '#468A85',
            data = train, saturation = 1, height = 4, aspect = 1.3,
            margin_titles = True).set(title = "smoke by gender and age");

In [None]:
# Interactive 3-D scatter of weight, age and the smoke flag, coloured by the cardio target.
fig = px.scatter_3d(train, 
                    x = "weight",
                    y = "age",
                    z = "smoke",
                    color="cardio")
fig.show();

In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler #for standardization
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep the old name
    # working by delegating to its replacement.
    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
!pip install prettytable
from prettytable import PrettyTable
# to ignore warnings
import warnings
warnings.filterwarnings("ignore")

Select the dependent variable (label) and the independent variables (features)

In [None]:
# Features: every column except the target.
X = train.drop(columns="cardio")

# Label: the binary cardio target.
y = train["cardio"]

Split the Dataset into Train and Test Sets


In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

Feature scaling (min-max normalization)

In [None]:
# Fit the min-max scaler on the training split only, then apply the same
# scaling to the test split so no test information leaks into the fit.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Random Forest model

In [None]:
# Random forest with 1000 trees, depth capped at 10 and a fixed seed.
rf_model = RandomForestClassifier(n_estimators = 1000,max_depth=10,random_state=0)
rf_model.fit(X_train, y_train)

Classification report of model

In [None]:
# Predict on the held-out split and print per-class precision, recall and F1.
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Raw-count confusion matrix for the random forest (rows: true class, columns: predicted class).
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

sns.heatmap(conf_mat, square = True, annot = True, robust = True)
plt.show()

In [None]:
# Row-normalised confusion matrix for the random forest on the held-out split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was removed in scikit-learn 1.2.
ConfusionMatrixDisplay.from_estimator(rf_model, X_test, y_test, cmap = plt.cm.Blues, normalize = "true")

Support Vector Classifier model


In [None]:
# Support vector classifier with scikit-learn's default hyper-parameters (baseline for the grid search below).
svc_model = SVC()

In [None]:
# Train the baseline SVC on the scaled training split.
svc_model.fit(X_train,y_train)

Classification report of model

In [None]:
# Predict on the held-out split and print per-class precision, recall and F1.
y_pred = svc_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Raw-count confusion matrix for the baseline SVC (rows: true class, columns: predicted class).
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

sns.heatmap(conf_mat, square = True, annot = True, robust = True)
plt.show()

In [None]:
# Row-normalised confusion matrix for the baseline SVC on the held-out split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was removed in scikit-learn 1.2.
ConfusionMatrixDisplay.from_estimator(svc_model, X_test, y_test, cmap = plt.cm.Blues, normalize = "true")

In [None]:
# Candidate values for the SVC's C and gamma hyper-parameters (2 x 2 = 4 combinations).
param_grid={'C':[100,150],'gamma':[0.0001,0.00001]}


In [None]:
# Exhaustive search over param_grid using GridSearchCV's default cross-validation;
# verbose=20 logs progress for every fit.
grid = GridSearchCV(SVC(),param_grid,verbose=20)
grid.fit(X_train,y_train)

Classification report of model

In [None]:
# Predict with the fitted grid search and print per-class precision, recall and F1.
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))

In [None]:
# Raw-count confusion matrix for the tuned SVC (rows: true class, columns: predicted class).
conf_mat = confusion_matrix(y_test, grid_predictions)
print(conf_mat)

sns.heatmap(conf_mat, square = True, annot = True, robust = True)
plt.show()

In [None]:
# Row-normalised confusion matrix for the tuned SVC on the held-out split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was removed in scikit-learn 1.2.
ConfusionMatrixDisplay.from_estimator(grid, X_test, y_test, cmap = plt.cm.Blues, normalize = "true")

K Neighbors Classifier

In [None]:
# k = 1: each prediction copies the label of the single nearest training point.
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Fit the KNN classifier on the scaled training split.
knn.fit(X_train,y_train)

In [None]:
# Predict on the held-out split (overwrites the y_pred left by the earlier models).
y_pred = knn.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the KNN predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Raw-count confusion matrix for KNN (rows: true class, columns: predicted class).
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

sns.heatmap(conf_mat, square = True, annot = True, robust = True)
plt.show()

In [None]:
# Row-normalised confusion matrix for KNN on the held-out split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was removed in scikit-learn 1.2.
ConfusionMatrixDisplay.from_estimator(knn, X_test, y_test, cmap = plt.cm.Blues, normalize = "true")

# Summary of results:
Random Forest model: Accuracy= 74%

SVC model: Accuracy= 72%

SVC model with GridsearchCV: Accuracy= 71%

KNN model: Accuracy= 64%