### Importing the necessary libraries.

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

### Reading the data and checking out.

In [4]:
data = pd.read_csv('CVD_cleaned.csv')

In [5]:
data.describe()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
count,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0
mean,170.615249,83.588655,28.626211,5.096366,29.8352,15.110441,6.296616
std,10.658026,21.34321,6.522323,8.199763,24.875735,14.926238,8.582954
min,91.0,24.95,12.02,0.0,0.0,0.0,0.0
25%,163.0,68.04,24.21,0.0,12.0,4.0,2.0
50%,170.0,81.65,27.44,1.0,30.0,12.0,4.0
75%,178.0,95.25,31.85,6.0,30.0,20.0,8.0
max,241.0,293.02,99.33,30.0,120.0,128.0,128.0


In [6]:
data.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


### Finding whether there are missing values or not. 

In [7]:
data.isnull().describe()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
count,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854
unique,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
freq,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854,308854


## Data Cleaning Part
#### Since BMI tells us the ratio between height and weight, we can safely remove height and weight columns and use the BMI values in advance.

In [8]:
data = data.drop(['Height_(cm)', 'Weight_(kg)'], axis=1)

### Preparing the categorical data in order to encode.

In [9]:
list_of_categories = ["General_Health", "Checkup", "Exercise", "Heart_Disease", "Skin_Cancer",
"Other_Cancer", "Depression", "Diabetes", "Arthritis", "Sex", "Age_Category",
"Smoking_History"]

### Then we transform our categorical data into new encoded data.

In [10]:
def transform_categorical_columns(df, categorical):
    ohe = OneHotEncoder(sparse_output=False)
    le = LabelEncoder()
    empty_dataframe = pd.DataFrame()
    for i in categorical:
        #If it just contains two variables, then it is best to use LE.
        if df[i].nunique() == 2:
            temporary = pd.DataFrame(le.fit_transform(np.array(df[i])), columns=[i])
            empty_dataframe = pd.concat([empty_dataframe, temporary], axis=1)
            df.drop([i], axis=1, inplace=True)
        #Else, it's OHE.    
        else:
            temporary = pd.DataFrame(ohe.fit_transform(np.array(df[i]).reshape(-1, 1)))
            temporary.columns = ohe.get_feature_names_out([i])
            empty_dataframe = pd.concat([empty_dataframe, temporary], axis=1)
            df.drop([i], axis=1, inplace=True)
    return pd.concat([empty_dataframe, df], axis=1)

In [11]:
#This is the final form of our data.
final_dataframe = transform_categorical_columns(data, list_of_categories)

In [12]:
final_dataframe.head()

Unnamed: 0,General_Health_Excellent,General_Health_Fair,General_Health_Good,General_Health_Poor,General_Health_Very Good,Checkup_5 or more years ago,Checkup_Never,Checkup_Within the past 2 years,Checkup_Within the past 5 years,Checkup_Within the past year,...,Age_Category_65-69,Age_Category_70-74,Age_Category_75-79,Age_Category_80+,Smoking_History,BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1,14.54,0.0,30.0,16.0,12.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0,28.29,0.0,30.0,0.0,4.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,33.47,4.0,12.0,3.0,16.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0,28.73,0.0,30.0,30.0,8.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1,24.37,0.0,8.0,4.0,0.0


### Getting prepared for the training.

In [13]:
#We seperate our x and y.
y = final_dataframe['Heart_Disease']
x = final_dataframe.drop(['Heart_Disease'], axis=1)

In [14]:
x.shape, y.shape

((308854, 39), (308854,))

### Now, we're gonna check the balance between two categorical values. If there is no balance, we have to adjust them.

In [15]:
value_zero = final_dataframe[final_dataframe['Heart_Disease'] == 0]
value_one = final_dataframe[final_dataframe['Heart_Disease'] == 1]

In [16]:
value_zero.shape, value_one.shape

((283883, 40), (24971, 40))

### It's clearly imbalanced, I'm gonna use two different techniques:
1. Populate the imbalanced data using SMOTE and use K-Fold on the KNN balanced data.
2. Use Stratified K-Fold on the imbalanced data.

In [25]:
xgb = XGBClassifier()

### 1. Populating the data by SMOTE and using K-Fold.

In [22]:
smote = SMOTE(sampling_strategy='minority')
X_sy, y_sy = smote.fit_resample(x, y)

In [23]:
X_sy.shape, y_sy.shape

((567766, 39), (567766,))

In [24]:
value_zero_sm = y_sy[y_sy.iloc[:] == 0]
value_one_sm = y_sy[y_sy.iloc[:] == 1]

value_zero_sm.shape, value_one_sm.shape

((283883,), (283883,))

In [28]:
kfold = KFold(n_splits=10)
results = cross_val_score(xgb, X_sy, y_sy, cv=kfold)

### Then we get our first result.

In [29]:
print("Accuracy (with SMOTE and 10K-Fold): ", results.mean())

Accuracy(with SMOTE and 10K-Fold):  0.9457913016955366


### 2. Using Stratified K-Fold on our imbalanced data.

In [30]:
value_zero.shape, value_one.shape

((283883, 40), (24971, 40))

In [31]:
x.shape, y.shape

((308854, 39), (308854,))

In [32]:
skfold = StratifiedKFold(n_splits=10)
results_skfold = cross_val_score(xgb, x, y, cv=skfold)

### We get our last result.

In [None]:
print("Accuracy (with Stratified 10K-Fold): ", results_skfold.mean())