- Load the data

In [18]:
import pandas as pd

# Location of the input CSV, relative to the notebook's working directory.
csv_path = 'data/heart_2020_cleaned.csv'

# Read the whole file into a DataFrame; later cells transform `data` in place.
data = pd.read_csv(csv_path)

- Transform the data

In [19]:
# Encode every categorical column of `data` as a number so the frame can be
# fed to a scikit-learn estimator.
#
# Series.map is used throughout (instead of Series.replace) because replace on
# an object column relies on implicit downcasting to obtain integers, which is
# deprecated in recent pandas releases. map always returns the mapped values
# and turns any label missing from the mapping into NaN, which is checked at
# the end of this cell.

# --- Yes/No columns -> 1/0 ---------------------------------------------------
yes_no_mapping = {'Yes': 1, 'No': 0}
yes_no_columns = [
    'HeartDisease',
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'DiffWalking',
    'PhysicalActivity',
    'Asthma',
    'KidneyDisease',
    'SkinCancer',
]
for column in yes_no_columns:
    data[column] = data[column].map(yes_no_mapping)

# --- Sex -> 0/1 --------------------------------------------------------------
sex_mapping = {
    'Male': 0,
    'Female': 1
}
data['Sex'] = data['Sex'].map(sex_mapping)

# --- Age bracket -> one representative age per bracket -----------------------
age_mapping = {
    '18-24': 21,
    '25-29': 27,
    '30-34': 32,
    '35-39': 37,
    '40-44': 42,
    '45-49': 47,
    '50-54': 52,
    '55-59': 57,
    '60-64': 62,
    '65-69': 67,
    '70-74': 72,
    '75-79': 77,
    '80 or older': 85
}
data['AgeCategory'] = data['AgeCategory'].map(age_mapping)

# --- Self-reported general health -> ordinal scale (5 = best, 1 = worst) -----
health_mapping = {
    'Excellent': 5,
    'Very good': 4,
    'Good': 3,
    'Fair': 2,
    'Poor': 1
}
data['GenHealth'] = data['GenHealth'].map(health_mapping)

# --- Race -> one-hot columns -------------------------------------------------
# Rows with this Race value are dropped before encoding.
# NOTE(review): the notebook does not state why this group is excluded - confirm.
data = data[data['Race'] != 'American Indian/Alaskan Native']
data = pd.get_dummies(data, columns=['Race'], prefix=['Race'])

# --- Diabetic -> 1/0 ---------------------------------------------------------
# Keep only the plain 'Yes'/'No' answers; any other label is discarded.
# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger SettingWithCopyWarning.
data = data[data['Diabetic'].isin(['Yes', 'No'])].copy()
data['Diabetic'] = data['Diabetic'].map(yes_no_mapping)

# --- Sanity check ------------------------------------------------------------
# A NaN here means a label was not covered by its mapping; fail now with a
# clear message instead of later inside the model.
encoded_columns = yes_no_columns + ['Diabetic', 'Sex', 'AgeCategory', 'GenHealth']
unmapped_counts = data[encoded_columns].isna().sum()
assert not unmapped_counts.any(), (
    f'Unmapped category values found:\n{unmapped_counts[unmapped_counts > 0]}'
)

- Separate the target (dependent) variable from the independent variables (features)

In [None]:
# Name of the column the model has to predict.
target_column = 'HeartDisease'

# `obj` is the target vector; `char` holds every remaining column (the features).
obj = data[target_column]
char = data.drop(columns=[target_column])

- Split the data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
char_train, char_test, obj_train, obj_test = train_test_split(
    char,
    obj,
    test_size=0.2,
    random_state=42,
)


- Normalize variables

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the continuous (float64) feature columns.
#
# The scaling must be applied to char_train / char_test themselves: they are
# separate copies produced by the earlier drop/split steps, so scaling only
# `data` at this point would leave the model's inputs untouched.
# The scaler is fitted on the training split only, so no information from the
# test rows leaks into the preprocessing.
# NOTE(review): integer-encoded columns are not float64 and therefore stay
# unscaled - confirm this is intended for a distance-based model.
columns_to_normalize = char_train.select_dtypes(include=['float64']).columns
scaler = MinMaxScaler()

# Work on explicit copies so the column assignments below do not trigger
# SettingWithCopyWarning.
char_train = char_train.copy()
char_test = char_test.copy()

char_train[columns_to_normalize] = scaler.fit_transform(char_train[columns_to_normalize])
# Test values are scaled with the training min/max, so they may fall slightly
# outside [0, 1].
char_test[columns_to_normalize] = scaler.transform(char_test[columns_to_normalize])

# Keep `data` in the same scale so that previews of it match what the model sees.
data[columns_to_normalize] = scaler.transform(data[columns_to_normalize])

In [None]:
# Plain-text preview of the first five rows of the transformed frame.
print(data.head(n=5))

   HeartDisease       BMI  Smoking  AlcoholDrinking  Stroke  PhysicalHealth  \
0             0  0.055294        1                0       0        0.100000   
1             0  0.100447        0                0       1        0.000000   
2             0  0.175782        1                0       0        0.666667   
3             0  0.147169        0                0       0        0.000000   
4             0  0.141132        0                0       0        0.933333   

   MentalHealth  DiffWalking  Sex  AgeCategory  ...  GenHealth  SleepTime  \
0           1.0            0    1           57  ...          4   0.173913   
1           0.0            0    1           85  ...          4   0.260870   
2           1.0            0    0           67  ...          2   0.304348   
3           0.0            0    1           77  ...          3   0.217391   
4           0.0            1    1           42  ...          4   0.304348   

   Asthma  KidneyDisease  SkinCancer  Race_Asian  Race_Black  

- k-NN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours that vote on each prediction.
k = 20

# Model training: build the classifier and fit it on the training split
# (fit returns the estimator itself, so the calls can be chained).
knn_model = KNeighborsClassifier(n_neighbors=k).fit(char_train, obj_train)

# Model evaluation: score the fitted classifier on the held-out test split.
score = knn_model.score(char_test, obj_test)
print(f'Accuracy of the k-NN model: {score:.2f}')


Accuracy of the k-NN model: 0.91
