- Load the data

In [556]:
import pandas as pd

data = pd.read_csv('data/heart_2020_cleaned.csv')

- Transform the data

In [557]:
data['HeartDisease'] = data['HeartDisease'].replace({'Yes': 1, 'No': 0})

data['Smoking'] = data['Smoking'].replace({'Yes': 1, 'No': 0})

data['AlcoholDrinking'] = data['AlcoholDrinking'].replace({'Yes': 1, 'No': 0})

data['Stroke'] = data['Stroke'].replace({'Yes': 1, 'No': 0})

data['DiffWalking'] = data['DiffWalking'].replace({'Yes': 1, 'No': 0})
sex_mapping = {
    'Male': 0,
    'Female': 1
}
data['Sex'] = data['Sex'].map(sex_mapping)

age_mapping = {
    '18-24': 21,
    '25-29': 27,
    '30-34': 32,
    '35-39': 37,
    '40-44': 42,
    '45-49': 47,
    '50-54': 52,
    '55-59': 57,
    '60-64': 62,
    '65-69': 67,
    '70-74': 72,
    '75-79': 77,
    '80 or older': 85
}
data['AgeCategory'] = data['AgeCategory'].map(age_mapping)

data = data[data['Race'] != 'American Indian/Alaskan Native']
data = pd.get_dummies(data, columns=['Race'], prefix=['Race'])

data = data[data['Diabetic'].isin(['Yes', 'No'])]
data['Diabetic'] = data['Diabetic'].replace({'Yes': 1, 'No': 0})

data['PhysicalActivity'] = data['PhysicalActivity'].replace({'Yes': 1, 'No': 0})

health_mapping = {
    'Excellent': 5,
    'Very good': 4,
    'Good': 3,
    'Fair': 2,
    'Poor': 1
}
data['GenHealth'] = data['GenHealth'].map(health_mapping)

data['Asthma'] = data['Asthma'].replace({'Yes': 1, 'No': 0})

data['KidneyDisease'] = data['KidneyDisease'].replace({'Yes': 1, 'No': 0})

data['SkinCancer'] = data['SkinCancer'].replace({'Yes': 1, 'No': 0})

In [558]:
import random

data_copia = data.copy()

num_rows_to_remove = 255000

rows_to_remove = data_copia[data_copia['HeartDisease'] == 0].index
rows_to_remove = random.sample(list(rows_to_remove), num_rows_to_remove)
data_copia = data_copia.drop(rows_to_remove)

- Separate between objective and independent variables

In [559]:
char = data_copia.drop(columns=['HeartDisease'])
obj = data_copia['HeartDisease']

- Divide the data between train and test data

In [560]:
from sklearn.model_selection import train_test_split

char_train, char_test, obj_train, obj_test = train_test_split(char, obj, test_size=0.2, random_state=42)


- Normalize variables

In [561]:
from sklearn.preprocessing import MinMaxScaler

columns_to_normalize = data_copia.select_dtypes(include=['float64']).columns
scaler = MinMaxScaler()
data_copia[columns_to_normalize] = scaler.fit_transform(data_copia[columns_to_normalize])

- Show statistics

In [562]:
# Filtra las filas donde HeartDisease es igual a 1
filtered_data = data_copia[data_copia['HeartDisease'] == 1]

for column in filtered_data.columns:
    if column == 'HeartDisease':
        continue  # Saltar la columna 'HeartDisease' en el bucle
    
    value_percentages = filtered_data[column].value_counts(normalize=True) * 100

    formatted_percentages = value_percentages.apply(lambda x: f'{x:.2f}%')

    # Imprime el nombre de la columna y el porcentaje de cada valor
    print(f'Column: {column}')
    print(formatted_percentages)
    print()


Column: BMI
BMI
0.179532    1.09%
0.185632    0.86%
0.215637    0.74%
0.151892    0.72%
0.205304    0.67%
            ...  
0.128984    0.00%
0.510458    0.00%
0.352963    0.00%
0.364915    0.00%
0.295194    0.00%
Name: proportion, Length: 2154, dtype: object

Column: Smoking
Smoking
1    58.45%
0    41.55%
Name: proportion, dtype: object

Column: AlcoholDrinking
AlcoholDrinking
0    95.85%
1     4.15%
Name: proportion, dtype: object

Column: Stroke
Stroke
0    84.14%
1    15.86%
Name: proportion, dtype: object

Column: PhysicalHealth
PhysicalHealth
0.000000    52.95%
1.000000    16.75%
0.066667     4.23%
0.500000     3.37%
0.166667     3.30%
0.100000     3.09%
0.333333     3.05%
0.666667     2.36%
0.033333     2.20%
0.133333     1.80%
0.233333     1.68%
0.466667     1.12%
0.833333     0.92%
0.200000     0.64%
0.266667     0.45%
0.933333     0.39%
0.400000     0.37%
0.700000     0.35%
0.966667     0.22%
0.300000     0.12%
0.600000     0.12%
0.900000     0.10%
0.533333     0.08%
0.56666

In [563]:
correlation_matrix = data_copia.corr()

# Correlación con la columna 'HeartDisease'
correlation_with_heart_disease = correlation_matrix['HeartDisease'].abs().sort_values(ascending=False)

# Muestra las columnas con la mayor correlación con 'HeartDisease'
print("Columnas con mayor correlación con 'HeartDisease':")
print(correlation_with_heart_disease)


Columnas con mayor correlación con 'HeartDisease':
HeartDisease        1.000000
AgeCategory         0.438585
GenHealth           0.401708
DiffWalking         0.292410
Diabetic            0.268332
PhysicalHealth      0.239797
Stroke              0.223992
Smoking             0.191252
KidneyDisease       0.185989
PhysicalActivity    0.163501
SkinCancer          0.140199
Sex                 0.126689
BMI                 0.095810
Race_White          0.078293
Race_Hispanic       0.070174
Asthma              0.069121
AlcoholDrinking     0.064762
Race_Asian          0.058309
MentalHealth        0.045406
Race_Black          0.019508
SleepTime           0.016593
Race_Other          0.000950
Name: HeartDisease, dtype: float64


- K-nn model

In [564]:
from sklearn.neighbors import KNeighborsClassifier

k = 5
knn_model = KNeighborsClassifier(n_neighbors=k)

# Training of the model
knn_model.fit(char_train, obj_train)

# Evaluation of the model
predictions = knn_model.predict(char_test)


from sklearn.metrics import accuracy_score, recall_score, f1_score

accuracy = accuracy_score(obj_test, predictions)
recall = recall_score(obj_test, predictions)
f1 = f1_score(obj_test, predictions)

print(f'Accuracy: {accuracy:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')
print(data_copia.shape[0])

'''
from sklearn.neighbors import KNeighborsClassifier

weights_dict = {
    'AgeCategory': 1.5,
}
# Copia el conjunto de entrenamiento para no modificar el original
char_train_weighted = char_train.copy()  

for feature, weight in weights_dict.items():
    char_train_weighted[feature] *= weight

# Training of the model
k = 5
knn_model = KNeighborsClassifier(n_neighbors=k)
knn_model.fit(char_train_weighted, obj_train)

# Evaluation of the model
predictions = knn_model.predict(char_test)

from sklearn.metrics import accuracy_score, recall_score, f1_score

accuracy = accuracy_score(obj_test, predictions)
recall = recall_score(obj_test, predictions)
f1 = f1_score(obj_test, predictions)

print(f'Accuracy: {accuracy:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')
print(data_copia.shape[0])
'''

Accuracy: 0.59
Recall: 0.29
F1-Score: 0.42
50466
