In [1]:
import matplotlib as plt
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
pd_Data = pd.read_csv("heart_2020_cleaned.csv")
pd_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

- HeartDisease = Bool
- BMI = Float
- Smoking = Bool
- AlcoholDrinking = Bool
- Stroke = Bool
- PhysicalHealt = Float
- MentalHealth = Float
- DiffWalking = Bool
- Sex = Catégorique
- AgeCategory = Catégorique
- Race = Catégorique
- Diabetic = Bool
- PhysicalActivity = Bool
- GenHealth = Catégorique
- SleepTime = Float
- Asthme = Bool
- KidneyDisease = Bool
- SkinCancer = Bool


In [3]:
pd_Data.head

<bound method NDFrame.head of        HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0                No  16.60     Yes              No     No             3.0   
1                No  20.34      No              No    Yes             0.0   
2                No  26.58     Yes              No     No            20.0   
3                No  24.21      No              No     No             0.0   
4                No  23.71      No              No     No            28.0   
...             ...    ...     ...             ...    ...             ...   
319790          Yes  27.41     Yes              No     No             7.0   
319791           No  29.84     Yes              No     No             0.0   
319792           No  24.24      No              No     No             0.0   
319793           No  32.81      No              No     No             0.0   
319794           No  46.56      No              No     No             0.0   

        MentalHealth DiffWalking     Sex  Age

In [4]:
pd_Data.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

- Changer les Bool, Yes/No par 1/0
- Encoding les catégorique : Sex, AgeCategory, Race, GenHealth
- Normaliser les Floats : BMI, PhysicalHealth, MentalHealth, SleepTime

# Transformer les Variables booléenne de "Yes/No" à 1 et 0.

In [5]:
V_bool = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']

for cat in V_bool:
    pd_Data[cat] = np.where(pd_Data[cat] == 'Yes', 1, 0)


# Normaliser les variable numériques

In [6]:
for df in ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']:
    pd_Data[df] = (pd_Data[df] - pd_Data[df].min()) / (pd_Data[df].max() - pd_Data[df].min())

# Encoder les variables Catégoriques

In [None]:
# J'ai essayé de les faire d'un coup avec une liste, mais sans succès; donc je les fait une par une.
pd_Data_Encoded = pd.get_dummies(pd_Data, columns=['Sex'], drop_first=True)
pd_Data_Encoded = pd.get_dummies(pd_Data_Encoded, columns=['AgeCategory'])
pd_Data_Encoded = pd.get_dummies(pd_Data_Encoded, columns=['GenHealth'])
pd_Data_Encoded = pd.get_dummies(pd_Data_Encoded, columns=['Race'])
#print(pd_Data_Encoded.columns)
#print(pd_Data_Encoded.head)

# Séparer les données en Train et Test et les étiquettes

In [None]:

X_train, X_test, y_train, y_test = train_test_split(
    pd_Data_Encoded.drop(['HeartDisease'], axis=1),
    pd_Data_Encoded['HeartDisease'], # the target
    test_size = 0.2,
    random_state=42)

# sauvegarder les données transformés
X_train.to_csv("heart_X_train.csv")
X_test.to_csv("heart_X_test.csv")
y_train.to_csv("heart_y_train.csv")
y_test.to_csv("heart_y_test.csv")

# Validation de la séparation.
X_train.shape, X_test.shape

((255836, 38), (63959, 38))

# Entrainer le model

In [9]:
HD_model = LogisticRegression()

HD_model.fit(X_train, y_train)
y_pred = HD_model.predict(X_test)

print(f'Accuracy_score: {accuracy_score(y_test,y_pred)}')
print(f'Precission_score: {precision_score(y_test,y_pred)}')
print(f'Recall_score: {recall_score(y_test,y_pred)}')
print(f'F1-score: {f1_score(y_test,y_pred)}')

Accuracy_score: 0.9138197908034835
Precission_score: 0.5384615384615384
Recall_score: 0.10014306151645208
F1-score: 0.16887816646562123


# Sauvegarder le model

In [10]:
pickle.dump(HD_model, open("HeartDisease.h5", "wb"))