## Nettoyage des données du dataset sur les maladies cardiovasculaires

In [74]:
import pandas as pd

df = pd.read_csv('./data/cardio_train.csv', delimiter=';')
df.head() 

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [76]:
# L'age est donné en jour, alors on le convertit en année pour une meilleure lisibilité
df['age'] = (df['age']/365).astype(int)

# Je vais aussi convertir la taille en mètres, pour faciliter le calcul de l'IMC
df['height'] = (df['height']/100)

In [77]:
# On ajoute la colonne BMI (qui est l'IMC)
df['BMI'] = round(df['weight'] / df['height']**2, 0)
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,0,50,2,1.68,62.0,110,80,1,1,0,0,1,0,22.0
1,1,55,1,1.56,85.0,140,90,3,1,0,0,1,1,35.0
2,2,51,1,1.65,64.0,130,70,3,1,0,0,0,1,24.0
3,3,48,2,1.69,82.0,150,100,1,1,0,0,1,1,29.0
4,4,47,1,1.56,56.0,100,60,1,1,0,0,0,0,23.0


In [78]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,70000.0,49972.4199,28851.302323,0.0,25006.75,50001.5,74889.25,99999.0
age,70000.0,52.840671,6.766774,29.0,48.0,53.0,58.0,64.0
gender,70000.0,1.349571,0.476838,1.0,1.0,1.0,2.0,2.0
height,70000.0,1.643592,0.082101,0.55,1.59,1.65,1.7,2.5
weight,70000.0,74.20569,14.395757,10.0,65.0,72.0,82.0,200.0
ap_hi,70000.0,128.817286,154.011419,-150.0,120.0,120.0,140.0,16020.0
ap_lo,70000.0,96.630414,188.47253,-70.0,80.0,80.0,90.0,11000.0
cholesterol,70000.0,1.366871,0.68025,1.0,1.0,1.0,2.0,3.0
gluc,70000.0,1.226457,0.57227,1.0,1.0,1.0,1.0,3.0
smoke,70000.0,0.088129,0.283484,0.0,0.0,0.0,0.0,1.0


In [79]:
# Grâce à l'IMC, on peut chercher s'il y a des valeurs qui n'ont pas de sens pour le poids et la taille. Grâce à la description des données, 
# on a pu voir que la valeur max d'IMC est 298.7 ce qui est normalement impossible, de même pour la valeur min de 3.5
# Un IMC cohérent devrait se trouver entre 10 et 70 (en étant large), donc on cherche les valeurs qui sont sortent de cet intervalle.

outliers_BMI = df[(df['BMI'] < 10) | (df['BMI'] > 70)]
print(outliers_BMI.head())
print(outliers_BMI.shape)

# On va retirer ces valeurs du dataset pour ne pas fausser les données
df = df[(df['BMI'] >= 10) & (df['BMI'] <= 70)]

          id  age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  \
224      309   59       2    0.76    55.0    120     80            1     1   
6153    8757   57       1    1.22   161.0    120     80            1     1   
7598   10843   40       2    0.70    72.0    120      8            1     1   
8171   11662   48       2    0.97   170.0    160    100            1     1   
11230  16062   50       1    1.10    90.0    120     80            1     1   

       smoke  alco  active  cardio    BMI  
224        0     0       1       0   95.0  
6153       0     0       1       1  108.0  
7598       0     0       1       0  147.0  
8171       1     0       1       1  181.0  
11230      0     0       1       0   74.0  
(41, 14)


In [80]:
# Nous avons aussi pu voir qu'il y a des erreurs dans les données de pressions artérielles puisqu'il y a des valeurs négatives, et des mesures de plusieurs milliers, ce qui est impossible.
# On ne va garder que les valeurs possibles, donc entre 90 et 180 pour les pressions artérielles systoliques (ap_hi) et entre 60 et 120 pour les pressions artérielles diastoliques (ap_lo).
df = df[(df['ap_hi'] >= 90) & (df['ap_hi'] <= 180)]
df = df[(df['ap_lo'] >= 60) & (df['ap_lo'] <= 120)]

In [81]:
# Vérification que pression systolique > pression diastolique
invalid_bp = df[df['ap_hi'] <= df['ap_lo']]
print(invalid_bp)

# On inverse les colonnes quand on se trouve dans un cas d'erreur
df.loc[df['ap_hi'] < df['ap_lo'], ['ap_hi', 'ap_lo']] = df.loc[df['ap_hi'] < df['ap_lo'], ['ap_lo', 'ap_hi']].values

          id  age  gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  \
12785  18238   43       2    1.78   105.0    100    100            1     1   
22896  32702   56       2    1.78    84.0     95    100            1     1   
32190  45975   63       1    1.62   120.0    100    100            1     1   
46517  66423   41       2    1.78    92.0     90    100            1     1   
52768  75277   57       1    1.64    86.0     90    120            1     1   

       smoke  alco  active  cardio   BMI  
12785      0     0       0       1  33.0  
22896      0     1       0       1  27.0  
32190      0     0       1       0  46.0  
46517      1     0       1       0  29.0  
52768      0     1       1       1  32.0  


In [82]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,68168.0,49974.887572,28852.818803,0.0,24994.75,50018.0,74880.25,99999.0
age,68168.0,52.823583,6.769752,29.0,48.0,53.0,58.0,64.0
gender,68168.0,1.348639,0.476543,1.0,1.0,1.0,2.0,2.0
height,68168.0,1.64412,0.07965,0.91,1.59,1.65,1.7,2.5
weight,68168.0,74.092167,14.245211,28.0,65.0,72.0,82.0,200.0
ap_hi,68168.0,126.434749,15.959737,90.0,120.0,120.0,140.0,180.0
ap_lo,68168.0,81.261941,9.141618,60.0,80.0,80.0,90.0,120.0
cholesterol,68168.0,1.363338,0.678147,1.0,1.0,1.0,1.0,3.0
gluc,68168.0,1.225252,0.571373,1.0,1.0,1.0,1.0,3.0
smoke,68168.0,0.087651,0.282789,0.0,0.0,0.0,0.0,1.0


In [83]:
# On observe qu'il n'y a pas de valeurs supplémentaires pour nos valeurs catégorielles et binaires
df.nunique()

id             68168
age               28
gender             2
height            86
weight           271
ap_hi             86
ap_lo             58
cholesterol        3
gluc               3
smoke              2
alco               2
active             2
cardio             2
BMI               59
dtype: int64

In [84]:
# On va remplacer des valeurs numériques en valeurs textuels pour un meilleur affichage.

# Dictionnaires de mappage
cholesterol_map = {1: 'normal', 2: 'above normal', 3: 'well above normal'}
glucose_map = {1: 'normal', 2: 'above normal', 3: 'well above normal'}
gender_map = {1: 'woman', 2: 'man'}
smoke_map = {0: 'non smoker', 1: 'smoker'}
alco_map = {0: 'no alcohol', 1: 'alcohol'}
active_map = {0: 'not active', 1: 'active'}

# Application aux colonnes
df['cholesterol'] = df['cholesterol'].replace(cholesterol_map)
df['gluc'] = df['gluc'].replace(glucose_map)
df['gender'] = df['gender'].replace(gender_map)
df['smoke'] = df['smoke'].replace(smoke_map)
df['alco'] = df['alco'].replace(alco_map)
df['active'] = df['active'].replace(active_map)

In [85]:
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,0,50,man,1.68,62.0,110,80,normal,normal,non smoker,no alcohol,active,0,22.0
1,1,55,woman,1.56,85.0,140,90,well above normal,normal,non smoker,no alcohol,active,1,35.0
2,2,51,woman,1.65,64.0,130,70,well above normal,normal,non smoker,no alcohol,not active,1,24.0
3,3,48,man,1.69,82.0,150,100,normal,normal,non smoker,no alcohol,active,1,29.0
4,4,47,woman,1.56,56.0,100,60,normal,normal,non smoker,no alcohol,not active,0,23.0


In [86]:
# Enregistrer le dataframe dans un fichier CSV
df.to_csv('data/cardio_train_clean.csv', index=False)