In [7]:
import numpy as np
import seaborn as sns 
import pandas as pd 
import matplotlib.pyplot as plt 

In [14]:
# Read the raw semicolon-separated file; the "id" column becomes the row index.
df = pd.read_csv(
    "../data/cardio_train.csv",
    sep=";",
    index_col="id",
)

# Print the column list with non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.9 MB


## Détection des valeurs aberrantes

- 🔹 AP_HIGH : Systolic Blood Pressure (mmHg)

| État                         | Pression systolique |
|------------------------------|----------------------|
| Tension basse (hypotension) | < 90                 |
| Normale                     | 90 – 120             |
| Pré-hypertension / normale haute | 120 – 139       |
| Hypertension (stade 1)      | 140 – 159            |
| Hypertension (stade 2)      | ≥ 160                |
| Urgence hypertensive (extrême) | ≥ 180            |
| Limite extrême tolérable    | Jusqu’à 250          |
|<font color="orange">ON SUPPRIME les valeurs &lt; 70</font> |<font color="orange">soit 189 observations</font>|
|<font color="orange">ON SUPPRIME les valeurs &gt; 250</font>|<font color="orange">soit 40 observations</font>|


---

- 🔹 AP_LOW : Diastolic Blood Pressure (mmHg)

| État                         | Pression diastolique |
|------------------------------|-----------------------|
| Tension basse (hypotension) | < 60                  |
| Normale                     | 60 – 80               |
| Pré-hypertension            | 80 – 89               |
| Hypertension (stade 1)      | 90 – 99               |
| Hypertension (stade 2)      | ≥ 100                 |
| Urgence hypertensive (critique) | ≥ 120            |
| Limite maximale tolérable   | Jusqu’à 150           |
|<font color="orange">ON SUPPRIME les valeurs &lt; 40</font> |<font color="orange">soit 59 observations</font>|
|<font color="orange">ON SUPPRIME les valeurs &gt; 150</font> |<font color="orange">soit 975 observations</font>|

---

- 🔹 HEIGHT (Taille en cm)

| Seuil     | Explication                          |
|-----------|--------------------------------------|
| < 100 cm  | Trop petit (erreur ou enfant < 5 ans)|
| > 250 cm  | Extrêmement rare, probablement une erreur |
|<font color="orange">ON SUPPRIME les valeurs &lt; 140</font> | <font color="orange">soit 152 observations</font>|
|<font color="orange">ON SUPPRIME les valeurs &gt; 220</font> | <font color="orange">soit 1 observation</font>|

---

- 🔹 WEIGHT (Poids en kg)

| Seuil     | Explication                             |
|-----------|-----------------------------------------|
| < 30 kg   | Très maigre (enfant ou erreur)          |
| > 250 kg  | Extrêmement obèse ou mal enregistré     |
|<font color="orange">ON SUPPRIME les valeurs &lt; 40</font> |<font color="orange">soit 52 observations</font>|
|<font color="orange">ON SUPPRIME les valeurs &gt; 200</font> | <font color="orange">soit 0 observation</font>|


In [9]:
# Rule printed beneath every section of the report.
section_rule = "-------------------------------------------------"

# One entry per audited column:
# (section banner, column name, lower bound, upper bound, unit suffix shown after the bound)
outlier_checks = [
    ("----------AP_HI----------------------------------", "ap_hi", 70, 250, ""),
    ("----------AP_LO----------------------------------", "ap_lo", 40, 150, ""),
    ("----------HEIGHT---------------------------------", "height", 140, 220, " cm"),
    ("----------WEIGHT---------------------------------", "weight", 40, 200, " kg"),
]

report_lines = []
for banner, column, lower, upper, unit in outlier_checks:
    column_values = df[column]
    report_lines += [
        banner,
        # Strict comparisons: values equal to a bound are not counted.
        f"Valeurs < {lower}{unit} : {(column_values < lower).sum()}",
        f"Valeurs > {upper}{unit} : {(column_values > upper).sum()}",
        section_rule,
    ]

# The leading and trailing newlines reproduce the blank lines framing the report.
print("\n" + "\n".join(report_lines) + "\n")



----------AP_HI----------------------------------
Valeurs < 70 : 189
Valeurs > 250 : 40
-------------------------------------------------
----------AP_LO----------------------------------
Valeurs < 40 : 59
Valeurs > 150 : 975
-------------------------------------------------
----------HEIGHT---------------------------------
Valeurs < 140 cm : 152
Valeurs > 220 cm : 1
-------------------------------------------------
----------WEIGHT---------------------------------
Valeurs < 40 kg : 52
Valeurs > 200 kg : 0
-------------------------------------------------



## Suppression des données aberrantes

In [10]:
# Row mask: True where all four measurements fall inside their accepted range.
# Series.between is inclusive on both ends by default, so a value equal to a
# bound is kept.
within_bounds = (
    df['ap_hi'].between(70, 250)
    & df['ap_lo'].between(40, 150)
    & df['height'].between(140, 220)
    & df['weight'].between(40, 200)
)

# Keep only the rows that pass every range check; `df` itself is left untouched.
df_clean = df.loc[within_bounds]

# Summary statistics of the filtered frame (displayed as the cell output).
df_clean.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,68562.0,68562.0,68562.0,68562.0,68562.0,68562.0,68562.0,68562.0,68562.0,68562.0,68562.0,68562.0
mean,19464.16989,1.349042,164.47115,74.143089,126.625959,81.364954,1.364969,1.225898,0.088125,0.053616,0.803433,0.494939
std,2467.602571,0.476671,7.797999,14.255469,16.738005,9.550331,0.679182,0.571863,0.283478,0.225259,0.397405,0.499978
min,10798.0,1.0,140.0,40.0,70.0,40.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,17658.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,19700.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,21323.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,207.0,200.0,240.0,150.0,3.0,3.0,1.0,1.0,1.0,1.0


In [11]:
# Number of rows dropped by the range filter: raw row count minus filtered row count.
rows_removed = len(df) - len(df_clean)
print(rows_removed)

1438


## Export du csv nettoyé

In [13]:
# Write the filtered frame as a semicolon-separated file.
# NOTE(review): index=False means the row index (the "id" column set as index
# when the raw file was loaded) is not written to the output — confirm that
# downstream consumers do not need it.
df_clean.to_csv(
    '../data/cardio_train_clean.csv',
    sep=";",
    index=False,
)