In [4]:
import numpy as np
import seaborn as sns 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [5]:
# Load `cardio_train_clean.csv` (semicolon-separated) with the `id` column as index.
df = pd.read_csv(
    "../data/cardio_train_clean.csv",
    sep=";",
    index_col="id",
)
# Print dtypes / non-null counts, then display the (rows, columns) shape.
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 68562 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          68562 non-null  float64
 1   gender       68562 non-null  int64  
 2   height       68562 non-null  int64  
 3   weight       68562 non-null  float64
 4   ap_hi        68562 non-null  int64  
 5   ap_lo        68562 non-null  int64  
 6   cholesterol  68562 non-null  int64  
 7   gluc         68562 non-null  int64  
 8   smoke        68562 non-null  int64  
 9   alco         68562 non-null  int64  
 10  active       68562 non-null  int64  
 11  cardio       68562 non-null  int64  
dtypes: float64(2), int64(10)
memory usage: 6.8 MB


(68562, 12)

In [6]:
# Preview the first rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,50.391781,2,168,62.0,110,80,1,1,0,0,1,0
1,55.419178,1,156,85.0,140,90,3,1,0,0,1,1
2,51.663014,1,165,64.0,130,70,3,1,0,0,0,1
3,48.282192,2,169,82.0,150,100,1,1,0,0,1,1
4,47.873973,1,156,56.0,100,60,1,1,0,0,0,0


---

### Activité physique

|Variable|Interprétation|Valeur élevée =|
|:-|:-|:-|
| CHOLESTEROL | 1 : normal<br> 2 : supérieur à la normale<br> 3 : largement supérieur à la normale| incidence négative|
| GLUCOSE     | 1 : normal<br> 2 : supérieur à la normale<br> 3 : largement supérieur à la normale| incidence négative|
| SMOKE       | 0 : non fumeur<br>1 : fumeur| incidence négative|
| ALCOHOL     | 0 : non consommateur<br> 1 : consommateur| incidence négative|
| PHYSICAL_ACTIVITY | 0 : non, 1 : oui| incidence positive|



Pour l'activité physique, la logique est inversée : une valeur élevée a une incidence positive.<br>
Nous allons donc intervertir les 0 et les 1 de `active` pour respecter la même logique que pour les autres variables.

In [7]:
# Swap the 0/1 coding of `active` (computes 1 - value) so that a high value
# carries the same meaning as for the other lifestyle variables.
# NOTE: this cell is not idempotent -- running it a second time flips the
# values back to the original coding.
df['active'] = df['active'].rsub(1)

In [8]:
# Preview the first rows to check the swapped `active` values.
df.head(n=5)

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,50.391781,2,168,62.0,110,80,1,1,0,0,0,0
1,55.419178,1,156,85.0,140,90,3,1,0,0,0,1
2,51.663014,1,165,64.0,130,70,3,1,0,0,1,1
3,48.282192,2,169,82.0,150,100,1,1,0,0,0,1
4,47.873973,1,156,56.0,100,60,1,1,0,0,1,0


----
## gender
2 = homme<br>
1 = femme<br>
Mais il s'agit d'une donnée nominale (catégorielle), pas ordinale.<br>
Nous utilisons OneHotEncoder pour remplacer `gender` par `male` et `female`.

In [9]:
# Coding of the raw `gender` column: 1 = female, 2 = male.
gender_labels = {1: 'female', 2: 'male'}

encoder = OneHotEncoder(sparse_output=False)

# One-hot encode the 'gender' column (one output column per distinct code).
gender_encoded = encoder.fit_transform(df[['gender']])

# `categories_` holds one array per encoded feature; only 'gender' was fitted.
categories = encoder.categories_[0]

# Fail loudly on an unknown code instead of silently labelling it 'male',
# which would also produce duplicate column names.
unexpected_codes = [val for val in categories if val not in gender_labels]
if unexpected_codes:
    raise ValueError(
        f"Unexpected gender codes {unexpected_codes}; "
        f"expected only {sorted(gender_labels)}"
    )
column_names = [gender_labels[val] for val in categories]

# Build a frame with the new indicator columns, aligned on the original index.
gender_df = pd.DataFrame(gender_encoded, columns=column_names, index=df.index)

# Replace the original column with the indicator columns.
# NOTE: this cell cannot be re-run as-is, because 'gender' is dropped here.
df = pd.concat([df.drop(columns='gender'), gender_df], axis=1)

In [10]:
# Preview the first rows to check the new `female` / `male` columns.
df.head(n=5)

Unnamed: 0_level_0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,female,male
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,50.391781,168,62.0,110,80,1,1,0,0,0,0,0.0,1.0
1,55.419178,156,85.0,140,90,3,1,0,0,0,1,1.0,0.0
2,51.663014,165,64.0,130,70,3,1,0,0,1,1,1.0,0.0
3,48.282192,169,82.0,150,100,1,1,0,0,0,1,0.0,1.0
4,47.873973,156,56.0,100,60,1,1,0,0,1,0,1.0,0.0


---
## IMC

### Indice de Masse Corporelle

Calcul de l'IMC :

$$
IMC = \frac{\text{poids (kg)}}{\left( \frac{\text{taille (cm)}}{100} \right)^2}
$$

​
Catégorisation médicale simplifiée :

|IMC	|Catégorie|	Valeur|
|-|-|-|
|< 25	|Normal	|1|
|25 ≤ IMC < 30	|Surpoids |	2|
|≥ 30	|Obésité |	3|

In [11]:
# BMI = weight / (height / 100) ** 2 (height is divided by 100 before squaring).
height_m = df['height'] / 100
df['imc'] = df['weight'] / height_m ** 2

# Build the categorical variable
def categoriser_imc(imc):
    """Return the ordinal category code for a BMI value.

    Parameters
    ----------
    imc : number
        BMI value to classify.

    Returns
    -------
    int
        1 when ``imc < 25`` (normal), 2 when ``imc < 30`` (above normal),
        3 otherwise (well above normal). A value that fails both ``<``
        comparisons, such as NaN, also yields 3.
    """
    if imc < 25:
        return 1
    if imc < 30:
        return 2
    return 3

# Overwrite the numeric BMI with its category code (element-wise mapping).
df['imc'] = df['imc'].map(categoriser_imc)


In [12]:
# Preview the first rows to check the new `imc` category column.
df.head(n=5)

Unnamed: 0_level_0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,female,male,imc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,50.391781,168,62.0,110,80,1,1,0,0,0,0,0.0,1.0,1
1,55.419178,156,85.0,140,90,3,1,0,0,0,1,1.0,0.0,3
2,51.663014,165,64.0,130,70,3,1,0,0,1,1,1.0,0.0,1
3,48.282192,169,82.0,150,100,1,1,0,0,0,1,0.0,1.0,2
4,47.873973,156,56.0,100,60,1,1,0,0,1,0,1.0,0.0,1


---
## pressure

### Tension artérielle

| Diastolique (`ap_lo`) | Systolique (`ap_hi`) | Interprétation        | Code |
| -------------------- | --------------------- | --------------------- | ---- |
| < 80                | < 120                | Normale               | 1    |
| 80–89             | 120–139               | Élevée (à surveiller) | 2    |
| ≥ 90                | ≥ 140                  | Hypertension          | 3    |


Si la systolique ou la diastolique dépasse un seuil, on prend la catégorie la plus élevée.

In [13]:
def classifier_pressure(row):
    """Return the blood-pressure category code for one row.

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys 'ap_hi' and 'ap_lo' (for example a
        DataFrame row).

    Returns
    -------
    int
        1 (normal) when ``ap_hi < 120`` and ``ap_lo < 80``;
        2 (to monitor) when ``ap_hi < 140`` and ``ap_lo < 90``;
        3 (hypertension) in every other case, so the higher of the two
        readings' categories wins.
    """
    if row['ap_hi'] < 120 and row['ap_lo'] < 80:
        return 1
    if row['ap_hi'] < 140 and row['ap_lo'] < 90:
        return 2
    return 3

# Classify every row (axis=1 passes each row to the function) and store the code.
pressure_codes = df.apply(classifier_pressure, axis=1)
df['pressure'] = pressure_codes


In [14]:
# Preview the first rows to check the new `pressure` category column.
df.head(n=5)

Unnamed: 0_level_0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,female,male,imc,pressure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,50.391781,168,62.0,110,80,1,1,0,0,0,0,0.0,1.0,1,2
1,55.419178,156,85.0,140,90,3,1,0,0,0,1,1.0,0.0,3,3
2,51.663014,165,64.0,130,70,3,1,0,0,1,1,1.0,0.0,1,2
3,48.282192,169,82.0,150,100,1,1,0,0,0,1,0.0,1.0,2,3
4,47.873973,156,56.0,100,60,1,1,0,0,1,0,1.0,0.0,1,1


---
## Export du csv optimisé

In [18]:
# Write the transformed frame (index included) as a semicolon-separated CSV.
df.to_csv(
    path_or_buf='../data/cardio_optimized.csv',
    sep=";",
)