In [14]:
import numpy as np
import seaborn as sns 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, r2_score, mean_absolute_error, mean_squared_error
from icecream import ic
import csv
import os
from datetime import datetime

## First try

In [20]:
CSV_PATH ="../data/cardio_optimized.csv"

df = pd.read_csv(CSV_PATH, sep = ";")
X = df.drop(columns=['cardio'])
Y = df['cardio']
X.shape, Y.shape


((68562, 14), (68562,))

In [21]:
df.head()

Unnamed: 0,age,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,female,male,imc,pressure
0,50.391781,168,62.0,110,80,1,1,0,0,0,0,0.0,1.0,1,2
1,55.419178,156,85.0,140,90,3,1,0,0,0,1,1.0,0.0,3,3
2,51.663014,165,64.0,130,70,3,1,0,0,1,1,1.0,0.0,1,2
3,48.282192,169,82.0,150,100,1,1,0,0,0,1,0.0,1.0,2,3
4,47.873973,156,56.0,100,60,1,1,0,0,1,0,1.0,0.0,1,1


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((54849, 14), (13713, 14), (54849,), (13713,))

---
### Fichier qui enregistre les résultat

In [None]:
def results(result_file, ligne, entetes=None):
    file_exists = os.path.exists(result_file)
    with open(result_file, mode='a', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=entetes or ligne.keys())
        if not file_exists:
            writer.writeheader()
        writer.writerow(ligne)


Accuracy : 0.7262

Rapport de classification :
              precision    recall  f1-score   support

           0       0.70      0.78      0.74      6848
           1       0.75      0.67      0.71      6865

    accuracy                           0.73     13713
   macro avg       0.73      0.73      0.73     13713
weighted avg       0.73      0.73      0.73     13713

Matrice de confusion :
[[5345 1503]
 [2251 4614]]


In [None]:
# Chemin du jeu de données
CSV_PATH
# Nom du jeu de données
data = os.path.basename(CSV_PATH)

In [None]:


# Entraînement du modèle
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Évaluation
accuracy = model.score(X_test, y_test)
y_pred = model.predict(X_test)

# Rapport de classification
rapport = classification_report(y_test, y_pred, output_dict=True)
precision = round(rapport['macro avg']['precision'], 4)
recall = round(rapport['macro avg']['recall'], 4)
f1 = round(rapport['macro avg']['f1-score'], 4)

# Matrice de confusion
cm = confusion_matrix(y_test, y_pred)
if cm.shape == (2, 2):
    tn, fp, fn, tp = cm.ravel()
else:
    tn = fp = fn = tp = None  # Gestion multiclasse possible ici

# Affichage
print("Accuracy :", round(accuracy, 4))
print("\nRapport de classification :")
print(classification_report(y_test, y_pred))
print("Matrice de confusion :")
print(cm)

# Résultats à enregistrer
result_line = {
    'Timestamp': datetime.now().strftime('%m-%d %H:%M:%S'),
    'FichierDonnees':data,
    'Accuracy': round(accuracy, 4),
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'True Negative': tn,
    'False Positive': fp,
    'False Negative': fn,
    'True Positive': tp
}

results('../results.csv', result_line, entetes=list(result_line.keys()))


In [None]:
result(PATH)

DESSOUS LE CODE D'ANTOINE

In [12]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print("Accuracy :", round(accuracy, 4))

y_pred = model.predict(X_test)

print("\nRapport de classification :")
print(classification_report(y_test, y_pred))

print("Matrice de confusion :")
print(confusion_matrix(y_test, y_pred))

Accuracy : 0.727

Rapport de classification :
              precision    recall  f1-score   support

           0       0.70      0.78      0.74      6850
           1       0.76      0.67      0.71      6863

    accuracy                           0.73     13713
   macro avg       0.73      0.73      0.73     13713
weighted avg       0.73      0.73      0.73     13713

Matrice de confusion :
[[5367 1483]
 [2261 4602]]


## Deuxième démarche

Nous allons voir les classes qui impactes le plus la prédiction.

In [51]:
def make_confusion_matrix(X):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    model.score(X_train, y_train)
    y_pred = model.predict(X_test)
    matrix = classification_report(y_test, y_pred)
    return matrix

- Facteurs non modifiables

In [55]:
print(make_confusion_matrix(df[['age', 'gender_female', 'gender_male']]))

              precision    recall  f1-score   support

           0       0.61      0.58      0.59      6901
           1       0.59      0.62      0.60      6812

    accuracy                           0.60     13713
   macro avg       0.60      0.60      0.60     13713
weighted avg       0.60      0.60      0.60     13713



In [39]:
lifestyle_factors = df[['smoke', 'alco', 'active']]
lifestyle_factors.shape

(68562, 3)

In [41]:
upstream_clinical_factors = df[['weight', 'height']]
upstream_clinical_factors.shape

(68562, 2)

In [43]:
downstream_clinical_factors = df[['ap_hi', 'ap_lo', 'cholesterol', 'gluc']]
downstream_clinical_factors.shape

(68562, 4)

In [44]:
model_non_modifiable_factors = LogisticRegression()
model_non_modifiable_factors.

En fonction de la masse et du poid, faire une variable IMC