## Encodage des symptômes

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df = pd.read_csv("dataset.csv")

# Rassembler toutes les colonnes de symptômes
sympt_cols = [f"Symptom_{i}" for i in range(1, 18)]

# Transformer chaque ligne en une liste unique de symptômes (en supprimant les NaN)
df["liste_symptomes"] = df[sympt_cols].apply(lambda x: [s for s in x if pd.notna(s)], axis=1)

# Extraire la liste complète de tous les symptômes existants
tous_symptomes = sorted(set(sum(df["liste_symptomes"], [])))

# Créer les features binaires
for sympt in tous_symptomes:
    df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))

X = df[tous_symptomes]
y = df["Disease"]

# y contient les maladies sous forme de texte
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Tu peux afficher la correspondance maladie ↔ code
mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(mapping)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


{'(vertigo) Paroymsal  Positional Vertigo': np.int64(0), 'AIDS': np.int64(1), 'Acne': np.int64(2), 'Alcoholic hepatitis': np.int64(3), 'Allergy': np.int64(4), 'Arthritis': np.int64(5), 'Bronchial Asthma': np.int64(6), 'Cervical spondylosis': np.int64(7), 'Chicken pox': np.int64(8), 'Chronic cholestasis': np.int64(9), 'Common Cold': np.int64(10), 'Dengue': np.int64(11), 'Diabetes ': np.int64(12), 'Dimorphic hemmorhoids(piles)': np.int64(13), 'Drug Reaction': np.int64(14), 'Fungal infection': np.int64(15), 'GERD': np.int64(16), 'Gastroenteritis': np.int64(17), 'Heart attack': np.int64(18), 'Hepatitis B': np.int64(19), 'Hepatitis C': np.int64(20), 'Hepatitis D': np.int64(21), 'Hepatitis E': np.int64(22), 'Hypertension ': np.int64(23), 'Hyperthyroidism': np.int64(24), 'Hypoglycemia': np.int64(25), 'Hypothyroidism': np.int64(26), 'Impetigo': np.int64(27), 'Jaundice': np.int64(28), 'Malaria': np.int64(29), 'Migraine': np.int64(30), 'Osteoarthristis': np.int64(31), 'Paralysis (brain hemorrhag

  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda lst: int(sympt in lst))
  df[sympt] = df["liste_symptomes"].apply(lambda ls

## Entrainement du modèle

In [24]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

0,1,2
,objective,'multi:softprob'
,use_label_encoder,
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,early_stopping_rounds,
,enable_categorical,False


In [25]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0


## Conversion ONXX

In [26]:
# Force numeric feature names
model.get_booster().feature_names = [f"f{i}" for i in range(X.shape[1])]

# Now convert
from onnxmltools.convert import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType

initial_type = [('input', FloatTensorType([None, X.shape[1]]))]
onnx_model = convert_xgboost(model, initial_types=initial_type)

with open("healthcare_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())


## Quantization

In [29]:
import onnx

# Charger le modèle
model = onnx.load("healthcare_model.onnx")

# Vérifier les domaines présents
print([op.domain for op in model.opset_import])

# Ajouter le domaine 'ai.onnx' si absent
if not any(op.domain in ("", "ai.onnx") for op in model.opset_import):
    from onnx import OperatorSetIdProto
    opset = OperatorSetIdProto()
    opset.domain = "ai.onnx"
    opset.version = 17  # par exemple
    model.opset_import.append(opset)

# Sauvegarder la version corrigée
onnx.save(model, "healthcare_model_fixed.onnx")
print("✅ Modèle ONNX corrigé enregistré.")

# Relancer la quantification
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    model_input="healthcare_model_fixed.onnx",
    model_output="healthcare_model_quantized.onnx",
    weight_type=QuantType.QInt8
)




['ai.onnx.ml']
✅ Modèle ONNX corrigé enregistré.


## Génération du label.txt

In [5]:
# Créer un fichier label.txt avec la correspondance maladie ↔ code
with open("label.txt", "w") as f:
    for maladie, code in mapping.items():
        f.write(f"{maladie}: {code}\n")

print("Le fichier label.txt a été créé avec succès !")


Le fichier label.txt a été créé avec succès !


## Génération du symptom.txt

In [10]:

with open("symptomes.txt", "w", encoding="utf-8") as f:
    for sympt in tous_symptomes:
        f.write(sympt + "\n")

print(f"Fichier 'symptomes.txt' créé avec {len(tous_symptomes)} symptômes.")

Fichier 'symptomes.txt' créé avec 131 symptômes.
