In [8]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    average_precision_score,
)
import joblib

# === 1. Charger les données ===
df = pd.read_csv(r"C:\Users\REINA\BRCA2_Pathogenicity/data/processed/combined/brca2_combined_features_expanded.csv")

# === 2. Extraire X et y ===
X = df.iloc[:, 2:].values  # features numériques
y = df["Label"].astype(int).values

# === 3. Diviser les données ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# === 4. Normalisation ===
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# === 5. Entraîner le classifieur ===
clf = MLPClassifier(
    hidden_layer_sizes=(512, 128),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    batch_size=32,
    max_iter=300,
    random_state=42
)

clf.fit(X_train, y_train)

# === 6. Évaluation ===
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]  # Probabilités pour la classe positive

print("✅ Accuracy :", accuracy_score(y_test, y_pred))
print("\n📊 Rapport de classification :")
print(classification_report(y_test, y_pred))

# Calcul AUROC et AUPR
auroc = roc_auc_score(y_test, y_proba)
aupr = average_precision_score(y_test, y_proba)

print(f"🔹 AUROC : {auroc:.4f}")
print(f"🔹 AUPR  : {aupr:.4f}")

# === 7. Sauvegarde ===
model_dir = r"C:\Users\REINA\BRCA2_Pathogenicity\models"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(clf, os.path.join(model_dir, "brca2_mlp_model_expanded.pkl"))
joblib.dump(scaler, os.path.join(model_dir, "brca2_scaler_expanded.pkl"))
joblib.dump(df, os.path.join(model_dir, "brca2_dataframe_expanded.pkl"))

print(f"✅ Modèle, scaler et dataframe sauvegardés dans {model_dir}")

# === 8. Chargement des objets sauvegardés ===
clf = joblib.load(os.path.join(model_dir, "brca2_mlp_model_expanded.pkl"))
scaler = joblib.load(os.path.join(model_dir, "brca2_scaler_expanded.pkl"))
df = joblib.load(os.path.join(model_dir, "brca2_dataframe_expanded.pkl"))

# === 9. Fonction prédiction à partir du variant ===
def predict_label_from_variant(variant_str):
    row = df.loc[df["Variant"] == variant_str]
    if row.empty:
        print(f"Variant '{variant_str}' non trouvé dans le dataset.")
        return None
    features_vector = row.iloc[:, 2:].values
    features_scaled = scaler.transform(features_vector)
    prediction = clf.predict(features_scaled)
    return prediction[0]

# === Exemple d’utilisation ===
variant_example = "c.53G>A"
label_pred = predict_label_from_variant(variant_example)
print(f"Prédiction du label pour le variant {variant_example} : {label_pred}")

variant_example = "c.9285C>G"
label_pred = predict_label_from_variant(variant_example)
print(f"Prédiction du label pour le variant {variant_example} : {label_pred}")


✅ Accuracy : 0.7608695652173914

📊 Rapport de classification :
              precision    recall  f1-score   support

           0       0.84      0.81      0.83        32
           1       0.60      0.64      0.62        14

    accuracy                           0.76        46
   macro avg       0.72      0.73      0.72        46
weighted avg       0.77      0.76      0.76        46

🔹 AUROC : 0.8036
🔹 AUPR  : 0.7348
✅ Modèle, scaler et dataframe sauvegardés dans C:\Users\REINA\BRCA2_Pathogenicity\models
Prédiction du label pour le variant c.53G>A : 0
Prédiction du label pour le variant c.9285C>G : 1
