In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, accuracy_score
from collections import Counter
import time
import os

# ---------------------------------------------------------------------------
# Run configuration (paths are specific to the author's Windows machine).
# ---------------------------------------------------------------------------
# Input CSV holding the prepared dataset.
CHEMIN_FICHIER_EQUILIBRE: str = "C:\\Users\\hp\\Desktop\\TPs\\DataMining\\preparing\\Final_Reduit30_70.csv"
# Folder that receives the generated PNG plots.
DOSSIER_PLOTS: str = "C:\\Users\\hp\\Desktop\\TPs\\DataMining\\Results\\plots_knn"
# Name of the target (label) column in the CSV.
COLONNE_CIBLE: str = "classe"
# Number of neighbours used by the k-NN classifier.
K_VOISINS: int = 5
# Fraction of the rows held out for the test split.
TEST_SIZE: float = 0.2

class KNNVectorized:
    """Brute-force k-nearest-neighbours classifier vectorised with NumPy.

    Distances between query rows and training rows are computed with matrix
    operations. Queries are processed in chunks of ``batch_size`` rows so the
    temporary distance matrix never holds more than
    ``batch_size x n_train`` entries.

    Args:
        k: Number of neighbours that vote for each query point. When ``k``
            exceeds the number of training rows, every training row votes.
        batch_size: Maximum number of query rows handled per distance
            computation. ``None`` or ``0`` processes all queries at once.
    """

    def __init__(self, k=5, batch_size=1024):
        self.k = k
        self.batch_size = batch_size
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store a copy of the training set as NumPy arrays.

        Args:
            X: Training features, shape ``(n_samples, n_features)``.
            y: Training labels, shape ``(n_samples,)``.

        Raises:
            ValueError: If ``X`` is empty or not 2-D, or if ``y`` is not a
                1-D array with one label per row of ``X``.
        """
        X = np.array(X, dtype=float)
        y = np.array(y)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError(
                "X must be a non-empty 2-D array of shape (n_samples, n_features)."
            )
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one label per row of X.")
        self.X_train = X
        self.y_train = y

    def _check_ready(self):
        """Raise a clear error when the model cannot predict yet."""
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("fit() must be called before predicting.")
        if self.k < 1:
            raise ValueError("k must be a positive integer.")

    def _euclidean_distance(self, X_test):
        """Return the ``(n_test, n_train)`` matrix of Euclidean distances.

        Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so the whole
        matrix comes from a single matrix product.
        """
        X_test_sq = np.sum(X_test**2, axis=1, keepdims=True)
        X_train_sq = np.sum(self.X_train**2, axis=1)
        dot_product = np.dot(X_test, self.X_train.T)
        sq_distances = X_test_sq + X_train_sq - 2 * dot_product
        # Rounding can push tiny squared distances below zero; clamp before sqrt.
        return np.sqrt(np.maximum(sq_distances, 0))

    def _get_k_nearest_labels(self, distances_matrix):
        """Return the labels of the k nearest training rows, nearest first.

        Args:
            distances_matrix: Array of shape ``(n_test, n_train)``.

        Returns:
            Array of shape ``(n_test, min(k, n_train))`` of training labels.
        """
        n_train = distances_matrix.shape[1]
        k = min(self.k, n_train)
        if k >= n_train:
            nearest = np.argsort(distances_matrix, axis=1, kind="stable")
        else:
            # Select the k smallest distances without sorting the whole row.
            candidates = np.argpartition(distances_matrix, k - 1, axis=1)[:, :k]
            # Sort candidates by training index first so that the stable sort
            # below breaks equal distances in favour of the lower index.
            candidates = np.sort(candidates, axis=1)
            candidate_dist = np.take_along_axis(distances_matrix, candidates, axis=1)
            order = np.argsort(candidate_dist, axis=1, kind="stable")
            nearest = np.take_along_axis(candidates, order, axis=1)
        return self.y_train[nearest]

    def _neighbor_labels(self, X_test):
        """Validate the queries and collect neighbour labels chunk by chunk."""
        self._check_ready()
        X_test = np.asarray(X_test, dtype=float)
        k = min(self.k, self.X_train.shape[0])
        if X_test.size == 0:
            return np.empty((0, k), dtype=self.y_train.dtype)
        if X_test.ndim == 1:
            # A flat vector is treated as one query point.
            X_test = X_test.reshape(1, -1)
        n_test = X_test.shape[0]
        step = max(int(self.batch_size), 1) if self.batch_size else n_test
        chunks = [
            self._get_k_nearest_labels(
                self._euclidean_distance(X_test[start:start + step])
            )
            for start in range(0, n_test, step)
        ]
        return np.concatenate(chunks, axis=0)

    def predict(self, X_test):
        """Predict the label of each query point by majority vote.

        On a tied vote, the label that appears first among the neighbours
        (ordered nearest first) wins.

        Args:
            X_test: Query features, shape ``(n_test, n_features)``; a single
                flat feature vector is also accepted.

        Returns:
            1-D array of predicted labels with the dtype of the training labels.

        Raises:
            RuntimeError: If ``fit`` has not been called.
            ValueError: If ``k`` is smaller than 1.
        """
        k_nearest_labels = self._neighbor_labels(X_test)
        votes = [
            Counter(row).most_common(1)[0][0] for row in k_nearest_labels.tolist()
        ]
        # Build the output with the training-label dtype. Letting NumPy infer
        # it from the first vote truncates longer string labels.
        return np.array(votes, dtype=self.y_train.dtype)

    def predict_proba(self, X_test):
        """Estimate class probabilities from the neighbour label frequencies.

        The positive class is the label equal to ``1``.

        Args:
            X_test: Query features, shape ``(n_test, n_features)``; a single
                flat feature vector is also accepted.

        Returns:
            Array of shape ``(n_test, 2)``: column 0 is the fraction of
            neighbours whose label is not 1, column 1 the fraction equal to 1.

        Raises:
            RuntimeError: If ``fit`` has not been called.
            ValueError: If ``k`` is smaller than 1.
        """
        k_nearest_labels = self._neighbor_labels(X_test)
        proba_class_1 = np.mean(k_nearest_labels == 1, axis=1)
        proba_class_0 = 1 - proba_class_1
        return np.column_stack((proba_class_0, proba_class_1))


# --- Fonctions de Plotting Individuelles ---

def plot_and_save_confusion_matrix(y_test, y_pred, k, folder):
    """Draw the confusion matrix as an annotated heatmap and save it as a PNG.

    Args:
        y_test: True labels, encoded as 0 (no fire) / 1 (fire).
        y_pred: Predicted labels, same encoding.
        k: Number of neighbours; used in the title and the file name.
        folder: Destination folder, created if it does not exist.

    Returns:
        Path of the saved image, ``<folder>/knn_k<k>_confusion_matrix.png``.
    """
    # Pin the class order so the matrix is always 2x2 and lines up with the
    # hard-coded tick labels, even when one class is absent from the data.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    if folder:
        os.makedirs(folder, exist_ok=True)
    filename = os.path.join(folder, f'knn_k{k}_confusion_matrix.png')

    fig = plt.figure(figsize=(7, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Pr√©dit Non-Feu', 'Pr√©dit Feu'],
                    yticklabels=['R√©el Non-Feu', 'R√©el Feu'],
                    cbar=False)
        plt.title(f'Matrice de Confusion k-NN (k={k})', fontsize=14)
        plt.xlabel('Pr√©diction', fontsize=12)
        plt.ylabel('R√©el', fontsize=12)
        plt.savefig(filename, bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, including when plotting or saving fails.
        plt.close(fig)
    return filename

def plot_and_save_roc_curve(y_test, y_proba, k, folder):
    """Draw the ROC curve, save it as a PNG and return its AUC.

    Args:
        y_test: True binary labels.
        y_proba: Scores for the positive class (here, P(class 1)).
        k: Number of neighbours; used in the title and the file name.
        folder: Destination folder, created if it does not exist.

    Returns:
        Tuple ``(filename, roc_auc)``: the path of the saved image
        (``<folder>/knn_k<k>_roc_curve.png``) and the area under the curve.
    """
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    if folder:
        os.makedirs(folder, exist_ok=True)
    filename = os.path.join(folder, f'knn_k{k}_roc_curve.png')

    fig = plt.figure(figsize=(7, 6))
    try:
        plt.plot(fpr, tpr, color='darkorange', lw=3,
                 label=f'Courbe ROC (AUC = {roc_auc:.4f})')
        # Diagonal reference line drawn for comparison with the classifier.
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Al√©atoire')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Taux de Faux Positifs (FPR)', fontsize=12)
        plt.ylabel('Taux de Vrais Positifs (TPR - Rappel)', fontsize=12)
        plt.title(f'Courbe ROC k-NN (k={k})', fontsize=14)
        plt.legend(loc="lower right")
        plt.savefig(filename, bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, including when plotting or saving fails.
        plt.close(fig)
    return filename, roc_auc


# --- Exécution Principale ---

def run_knn_vectorized_and_save_plots(k=K_VOISINS):
    """Train the k-NN classifier, print test metrics and save the plots.

    Loads the CSV at ``CHEMIN_FICHIER_EQUILIBRE``, removes the target and the
    ``latitude``/``longitude`` columns from the features, makes a stratified
    train/test split, standardises the features with statistics fitted on the
    training split only, then evaluates ``KNNVectorized`` on the test split.
    The confusion matrix and the ROC curve are written to ``DOSSIER_PLOTS``.

    Failures are reported on stdout instead of being raised, so a notebook
    cell calling this function does not end in a traceback.

    Args:
        k: Number of neighbours used by the classifier.

    Returns:
        None.
    """
    try:
        # Create the plot folder if needed; exist_ok keeps this safe if the
        # folder appears between the check and the creation.
        if not os.path.isdir(DOSSIER_PLOTS):
            os.makedirs(DOSSIER_PLOTS, exist_ok=True)
            print(f"Dossier cr√©√© : {DOSSIER_PLOTS}")

        # 1. Data preparation
        print("Chargement des donn√©es et standardisation...")
        try:
            df_final = pd.read_csv(CHEMIN_FICHIER_EQUILIBRE)
        except FileNotFoundError:
            # Handled here so that only a missing input CSV produces the
            # "input file not found" message.
            print("‚ùå Erreur : Le fichier d'entr√©e n'a pas √©t√© trouv√©.")
            return

        X = df_final.drop(columns=[COLONNE_CIBLE, 'latitude', 'longitude'], errors='ignore')
        y = df_final[COLONNE_CIBLE]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=42, stratify=y
        )

        # Fit the scaler on the training split only, then reuse it for the
        # test split.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # 2. Training and predictions
        knn_model_vectorized = KNNVectorized(k=k)
        knn_model_vectorized.fit(X_train_scaled, y_train.values)

        y_pred = knn_model_vectorized.predict(X_test_scaled)
        y_proba = knn_model_vectorized.predict_proba(X_test_scaled)[:, 1]  # P(class 1)

        # 3. Metrics
        print("\n" + "="*50)
        print(f"  R√âSULTATS DE CLASSIFICATION k-NN VECTORIS√â (k={k})")
        print("="*50)
        print(f"Pr√©cision globale (Accuracy): {accuracy_score(y_test, y_pred)*100:.2f}%\n")
        print("Rapport de Classification:\n", classification_report(y_test, y_pred, target_names=['Non-Feu (0)', 'Feu (1)']))

        # 4. Plots, each saved to its own file
        cm_filename = plot_and_save_confusion_matrix(y_test, y_pred, k, DOSSIER_PLOTS)
        roc_filename, roc_auc = plot_and_save_roc_curve(y_test, y_proba, k, DOSSIER_PLOTS)

        print(f"\n‚úÖ Matrice de Confusion sauvegard√©e : {cm_filename}")
        print(f"‚úÖ Courbe ROC sauvegard√©e : {roc_filename}")
        print(f"L'AUC (Area Under the Curve) est de : {roc_auc:.4f}")

    except Exception as e:
        # Top-level boundary of the notebook cell: report and stop.
        print(f"\n‚ùå Une erreur inattendue est survenue : {e}")

# Run the full k-NN pipeline with the configured number of neighbours.
run_knn_vectorized_and_save_plots(k=K_VOISINS)

Chargement des données et standardisation...

  RÉSULTATS DE CLASSIFICATION k-NN VECTORISÉ (k=5)
Précision globale (Accuracy): 92.35%

Rapport de Classification:
               precision    recall  f1-score   support

 Non-Feu (0)       0.96      0.93      0.94      6189
     Feu (1)       0.84      0.92      0.88      2652

    accuracy                           0.92      8841
   macro avg       0.90      0.92      0.91      8841
weighted avg       0.93      0.92      0.92      8841


✅ Matrice de Confusion sauvegardée : C:\Users\hp\Desktop\TPs\DataMining\Results\plots_knn\knn_k5_confusion_matrix.png
✅ Courbe ROC sauvegardée : C:\Users\hp\Desktop\TPs\DataMining\Results\plots_knn\knn_k5_roc_curve.png
L'AUC (Area Under the Curve) est de : 0.9699


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, accuracy_score, f1_score
from collections import Counter
import time
import os

# ---------------------------------------------------------------------------
# Run configuration (the path is specific to the author's Windows machine).
# ---------------------------------------------------------------------------
# Input CSV holding the prepared dataset.
CHEMIN_FICHIER_EQUILIBRE: str = "C:\\Users\\hp\\Desktop\\TPs\\DataMining\\preparing\\Final_Reduit30_70.csv"
# Name of the target (label) column in the CSV.
COLONNE_CIBLE: str = "classe"
# Number of neighbours used by the k-NN classifier for this diagnosis.
K_VOISINS: int = 1
# Fraction of the rows held out for the test split.
TEST_SIZE: float = 0.2

class KNNVectorized:
    """Brute-force k-nearest-neighbours classifier vectorised with NumPy.

    Distances between query rows and training rows are computed with matrix
    operations. Queries are processed in chunks of ``batch_size`` rows so the
    temporary distance matrix never holds more than
    ``batch_size x n_train`` entries.

    Args:
        k: Number of neighbours that vote for each query point. When ``k``
            exceeds the number of training rows, every training row votes.
        batch_size: Maximum number of query rows handled per distance
            computation. ``None`` or ``0`` processes all queries at once.
    """

    def __init__(self, k=5, batch_size=1024):
        self.k = k
        self.batch_size = batch_size
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store a copy of the training set as NumPy arrays.

        Args:
            X: Training features, shape ``(n_samples, n_features)``.
            y: Training labels, shape ``(n_samples,)``.

        Raises:
            ValueError: If ``X`` is empty or not 2-D, or if ``y`` is not a
                1-D array with one label per row of ``X``.
        """
        X = np.array(X, dtype=float)
        y = np.array(y)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError(
                "X must be a non-empty 2-D array of shape (n_samples, n_features)."
            )
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one label per row of X.")
        self.X_train = X
        self.y_train = y

    def _check_ready(self):
        """Raise a clear error when the model cannot predict yet."""
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("fit() must be called before predicting.")
        if self.k < 1:
            raise ValueError("k must be a positive integer.")

    def _euclidean_distance(self, X_test):
        """Return the ``(n_test, n_train)`` matrix of Euclidean distances.

        Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so the whole
        matrix comes from a single matrix product.
        """
        X_test_sq = np.sum(X_test**2, axis=1, keepdims=True)
        X_train_sq = np.sum(self.X_train**2, axis=1)
        dot_product = np.dot(X_test, self.X_train.T)
        sq_distances = X_test_sq + X_train_sq - 2 * dot_product
        # Rounding can push tiny squared distances below zero; clamp before sqrt.
        return np.sqrt(np.maximum(sq_distances, 0))

    def _get_k_nearest_labels(self, distances_matrix):
        """Return the labels of the k nearest training rows, nearest first.

        Args:
            distances_matrix: Array of shape ``(n_test, n_train)``.

        Returns:
            Array of shape ``(n_test, min(k, n_train))`` of training labels.
        """
        n_train = distances_matrix.shape[1]
        k = min(self.k, n_train)
        if k >= n_train:
            nearest = np.argsort(distances_matrix, axis=1, kind="stable")
        else:
            # Select the k smallest distances without sorting the whole row.
            candidates = np.argpartition(distances_matrix, k - 1, axis=1)[:, :k]
            # Sort candidates by training index first so that the stable sort
            # below breaks equal distances in favour of the lower index.
            candidates = np.sort(candidates, axis=1)
            candidate_dist = np.take_along_axis(distances_matrix, candidates, axis=1)
            order = np.argsort(candidate_dist, axis=1, kind="stable")
            nearest = np.take_along_axis(candidates, order, axis=1)
        return self.y_train[nearest]

    def _neighbor_labels(self, X_test):
        """Validate the queries and collect neighbour labels chunk by chunk."""
        self._check_ready()
        X_test = np.asarray(X_test, dtype=float)
        k = min(self.k, self.X_train.shape[0])
        if X_test.size == 0:
            return np.empty((0, k), dtype=self.y_train.dtype)
        if X_test.ndim == 1:
            # A flat vector is treated as one query point.
            X_test = X_test.reshape(1, -1)
        n_test = X_test.shape[0]
        step = max(int(self.batch_size), 1) if self.batch_size else n_test
        chunks = [
            self._get_k_nearest_labels(
                self._euclidean_distance(X_test[start:start + step])
            )
            for start in range(0, n_test, step)
        ]
        return np.concatenate(chunks, axis=0)

    def predict(self, X_test):
        """Predict the label of each query point by majority vote.

        On a tied vote, the label that appears first among the neighbours
        (ordered nearest first) wins.

        Args:
            X_test: Query features, shape ``(n_test, n_features)``; a single
                flat feature vector is also accepted.

        Returns:
            1-D array of predicted labels with the dtype of the training labels.

        Raises:
            RuntimeError: If ``fit`` has not been called.
            ValueError: If ``k`` is smaller than 1.
        """
        k_nearest_labels = self._neighbor_labels(X_test)
        votes = [
            Counter(row).most_common(1)[0][0] for row in k_nearest_labels.tolist()
        ]
        # Build the output with the training-label dtype. Letting NumPy infer
        # it from the first vote truncates longer string labels.
        return np.array(votes, dtype=self.y_train.dtype)

    def predict_proba(self, X_test):
        """Estimate class probabilities from the neighbour label frequencies.

        The positive class is the label equal to ``1`` (fire).

        Args:
            X_test: Query features, shape ``(n_test, n_features)``; a single
                flat feature vector is also accepted.

        Returns:
            Array of shape ``(n_test, 2)``: column 0 is the fraction of
            neighbours whose label is not 1, column 1 the fraction equal to 1.

        Raises:
            RuntimeError: If ``fit`` has not been called.
            ValueError: If ``k`` is smaller than 1.
        """
        k_nearest_labels = self._neighbor_labels(X_test)
        proba_class_1 = np.mean(k_nearest_labels == 1, axis=1)
        proba_class_0 = 1 - proba_class_1
        return np.column_stack((proba_class_0, proba_class_1))


# --- Exécution Principale pour le Diagnostic ---

def run_knn_overfitting_diagnosis(k=K_VOISINS):
    """Compare k-NN metrics on the training and test splits to spot overfitting.

    Loads the CSV at ``CHEMIN_FICHIER_EQUILIBRE``, removes the target and the
    ``latitude``/``longitude`` columns from the features, makes a stratified
    train/test split, standardises the features with statistics fitted on the
    training split only, then predicts on both splits. Accuracy, F1-score
    (class 1) and AUC are printed side by side with their train-test gap,
    followed by a verdict based on fixed gap thresholds.

    Failures are reported on stdout instead of being raised, so a notebook
    cell calling this function does not end in a traceback.

    Args:
        k: Number of neighbours used by the classifier.

    Returns:
        None.
    """
    try:
        # 1. Data preparation
        print("Chargement des donn√©es et standardisation...")
        try:
            df_final = pd.read_csv(CHEMIN_FICHIER_EQUILIBRE)
        except FileNotFoundError:
            # Handled here so that only a missing input CSV produces the
            # "input file not found" message.
            print("‚ùå Erreur : Le fichier d'entr√©e n'a pas √©t√© trouv√©.")
            return

        X = df_final.drop(columns=[COLONNE_CIBLE, 'latitude', 'longitude'], errors='ignore')
        y = df_final[COLONNE_CIBLE]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=42, stratify=y
        )

        # Fit the scaler on the training split only, then reuse it for the
        # test split.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # 2. Training and predictions
        knn_model_vectorized = KNNVectorized(k=k)
        knn_model_vectorized.fit(X_train_scaled, y_train.values)

        # Predictions on the TRAINING split (diagnosis).
        # NOTE(review): the model stores X_train_scaled, so every training
        # point is queried against a set that contains itself at distance 0.
        # Train metrics are therefore optimistic, most of all for k=1.
        print("\nCalcul des m√©triques sur l'entra√Ænement...")
        start_time_train = time.perf_counter()
        y_pred_train = knn_model_vectorized.predict(X_train_scaled)
        y_proba_train = knn_model_vectorized.predict_proba(X_train_scaled)[:, 1]
        time_train = time.perf_counter() - start_time_train

        # Predictions on the TEST split (validation).
        print("Calcul des m√©triques sur le test...")
        start_time_test = time.perf_counter()
        y_pred_test = knn_model_vectorized.predict(X_test_scaled)
        y_proba_test = knn_model_vectorized.predict_proba(X_test_scaled)[:, 1]
        time_test = time.perf_counter() - start_time_test

        # 3. Metrics (each ROC curve is computed once)
        f1_train = f1_score(y_train, y_pred_train, pos_label=1)
        acc_train = accuracy_score(y_train, y_pred_train)
        fpr_train, tpr_train, _ = roc_curve(y_train, y_proba_train)
        auc_train = auc(fpr_train, tpr_train)

        f1_test = f1_score(y_test, y_pred_test, pos_label=1)
        acc_test = accuracy_score(y_test, y_pred_test)
        fpr_test, tpr_test, _ = roc_curve(y_test, y_proba_test)
        auc_test = auc(fpr_test, tpr_test)

        # 4. Report; rows use the same column widths as the header
        print("\n" + "="*70)
        print(f"  DIAGNOSTIC D'OVERFITTING POUR k-NN (k={k})")
        print("="*70)

        print(f"{'METRIQUE':<20} {'ENSEMBLE TRAIN':<20} {'ENSEMBLE TEST':<20} {'DIFFERENCE':<10}")
        print("-" * 70)

        # Accuracy row; the timings cover predict + predict_proba per split.
        acc_diff = acc_train - acc_test
        acc_train_cell = f"{acc_train*100:.2f}% ({time_train:.2f}s)"
        acc_test_cell = f"{acc_test*100:.2f}% ({time_test:.2f}s)"
        print(f"{'Accuracy Globale':<20} {acc_train_cell:<20} {acc_test_cell:<20} {acc_diff*100:.2f}%")

        # F1-score row (class 1)
        f1_diff = f1_train - f1_test
        print(f"{'F1-score (Feu)':<20} {f1_train:<20.4f} {f1_test:<20.4f} {f1_diff:.4f}")

        # AUC row
        auc_diff = auc_train - auc_test
        print(f"{'AUC':<20} {auc_train:<20.4f} {auc_test:<20.4f} {auc_diff:.4f}")
        print("-" * 70)

        # 5. Verdict
        print("\n### üßê Conclusion du Diagnostic ###")

        if f1_diff > 0.05 or auc_diff > 0.03:  # gap thresholds chosen by the author
            print(f"üö© ALERTE OVERFITTING : La diff√©rence d'AUC est de {auc_diff:.4f} et de F1-score est de {f1_diff:.4f}.")
            print("Le mod√®le est trop complexe (k est trop petit ou les donn√©es sont bruit√©es) et m√©morise l'ensemble d'entra√Ænement.")
            print("Action recommand√©e : Augmenter la valeur de k et relancer la Grid Search.")
        elif 0.01 <= f1_diff <= 0.05:
            print("‚ö†Ô∏è FAIBLE TENDANCE √Ä L'OVERFITTING : La performance est l√©g√®rement meilleure sur l'entra√Ænement.")
            print("Ceci est normal. L'optimisation (Grid Search) devrait trouver le meilleur compromis.")
        else:
            print("‚úÖ BON AJUSTEMENT (GOOD FIT) : La performance sur les ensembles de Train et Test est tr√®s similaire.")
            # Report the measured test AUC instead of a hard-coded value.
            print(f"Le mod√®le g√©n√©ralise bien, et la haute performance (AUC‚âà{auc_test:.2f}) semble robuste.")

    except Exception as e:
        # Top-level boundary of the notebook cell: report and stop.
        print(f"\n‚ùå Une erreur inattendue est survenue : {e}")

# Run the overfitting diagnosis with the configured number of neighbours.
run_knn_overfitting_diagnosis(k=K_VOISINS)

Chargement des données et standardisation...

Calcul des métriques sur l'entraînement...
Calcul des métriques sur le test...

  DIAGNOSTIC D'OVERFITTING POUR k-NN (k=1)
METRIQUE             ENSEMBLE TRAIN       ENSEMBLE TEST        DIFFERENCE
----------------------------------------------------------------------
Accuracy Globale     99.97% (1427.39s) 93.54% (31.79s) 6.43%
F1-score (Feu)       0.9995 0.8946 0.1049
AUC                  0.9996 0.9292 0.0704
----------------------------------------------------------------------

### 🧐 Conclusion du Diagnostic ###
🚩 ALERTE OVERFITTING : La différence d'AUC est de 0.0704 et de F1-score est de 0.1049.
Le modèle est trop complexe (k est trop petit ou les données sont bruitées) et mémorise l'ensemble d'entraînement.
Action recommandée : Augmenter la valeur de k et relancer la Grid Search.
