In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset-pre-traitement-sift-bovw-pca/y_labels.npy
/kaggle/input/dataset-pre-traitement-sift-bovw-pca/image_names.pkl
/kaggle/input/dataset-pre-traitement-sift-bovw-pca/label_names.pkl
/kaggle/input/dataset-pre-traitement-sift-bovw-pca/y.npy
/kaggle/input/dataset-pre-traitement-sift-bovw-pca/X_pca.npy


In [3]:
import numpy as np
import pickle
import time
import warnings
import pandas as pd
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, hamming_loss,
    classification_report, multilabel_confusion_matrix
)

In [4]:
# -------------------------------
# 1️ Chargement des données
# -------------------------------
X = np.load("/kaggle/input/dataset-pre-traitement-sift-bovw-pca/X_pca.npy")
y = np.load("/kaggle/input/dataset-pre-traitement-sift-bovw-pca/y.npy")

with open("/kaggle/input/dataset-pre-traitement-sift-bovw-pca/image_names.pkl", "rb") as f:
    label_names = pickle.load(f)

print(X.shape, y.shape)

(8091, 100) (8091, 495)


In [5]:
# -------------------------------
# 2️ Standardisation
# -------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -------------------------------
# 3️ Division Train/Test
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print(f" X_train : {X_train.shape} | X_test : {X_test.shape}")
print(f" X_train : {y_train.shape} | y_test : {X_test.shape}")

 X_train : (6472, 100) | X_test : (1619, 100)
 X_train : (6472, 495) | y_test : (1619, 100)


In [7]:
# -------------------------------
# 4️ Configuration Knn
# -------------------------------
base_knn = KNeighborsClassifier(n_jobs=1)
knn_model = OneVsRestClassifier(base_knn)

param_grid = [{
    "estimator__n_neighbors": [5, 7],
    "estimator__weights": ["distance"],
    "estimator__metric": ["euclidean"],
    "estimator__p": [1, 2],
    "estimator__algorithm": ["brute"]
}]

grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=8,
    scoring="f1_macro",
    verbose=2,
    n_jobs=1,
    refit=True
)


In [8]:
# -------------------------------
# 5️ Entraînement
# -------------------------------
print("\n DÉMARRAGE DE L'ENTRAÎNEMENT GRID SEARCH")
start = time.time()
try:
    grid_search.fit(X_train, y_train)
    best_knn = grid_search.best_estimator_
    print("Entraînement réussi avec GridSearchCV")
except Exception as e:
    print(f" Erreur pendant le GridSearch : {e}")
    print(" Passage à une configuration simple par défaut.")
    base_knn = KNeighborsClassifier(
        n_neighbors=5,
        weights="distance",
        metric="euclidean",
        n_jobs=-1
    )
    best_knn = OneVsRestClassifier(base_knn)
    best_knn.fit(X_train, y_train)

end = time.time()
print(f" Entraînement terminé en {end - start:.2f} secondes")


 DÉMARRAGE DE L'ENTRAÎNEMENT GRID SEARCH
Fitting 8 folds for each of 4 candidates, totalling 32 fits
[CV] END estimator__algorithm=brute, estimator__metric=euclidean, estimator__n_neighbors=5, estimator__p=1, estimator__weights=distance; total time=  18.6s
[CV] END estimator__algorithm=brute, estimator__metric=euclidean, estimator__n_neighbors=5, estimator__p=1, estimator__weights=distance; total time=  18.4s
[CV] END estimator__algorithm=brute, estimator__metric=euclidean, estimator__n_neighbors=5, estimator__p=1, estimator__weights=distance; total time=  19.0s
[CV] END estimator__algorithm=brute, estimator__metric=euclidean, estimator__n_neighbors=5, estimator__p=1, estimator__weights=distance; total time=  18.7s
[CV] END estimator__algorithm=brute, estimator__metric=euclidean, estimator__n_neighbors=5, estimator__p=1, estimator__weights=distance; total time=  18.2s
[CV] END estimator__algorithm=brute, estimator__metric=euclidean, estimator__n_neighbors=5, estimator__p=1, estimator_

In [9]:
# -------------------------------
# 6️ Évaluation
# -------------------------------
y_pred = best_knn.predict(X_test)

metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision_micro": precision_score(y_test, y_pred, average="micro", zero_division=0),
    "Recall_micro": recall_score(y_test, y_pred, average="micro", zero_division=0),
    "F1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0),
    "F1_micro": f1_score(y_test, y_pred, average="micro", zero_division=0),
    "Hamming_Loss": hamming_loss(y_test, y_pred)
}

print("\n MÉTRIQUES DU MODÈLE KNN :")
for k, v in metrics.items():
    print(f"{k:20s} : {v:.4f}")


 MÉTRIQUES DU MODÈLE KNN :
Accuracy             : 0.0006
Precision_micro      : 0.2235
Recall_micro         : 0.0162
F1_macro             : 0.0015
F1_micro             : 0.0302
Hamming_Loss         : 0.0091
