In [3]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [4]:
# ================================================================================
# CELL 1 - DATA LOADING
# ================================================================================

import numpy as np
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, f1_score, hamming_loss, accuracy_score, recall_score, precision_score
from sklearn.cluster import KMeans as SegmentationKMeans
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter

print("Loading data...")

# Single source of truth for the pre-processed dataset location, so the three
# artifacts below cannot accidentally be read from different directories.
DATA_DIR = "/kaggle/input/dataset-pre-traitement-sift-bovw-pca"

X = np.load(os.path.join(DATA_DIR, "X_pca.npy"))
y = np.load(os.path.join(DATA_DIR, "y.npy"))

# NOTE(security): pickle.load can execute arbitrary code; only load this file
# from a dataset you trust.
with open(os.path.join(DATA_DIR, "label_names.pkl"), "rb") as f:
    label_names = pickle.load(f)

# Fail fast if the three artifacts are out of sync: every later cell indexes
# y columns with label_names and pairs rows of X with rows of y.
if X.shape[0] != y.shape[0]:
    raise ValueError(
        f"X has {X.shape[0]} rows but y has {y.shape[0]} rows"
    )
if y.ndim != 2 or y.shape[1] != len(label_names):
    raise ValueError(
        f"y has shape {y.shape} but there are {len(label_names)} label names"
    )

print("Data loaded successfully!")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Number of labels: {len(label_names)}")

Loading data...
Data loaded successfully!
X shape: (8091, 100)
y shape: (8091, 495)
Number of labels: 495


In [5]:
# ================================================================================
# CELL 2 - TRAIN / TEST SPLIT
# ================================================================================

# Relies on the variables loaded in cell 1: X, y (label_names must be loaded too).

# 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

n_train_rows, n_feature_cols = X_train.shape
print(f"Training set size: {n_train_rows}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Number of features: {n_feature_cols}")

Training set size: 6472
Test set size: 1619
Number of features: 100


In [None]:
# ================================================================================
# CELL 3 - NORMALISATION AND TRAINING
# ================================================================================


import time

SECTION_RULE = "=" * 60

print("Normalizing features...")
scaler = StandardScaler()
# Scaling statistics are learned on the training split only and then reused
# unchanged on the test split.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features normalized!")

print("\n" + SECTION_RULE)
print("STARTING GRIDSEARCH")
print(SECTION_RULE)

# One gradient-boosting classifier is trained per label (one-vs-rest); the
# per-label fits run in parallel, the grid search itself runs sequentially.
gb_classifier = GradientBoostingClassifier(verbose=0, random_state=42)
ovr_model = OneVsRestClassifier(gb_classifier, n_jobs=-1)

# The grid contains a single candidate, so the search only produces a 5-fold
# cross-validated score for this configuration before the final refit.
param_grid = dict(
    estimator__n_estimators=[200],
    estimator__max_depth=[6],
    estimator__learning_rate=[0.08],
)

f1_scorer = make_scorer(f1_score, average='samples', zero_division=0)

grid_search = GridSearchCV(
    ovr_model,
    param_grid,
    scoring=f1_scorer,
    n_jobs=1,
    cv=5,
    verbose=2,
    return_train_score=True,
)

print("\nStarting GridSearch...")
start_time = time.time()
grid_search.fit(X_train_scaled, y_train)
# Wall-clock time of the whole search (all folds plus the final refit).
train_time = time.time() - start_time
print("GridSearch completed!")

model = grid_search.best_estimator_

print("\n" + SECTION_RULE)
print("BEST PARAMETERS FOUND")
print(SECTION_RULE)
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")
print(f"\nBest CV F1 Score: {grid_search.best_score_:.4f}")
print(f"Training time: {train_time/60:.2f} minutes")

Normalizing features...
Features normalized!

STARTING GRIDSEARCH

Starting GridSearch...
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# CELL 4 - INITIAL EVALUATION
# ================================================================================

# Relies on variables from the previous cells: model, X_test_scaled, y_test.

print("Making predictions...")
y_pred_initial = model.predict(X_test_scaled)

# Keyword arguments shared by every per-sample averaged metric.
samples_kwargs = dict(average='samples', zero_division=0)

accuracy = accuracy_score(y_test, y_pred_initial)
f1 = f1_score(y_test, y_pred_initial, **samples_kwargs)
recall = recall_score(y_test, y_pred_initial, **samples_kwargs)
precision = precision_score(y_test, y_pred_initial, **samples_kwargs)
hamming = hamming_loss(y_test, y_pred_initial)

print("\n" + "="*60)
print("INITIAL EVALUATION")
print("="*60)
for metric_name, metric_value in (
    ("\nAccuracy", accuracy),
    ("F1 Score (samples)", f1),
    ("Recall (samples)", recall),
    ("Precision (samples)", precision),
    ("Hamming Loss", hamming),
):
    print(f"{metric_name}: {metric_value:.4f}")

# Label-averaged (macro) and globally pooled (micro) F1 for comparison.
f1_macro = f1_score(y_test, y_pred_initial, average='macro', zero_division=0)
f1_micro = f1_score(y_test, y_pred_initial, average='micro', zero_division=0)
print(f"\nF1 Score (macro): {f1_macro:.4f}")
print(f"F1 Score (micro): {f1_micro:.4f}")

In [None]:
# ================================================================================
# CELL 5 - THRESHOLD OPTIMISATION AND PROBABILITIES
# ================================================================================

# Relies on variables from the previous cells:
# model, X_test_scaled, y_test, label_names must be defined.

print("Initializing MultiLabelBinarizer...")
# The binarizer is never fitted: classes_ is assigned directly so that later
# cells can turn column indices into label strings via mlb.classes_[indices].
# NOTE(review): presumably label_names is ordered like the columns of y - confirm.
mlb = MultiLabelBinarizer()
mlb.classes_ = np.array(label_names)

def predict_with_threshold(model, X, threshold=0.3):
    """Predict multilabel outputs by thresholding per-label probabilities.

    Parameters
    ----------
    model : object
        Fitted one-vs-rest style model exposing ``estimators_``, a sequence
        with one binary estimator per label.
    X : array-like of shape (n_samples, n_features)
        Samples to score; only ``X.shape[0]`` is read here, the rest is
        passed to each estimator's ``predict_proba``.
    threshold : float, default=0.3
        A label is predicted when its probability is ``>= threshold``.

    Returns
    -------
    y_pred : ndarray of int, shape (n_samples, n_labels)
        Binary indicator matrix.
    y_proba : ndarray of float, shape (n_samples, n_labels)
        Positive-class probability per label. Columns of estimators without
        ``predict_proba`` stay at 0.
    """
    n_samples = X.shape[0]
    estimators = model.estimators_
    y_proba = np.zeros((n_samples, len(estimators)))

    for i, est in enumerate(estimators):
        if not hasattr(est, 'predict_proba'):
            # No probability available: the column keeps its 0 default.
            continue

        proba = np.asarray(est.predict_proba(X))
        if proba.shape[1] == 2:
            # Usual binary case: second column is the positive class.
            y_proba[:, i] = proba[:, 1]
            continue

        # Single-column output: the estimator saw only one class while
        # fitting. If that class is the negative one (0), the column holds
        # P(label absent), so the positive probability must stay 0 instead
        # of being copied (which would predict the label for every sample).
        classes = getattr(est, 'classes_', None)
        saw_only_negative = (
            proba.shape[1] == 1
            and classes is not None
            and len(classes) == 1
            and classes[0] == 0
        )
        if not saw_only_negative:
            y_proba[:, i] = proba[:, 0]

    y_pred = (y_proba >= threshold).astype(int)
    return y_pred, y_proba

print("\n" + "="*60)
print("FINDING BEST THRESHOLD")
print("="*60)

thresholds = [0.2, 0.3, 0.4, 0.5]
best_threshold = 0.3  # fallback when no candidate scores above 0
best_recall = 0
best_f1 = 0
results = []

# The probabilities do not depend on the threshold, so every per-label
# estimator is evaluated once and the matrix is re-thresholded per candidate.
_, y_proba = predict_with_threshold(model, X_test_scaled)

# NOTE(review): the threshold is tuned on the test split, so the scores
# reported for the selected threshold are optimistic; a separate validation
# split would give an unbiased estimate.
for t in thresholds:
    y_pred_t = (y_proba >= t).astype(int)
    r = recall_score(y_test, y_pred_t, average='samples', zero_division=0)
    p = precision_score(y_test, y_pred_t, average='samples', zero_division=0)
    f = f1_score(y_test, y_pred_t, average='samples', zero_division=0)
    results.append((t, r, p, f))

    print(f"Threshold {t}: Recall={r:.4f}, Precision={p:.4f}, F1={f:.4f}")

    # Recall can only fall as the threshold rises, so selecting on recall
    # would always return the smallest candidate. Select on samples-F1
    # instead, the same metric the grid search optimises.
    if f > best_f1:
        best_f1 = f
        best_recall = r
        best_threshold = t

print(f"\nBest threshold: {best_threshold} with F1 {best_f1:.4f} (recall {best_recall:.4f})")

print("\nComputing probabilities with best threshold...")
y_pred = (y_proba >= best_threshold).astype(int)
print("Probabilities computed!")

In [None]:
# ================================================================================
# CELL 6 - SENTENCE GENERATION AND DISPLAY
# ================================================================================

# Relies on variables from the previous cells:
# y_pred, y_proba, y_test, label_names, mlb must be defined.

def generate_sentence_from_keywords(keywords, proba=None):
    """Build a short English caption from a list of keywords.

    Parameters
    ----------
    keywords : sequence of str
        Keywords to mention in the caption.
    proba : sequence of float, optional
        Score per keyword; when given, keywords are ordered by decreasing
        score (ties keep their input order) before the caption is built.

    Returns
    -------
    str
        A fixed message when ``keywords`` is empty, otherwise a template
        sentence mentioning at most the first four keywords.
    """
    if len(keywords) == 0:
        return "No keywords detected"

    if proba is not None:
        ranked = sorted(zip(keywords, proba), key=lambda pair: pair[1], reverse=True)
        keywords = [word for word, _score in ranked]

    count = len(keywords)
    if count == 1:
        return f"An image of {keywords[0]}"
    if count == 2:
        return f"An image showing {keywords[0]} and {keywords[1]}"
    if count == 3:
        return f"An image of {keywords[0]} with {keywords[1]} and {keywords[2]}"

    # Four or more keywords: only the first four are mentioned.
    leading = ', '.join(keywords[:3])
    return f"An image featuring {leading} and {keywords[3]}"

IMAGE_DIR = "/kaggle/input/flickr8k/Images"
valid_image_names = os.listdir(IMAGE_DIR)

# Position i of the test arrays is NOT row i of the full dataset: the split in
# cell 2 shuffles the rows. Re-running the same split (same size, same seed)
# on the row numbers recovers, for every test position, the row of the full
# dataset it came from, so the matching image file can be looked up.
_, test_row_ids = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
if not np.array_equal(X[test_row_ids], X_test):
    raise RuntimeError(
        "Recovered test row ids do not reproduce X_test; the split parameters "
        "here must match the ones used in cell 2."
    )
# NOTE(review): this assumes row k of X was computed from valid_image_names[k],
# i.e. the features were built in os.listdir(IMAGE_DIR) order - confirm against
# the pre-processing notebook.
if len(valid_image_names) != len(X):
    print(f"Warning: {len(valid_image_names)} image files but {len(X)} feature rows; "
          "image/prediction pairing may be wrong.")


def predict_and_display_improved(idx, X_test, y_test, y_pred, y_proba):
    """Show one test image next to its predicted keywords and caption.

    Parameters
    ----------
    idx : int
        Position of the sample inside the test split.
    X_test : array-like
        Unused; kept so existing calls keep working.
    y_test : ndarray
        Ground-truth indicator matrix of the test split.
    y_pred : ndarray
        Predicted indicator matrix of the test split.
    y_proba : ndarray
        Predicted probabilities of the test split.

    Returns
    -------
    tuple or None
        ``(pred_labels, caption, true_labels)``, or ``None`` when the image
        file cannot be read.
    """
    pred_idx = np.where(y_pred[idx] == 1)[0]
    pred_labels = mlb.classes_[pred_idx]
    pred_probs = y_proba[idx][pred_idx]

    true_idx = np.where(y_test[idx] == 1)[0]
    true_labels = mlb.classes_[true_idx]

    # Translate the test-split position into the full-dataset row before
    # looking up the file name; indexing the listing with idx directly would
    # display an unrelated image.
    img_name = valid_image_names[test_row_ids[idx]]
    img_path = os.path.join(IMAGE_DIR, img_name)
    img = cv2.imread(img_path)
    if img is None:
        print(f"Cannot load image {img_path}")
        return None
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR

    caption = generate_sentence_from_keywords(list(pred_labels), pred_probs)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.imshow(img_rgb)
    plt.axis('off')
    plt.title(img_name, fontsize=12, fontweight='bold')

    plt.subplot(1, 2, 2)
    plt.axis('off')
    text = "PREDICTED KEYWORDS:\n"
    for kw, p in zip(pred_labels, pred_probs):
        text += f" • {kw} ({p:.2f})\n"
    text += f"\nGENERATED CAPTION:\n{caption}\n"
    plt.text(0, 0.5, text, fontsize=11, verticalalignment='center',
             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7))
    plt.tight_layout()
    plt.show()

    return pred_labels, caption, true_labels

print("\n" + "="*60)
print("PREDICTIONS ON TEST IMAGES")
print("="*60)

for i in range(min(5, len(X_test))):
    print(f"\nImage {i+1}:")
    predict_and_display_improved(i, X_test_scaled, y_test, y_pred, y_proba)

In [None]:
# ================================================================================
# CELL 7 - IMAGE SEGMENTATION
# ================================================================================

# Relies on the imports and variables from the previous cells.

def segment_image_kmeans(image_path, n_segments=5, show_result=True):
    """Segment an image by clustering its pixel colours with K-means.

    Parameters
    ----------
    image_path : str
        Path of the image file to read with OpenCV.
    n_segments : int, default=5
        Number of colour clusters.
    show_result : bool, default=True
        When true, plot the original image, the colour-quantised image and
        the label map, then print the size of each segment.

    Returns
    -------
    segmented_image : ndarray of uint8, shape (h, w, c), or None
        Image where each pixel is replaced by its cluster centre colour.
    labels_image : ndarray, shape (h, w), or None
        Cluster index of each pixel.
    centers : ndarray, shape (n_segments, 3), or None
        Cluster centres in normalised [0, 1] RGB.

    All three values are ``None`` when the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: cannot load {image_path}")
        return None, None, None

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    h, w, c = image_rgb.shape
    # One row per pixel, colours scaled to [0, 1].
    pixels_normalized = image_rgb.reshape(-1, 3).astype(np.float32) / 255.0

    print(f"Segmentation with K-means (k={n_segments})...")
    kmeans_seg = SegmentationKMeans(
        n_clusters=n_segments,
        random_state=42,
        n_init=10,
        max_iter=300
    )

    labels = kmeans_seg.fit_predict(pixels_normalized)
    centers = kmeans_seg.cluster_centers_

    # Round to the nearest intensity level before the uint8 cast; a bare
    # astype() truncates, so a centre of e.g. 199.99998 would become 199.
    segmented_pixels = np.clip(np.rint(centers[labels] * 255), 0, 255)
    segmented_image = segmented_pixels.reshape(h, w, c).astype(np.uint8)
    labels_image = labels.reshape(h, w)

    if show_result:
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))

        axes[0].imshow(image_rgb)
        axes[0].set_title('Original Image', fontsize=14, fontweight='bold')
        axes[0].axis('off')

        axes[1].imshow(segmented_image)
        axes[1].set_title(f'K-means Segmentation (k={n_segments})', fontsize=14, fontweight='bold')
        axes[1].axis('off')

        im = axes[2].imshow(labels_image, cmap='tab20')
        axes[2].set_title('Segmented Regions', fontsize=14, fontweight='bold')
        axes[2].axis('off')
        plt.colorbar(im, ax=axes[2], fraction=0.046)

        plt.tight_layout()
        plt.show()

        # Per-segment pixel counts and their share of the whole image.
        unique_labels, counts = np.unique(labels, return_counts=True)
        print(f"\nNumber of segments: {len(unique_labels)}")
        print("\nSegment sizes (in pixels):")
        for label, count in zip(unique_labels, counts):
            percentage = (count / len(labels)) * 100
            print(f"  Segment {label}: {count} pixels ({percentage:.2f}%)")

    return segmented_image, labels_image, centers

print("\n" + "="*60)
print("IMAGE SEGMENTATION")
print("="*60)

# Demo run on one fixed Flickr8k image, split into five colour clusters.
sample_image_path = "/kaggle/input/flickr8k/Images/1022454332_6af2c1449a.jpg"
seg_img1, labels1, centers1 = segment_image_kmeans(
    sample_image_path, n_segments=5, show_result=True
)

In [None]:
# ================================================================================
# CELL 8 - BLEU AND CIDER METRICS
# ================================================================================

# Relies on variables from the previous cells:
# y_pred, y_test, mlb, label_names must be defined.

def calculate_bleu_scores(ref, hyp):
    """Compute clipped n-gram precisions (n = 1..4) of ``hyp`` against ``ref``.

    Each hypothesis n-gram is credited at most as many times as it occurs in
    the reference, so repeating a matching n-gram cannot inflate the score.
    No brevity penalty or geometric mean is applied; the four precisions are
    returned separately.

    Parameters
    ----------
    ref : sequence of hashable
        Reference tokens.
    hyp : sequence of hashable
        Hypothesis tokens.

    Returns
    -------
    list
        Four values ``[p1, p2, p3, p4]``; an order with no hypothesis n-gram
        scores 0.
    """
    def ngrams(tokens, n):
        return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

    scores = []
    for n in range(1, 5):
        hyp_counts = Counter(ngrams(hyp, n))
        total = sum(hyp_counts.values())
        if total == 0:
            scores.append(0)
            continue

        ref_counts = Counter(ngrams(ref, n))
        # Clip each hypothesis count by the reference count; Counter returns
        # 0 for n-grams missing from the reference.
        overlap = sum(min(count, ref_counts[gram]) for gram, count in hyp_counts.items())
        scores.append(overlap / total)
    return scores

def calculate_cider_score(ref_kw, hyp_kw):
    """Return 10 times the cosine similarity of two keyword count vectors.

    This is a simplified stand-in for CIDEr: plain term counts, no TF-IDF
    weighting and no n-grams.

    Parameters
    ----------
    ref_kw : sequence of hashable
        Reference keywords.
    hyp_kw : sequence of hashable
        Predicted keywords.

    Returns
    -------
    int or float
        0 when either input is empty, otherwise a value in [0, 10].
    """
    if len(ref_kw) == 0 or len(hyp_kw) == 0:
        return 0

    ref_counter = Counter(ref_kw)
    hyp_counter = Counter(hyp_kw)

    # Only words present on both sides contribute to the dot product.
    shared_words = ref_counter.keys() & hyp_counter.keys()
    dot = sum(ref_counter[word] * hyp_counter[word] for word in shared_words)

    ref_norm = sum(count * count for count in ref_counter.values()) ** 0.5
    hyp_norm = sum(count * count for count in hyp_counter.values()) ** 0.5
    norm = ref_norm * hyp_norm

    if norm > 0:
        return (dot / norm) * 10
    return 0

print("\n" + "="*60)
print("COMPUTING BLEU AND CIDER SCORES")
print("="*60)

bleu1_scores, bleu2_scores, bleu3_scores, bleu4_scores, cider_scores = [], [], [], [], []
bleu_buckets = (bleu1_scores, bleu2_scores, bleu3_scores, bleu4_scores)

# Score every test sample: predicted keyword list against true keyword list.
for i in range(len(X_test)):
    pred_kw = list(mlb.classes_[np.flatnonzero(y_pred[i] == 1)])
    true_kw = list(mlb.classes_[np.flatnonzero(y_test[i] == 1)])

    # calculate_bleu_scores returns the orders 1..4 in that order.
    for bucket, score in zip(bleu_buckets, calculate_bleu_scores(true_kw, pred_kw)):
        bucket.append(score)

    cider_scores.append(calculate_cider_score(true_kw, pred_kw))

score_table = [
    ("BLEU-1", bleu1_scores),
    ("BLEU-2", bleu2_scores),
    ("BLEU-3", bleu3_scores),
    ("BLEU-4", bleu4_scores),
    ("CIDEr", cider_scores),
]

print("\n" + "="*60)
print("BLEU AND CIDER SCORES")
print("="*60)
for metric_name, values in score_table:
    print(f"Average {metric_name}: {np.mean(values):.4f}")

print("\nBLEU Scores Statistics:")
for metric_name, values in score_table:
    # Names are left-aligned to six characters so the columns line up.
    print(f"{metric_name:<6} - Min: {np.min(values):.4f}, Max: {np.max(values):.4f}, Std: {np.std(values):.4f}")

In [None]:
# ================================================================================
# CELL 9 - SAVING THE MODEL AND ARTIFACTS
# ================================================================================

# Relies on variables from the previous cells:
# model, scaler, label_names must be defined.

print("\n" + "="*60)
print("SAVING MODEL AND ARTIFACTS")
print("="*60)

# (display name, destination path, object) for everything needed at inference.
artifacts_to_save = (
    ("Model", "/kaggle/working/gb_model_sift.pkl", model),
    ("Scaler", "/kaggle/working/scaler_sift.pkl", scaler),
    ("Label names", "/kaggle/working/label_names_sift.pkl", label_names),
)

for artifact_name, artifact_path, artifact in artifacts_to_save:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)
    print(f"{artifact_name} saved: {artifact_path}")

print("\n" + "="*60)
print("PROCESS COMPLETED SUCCESSFULLY")
print("="*60)
print("\nAll models and artifacts have been saved to /kaggle/working/")
print("You can now use these files for predictions on new images.")