# Préparation des données

In [None]:
# Mount Google Drive at /content/drive; the later cells read their inputs from,
# and write their models/metrics to, paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Load the image index table. Later cells read its 'id', 'files', 'date_created'
# and 'timel_*_term' columns.
df = pd.read_csv('/content/drive/MyDrive/data_Odil/image_indexing.csv')

In [None]:
def metadata_col(df, metadata_cols):
    """Return a copy of *df* with an added 'metadata' text column.

    For every row, the non-blank string values found in *metadata_cols* are
    stripped, lower-cased and joined with '|' behind a "Métadonnées: " prefix.
    Rows without any usable string value get None. Non-string cells are ignored.
    """
    def format_metadata(row):
        # Keep only cells that are strings with some non-whitespace content.
        cleaned = [
            row[name].strip().lower()
            for name in metadata_cols
            if isinstance(row[name], str) and row[name].strip() != ""
        ]
        if not cleaned:
            return None
        return "Métadonnées: " + "|".join(cleaned)

    # Work on a copy so the caller's dataframe is left untouched.
    result = df.copy()
    result["metadata"] = result.apply(format_metadata, axis=1)
    return result

In [None]:
# Build the 'metadata' text column from the descriptor columns and the date.
df = metadata_col(
    df,
    [
        'timel_nature_place_term',
        'timel_object_architecture_term',
        'timel_character_term',
        'timel_thema_term',
        'date_created',
    ],
)

In [None]:
def df_split_text(text):
    """Lower-case a '|'-separated string and return its pieces as a list.

    A string without '|' comes back as a one-element list. Anything that is
    not a string (for example a missing value) yields None.
    """
    if not isinstance(text, str):
        return None
    # str.split already returns [text] when the separator is absent.
    return text.lower().split('|')

In [None]:
# Column selection
colonnes = ['id','files','date_created','timel_nature_place_term','timel_subject_term','timel_object_architecture_term', 'timel_character_term','timel_thema_term', 'metadata']
df_sujet = df[colonnes].copy()
# Keep only JPEG files. na=False turns a missing 'files' value into False;
# without it the mask contains NaN and pandas refuses to index with it.
# The trailing .copy() makes the column assignment below act on an independent frame.
df_sujet = df_sujet[df_sujet['files'].str.lower().str.endswith('.jpg', na=False)].copy()

# Turn each '|'-separated descriptor string into a list of lower-cased terms.
# Mapping over df_sujet (not the unfiltered df) avoids converting rows that were just dropped.
colonnes_descripteurs = ['timel_nature_place_term','timel_subject_term','timel_object_architecture_term', 'timel_character_term','timel_thema_term']
df_sujet[colonnes_descripteurs] = df_sujet[colonnes_descripteurs].map(df_split_text)


In [None]:
# Nettoyage sujet
def clean_subject_list(subjects, excluded_terms=('initiale ornée', 'psaume', 'psaumes')):
    """Return the usable subject terms from *subjects*.

    Args:
        subjects: list of subject strings; any other type yields ``[]``.
        excluded_terms: lower-case substrings; a term containing any of them
            is dropped. The default matches the previously hard-coded list.

    Returns:
        A new list of cleaned terms, in input order. Non-string items, items
        that are empty after cleaning and excluded items are left out.
    """
    if not isinstance(subjects, list):
        return []

    cleaned = []
    for item in subjects:
        if not isinstance(item, str):
            continue
        # Remove the '?' marker first and trim afterwards: stripping first left
        # "david ?" as "david " (trailing space), i.e. a different label.
        item = item.replace('?', '').strip()
        if not item:  # skip empty strings
            continue
        item_lower = item.lower()
        if any(term in item_lower for term in excluded_terms):
            continue
        cleaned.append(item)
    return cleaned


In [None]:
# Create the label column from the cleaned subject terms
df_sujet['sujet'] = df_sujet['timel_subject_term'].map(clean_subject_list)

# Drop the rows whose subject list is empty after cleaning
df_sujet = df_sujet[df_sujet['sujet'].map(len) > 0]

In [None]:
# Keep only the rows labelled with exactly one subject
filtre_valeurs_uniques = df_sujet['sujet'].map(lambda termes: isinstance(termes, list) and len(termes) == 1)
df_sujet = df_sujet.loc[filtre_valeurs_uniques].copy()
# Unwrap the one-element list so 'sujet' holds the label itself
df_sujet.loc[:, 'sujet'] = [termes[0] for termes in df_sujet['sujet']]

In [None]:
# Balance the data by keeping the subjects that occur at least 4 times
subject_counts = df_sujet['sujet'].value_counts()

filtre_sujets = subject_counts.loc[subject_counts.gt(3)].index

garder = df_sujet['sujet'].isin(filtre_sujets)
df_sujet_filtre = df_sujet.loc[garder].reset_index(drop=True)

In [None]:
# Add captions

import json
# Build a dataframe from the generated captions

# Captions generated with Florence-2-base
#captions_path = '/content/drive/MyDrive/data_Odil/captions_detailed_f-2base.jsonl'

# Captions generated with the fine-tuned Florence-2
captions_path = '/content/drive/MyDrive/data_Odil/captions_finetuned.jsonl'

# Every JSONL line is parsed into a dict and merged into one mapping;
# a key seen on several lines keeps the value from the last one.
captions = {}
with open(captions_path, 'r', encoding='utf-8') as json_file:
    for entry in map(json.loads, json_file):
        captions.update(entry)

df_captions = pd.DataFrame(list(captions.items()), columns=["files", "detailed_caption"])
# Each caption cell is a dict; keep its first value as the caption text
df_captions['detailed_caption'] = df_captions['detailed_caption'].map(lambda contenu: list(contenu.values())[0])

In [None]:
# Attach the captions to the filtered subject table (left join on 'files').
# Rows without a matching caption keep a missing 'detailed_caption'.
df_sujets_captions = pd.merge(
    df_sujet_filtre,
    df_captions[['files', 'detailed_caption']],
    how='left',
    on='files',
)

In [None]:
# Give every subject label an integer class id
from sklearn.preprocessing import LabelEncoder
label_col = 'sujet'
encoder = LabelEncoder()
encoder.fit(df_sujets_captions[label_col])
df_sujets_captions['class'] = encoder.transform(df_sujets_captions[label_col])

## Encodage des métadonnées

### Metadata pour xlm-roberta

In [None]:
#metadata pour Roberta

def caption_metadata(df, caption_col, metadata_col, sep_token="[SEP]"):
    """Return a copy of *df* with a 'captions_metadata' text column.

    Args:
        df: input dataframe (not modified).
        caption_col: name of the column holding the caption text.
        metadata_col: name of the column holding the metadata text.
        sep_token: text placed between caption and metadata.

    Returns:
        A copy of *df* where 'captions_metadata' is
        "<caption> <sep_token> <metadata>" when both parts are present, the
        caption alone when the metadata is missing or empty, and the metadata
        alone when only the caption is missing.

    NOTE(review): "[SEP]" is the BERT-style separator; the XLM-RoBERTa
    tokenizer loaded elsewhere in this file presumably uses "</s>" — confirm
    and pass sep_token accordingly.
    """
    def is_missing(value):
        # None and NaN both count as missing. NaN is truthy, so a plain
        # truthiness test would format it as the literal text "nan".
        return pd.api.types.is_scalar(value) and pd.isna(value)

    def combine(row):
        caption = row[caption_col]
        metadata = row[metadata_col]
        if is_missing(metadata) or not metadata:
            return caption
        if is_missing(caption):
            return metadata
        return f"{caption} {sep_token} {metadata}"

    df = df.copy()
    df["captions_metadata"] = df.apply(combine, axis=1)
    return df

In [None]:
# Add the combined caption + metadata text column used by the XLM-RoBERTa "with metadata" run
df_data = caption_metadata(
    df_sujets_captions,
    caption_col='detailed_caption',
    metadata_col='metadata',
)

### Metadata pour Random Forest

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

def create_metadata_vector(df: pd.DataFrame, descriptor_cols: list) -> "tuple[pd.DataFrame, MultiLabelBinarizer]":
    """Add multi-hot descriptor vectors to a copy of *df*.

    Args:
        df: input dataframe (not modified).
        descriptor_cols: columns whose cells are lists of terms; cells that
            are not lists are ignored.

    Returns:
        A ``(dataframe, binarizer)`` pair. The dataframe is a copy of *df*
        with two extra columns: 'combined_terms' (the row's unique terms,
        sorted) and 'metadata_vec' (a 1-D array with one 0/1 entry per class
        of the fitted ``MultiLabelBinarizer``, which is returned as the
        second element).
    """
    df_copy = df.copy()

    def combine_terms(row):
        terms = set()
        for col in descriptor_cols:
            val = row[col]
            if isinstance(val, list):
                terms.update(val)
        # Sorted so the stored list does not depend on set iteration order.
        return sorted(terms)

    df_copy['combined_terms'] = df_copy.apply(combine_terms, axis=1)

    mlb = MultiLabelBinarizer()
    mlb.fit(df_copy['combined_terms'])

    # Binarize all rows in one call, then store one 1-D row per dataframe row.
    matrix = mlb.transform(df_copy['combined_terms'])
    df_copy['metadata_vec'] = list(matrix)

    return df_copy, mlb

In [None]:
colonnes_metadata = ['timel_nature_place_term',
 'timel_object_architecture_term',
 'timel_character_term',
 'timel_thema_term']
# Build on df_data (the output of caption_metadata), not on df_sujets_captions:
# starting again from df_sujets_captions discarded the 'captions_metadata'
# column that the "with metadata" XLM-RoBERTa cells read.
df_data, mlb_data = create_metadata_vector(df_data, colonnes_metadata)

# Classification

## Forêt aléatoire



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report


# Split
X = np.array(df_data['metadata_vec'].tolist())
y = df_data['class'].values
# NOTE(review): no test_size is passed here, while the XLM-RoBERTa cells use
# test_size=0.2, so the two models are not scored on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)

# Metrics. zero_division=0 matches the other metric calls in this file: classes
# that are never predicted score 0 without emitting a warning per call.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

print(classification_report(y_test, y_pred, zero_division=0))

## XLM RoBERTa

In [None]:
# Notebook install line: pins the transformers release used by the XLM-RoBERTa cells.
pip install transformers==4.49.0

### Avec captions uniquement

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.optim import AdamW

# Stratified 80/20 split on the class id
train_df, val_df = train_test_split(
    df_data,
    test_size=0.2,
    random_state=42,
    stratify=df_data['class'],
)

# Model input for this run: the caption text alone
train_texts, train_labels = train_df['detailed_caption'].tolist(), train_df['class'].tolist()
val_texts, val_labels = val_df['detailed_caption'].tolist(), val_df['class'].tolist()

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')


In [None]:
class CaptionDataset(Dataset):
    """Torch dataset pairing tokenized texts with their integer labels.

    All texts are tokenized once at construction with the module-level
    ``tokenizer`` (truncation and padding enabled). Each item is a dict holding
    one tensor per tokenizer output field plus a 'labels' tensor.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.encodings = tokenizer(texts, truncation=True, padding=True)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Datasets and loaders (only the training loader shuffles)
train_dataset = CaptionDataset(train_texts, train_labels)
val_dataset = CaptionDataset(val_texts, val_labels)

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=2,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    num_workers=2,
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One output unit per distinct class id
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=df_data['class'].nunique(),
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)



In [None]:
# File the training loop writes the model weights to whenever validation F1 improves.
best_model_path = "/content/drive/MyDrive/data_Odil/Models/f2-ftuned_xlm-Roberta-base.pth"
# NOTE(review): best_val_acc is not read by the training loop in this file, which tracks best_val_f1 instead.
best_val_acc = 0.0

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm


epochs = 20

# Per-epoch metrics, one list entry per epoch
history = {
    'train_loss': [],
    'train_f1': [],
    'val_loss': [],
    'val_f1': [],
    'val_precision': [],
    'val_recall': [],
    'val_accuracy': [],
}

# Start below any reachable F1 so the first epoch always writes a checkpoint.
# With 0 and the strict '>' test below, a run whose validation F1 stayed at 0
# never saved anything, and the later load of best_model_path would fail or
# pick up a file left by an earlier run.
best_val_f1 = float('-inf')

# Training

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0
    train_preds = []
    train_labels_all = []

    train_loop = tqdm(train_loader, desc="Training", leave=False)
    for batch in train_loop:
        # Everything except the labels is forwarded to the model as keyword inputs
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss together with the logits
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        train_preds.extend(preds.cpu().numpy())
        train_labels_all.extend(labels.cpu().numpy())

        train_loop.set_postfix(loss=loss.item())

    # Mean of the per-batch losses
    avg_train_loss = total_loss / len(train_loader)
    train_f1 = f1_score(train_labels_all, train_preds, average='macro', zero_division=0)

    history['train_loss'].append(avg_train_loss)
    history['train_f1'].append(train_f1)

    # Validation
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels_all = []

    val_loop = tqdm(val_loader, desc="Validation", leave=False)
    with torch.no_grad():
        for batch in val_loop:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            val_preds.extend(preds.cpu().numpy())
            val_labels_all.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_f1 = f1_score(val_labels_all, val_preds, average='macro', zero_division=0)
    val_precision = precision_score(val_labels_all, val_preds, average='macro', zero_division=0)
    val_recall = recall_score(val_labels_all, val_preds, average='macro', zero_division=0)
    val_acc = accuracy_score(val_labels_all, val_preds)

    history['val_loss'].append(avg_val_loss)
    history['val_f1'].append(val_f1)
    history['val_precision'].append(val_precision)
    history['val_recall'].append(val_recall)
    history['val_accuracy'].append(val_acc)

    print(f"Epoch {epoch+1} | "
          f"Train Loss: {avg_train_loss:.4f} | "
          f"Train F1: {train_f1:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | "
          f"Val F1: {val_f1:.4f} | "
          f"Val Precision: {val_precision:.4f} | "
          f"Val Recall: {val_recall:.4f} | "
          f"Val Acc: {val_acc:.4f}")

    # Checkpoint on a strict improvement of validation macro-F1
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        print(f"  -> Chemin vers nouveau meilleur modèle {best_model_path} (Val F1: {best_val_f1:.4f})")



In [None]:
# Save the per-epoch training history as JSON.
# NOTE(review): the file name says "f2-base" while captions_path and
# best_model_path in this file point to the fine-tuned captions/model — confirm
# this does not overwrite the metrics of a Florence-2-base run.
with open('/content/drive/MyDrive/data_Odil/Metrics/f2-base_xlm-Roberta-base_training.json', 'w') as json_file:
    json.dump(history, json_file, indent=4)  

In [None]:
# Evaluate the best checkpoint on the validation loader
model.load_state_dict(torch.load(best_model_path))
model.eval()

all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        labels = batch['labels'].to(device)
        inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != 'labels'}

        outputs = model(**inputs)
        logits = outputs.logits
        preds = logits.argmax(dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Macro- and micro-averaged F1 / precision / recall, then plain accuracy
f2_roberta_metrics = {
    f"{moyenne}_{nom}": scorer(all_labels, all_preds, average=moyenne, zero_division=0)
    for moyenne in ('macro', 'micro')
    for nom, scorer in (('f1', f1_score), ('precision', precision_score), ('recall', recall_score))
}
f2_roberta_metrics["accuracy"] = accuracy_score(all_labels, all_preds)

In [None]:
# Save the captions-only evaluation metrics as JSON.
# NOTE(review): the file name says "f2-base" while the model evaluated above is
# loaded from the "f2-ftuned" checkpoint path — confirm the intended file name.
with open('/content/drive/MyDrive/data_Odil/Metrics/f2-base_xlm-Roberta-base_evaluation.json', 'w') as json_file:
    json.dump(f2_roberta_metrics, json_file, indent=4)  

### Avec métadonnées

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.optim import AdamW

# Same stratified 80/20 split as the captions-only run
train_df, val_df = train_test_split(
    df_data,
    test_size=0.2,
    random_state=42,
    stratify=df_data['class'],
)

# Model input for this run: caption text followed by the metadata text
train_texts, train_labels = train_df['captions_metadata'].tolist(), train_df['class'].tolist()
val_texts, val_labels = val_df['captions_metadata'].tolist(), val_df['class'].tolist()

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')


In [None]:
class CaptionDataset(Dataset):
    """Torch dataset of tokenized texts and their integer labels.

    The texts are tokenized once in the constructor using the module-level
    ``tokenizer`` with truncation and padding. Indexing returns a dict with one
    tensor per tokenizer output field and a 'labels' tensor.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.encodings = tokenizer(texts, truncation=True, padding=True)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Datasets and loaders (only the training loader shuffles)
train_dataset = CaptionDataset(train_texts, train_labels)
val_dataset = CaptionDataset(val_texts, val_labels)

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=2,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    num_workers=2,
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# A fresh pretrained model for this run, with one output unit per distinct class id
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=df_data['class'].nunique(),
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# File the training loop writes the model weights to whenever validation F1 improves.
best_model_path = "/content/drive/MyDrive/data_Odil/Models/f2-ftuned_Roberta_metadata.pth"
# NOTE(review): best_val_acc is not read by the training loop in this file, which tracks best_val_f1 instead.
best_val_acc = 0.0

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# entraînement

epochs = 20

# Per-epoch metrics, one list entry per epoch
history = {
    'train_loss': [],
    'train_f1': [],
    'val_loss': [],
    'val_f1': [],
    'val_precision': [],
    'val_recall': [],
    'val_accuracy': [],
}

# Start below any reachable F1 so the first epoch always writes a checkpoint.
# With 0 and the strict '>' test below, a run whose validation F1 stayed at 0
# never saved anything, and the later load of best_model_path would fail or
# pick up a file left by an earlier run.
best_val_f1 = float('-inf')

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0
    train_preds = []
    train_labels_all = []

    train_loop = tqdm(train_loader, desc="Training", leave=False)
    for batch in train_loop:
        # Everything except the labels is forwarded to the model as keyword inputs
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss together with the logits
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        train_preds.extend(preds.cpu().numpy())
        train_labels_all.extend(labels.cpu().numpy())

        train_loop.set_postfix(loss=loss.item())

    # Mean of the per-batch losses
    avg_train_loss = total_loss / len(train_loader)
    train_f1 = f1_score(train_labels_all, train_preds, average='macro', zero_division=0)

    history['train_loss'].append(avg_train_loss)
    history['train_f1'].append(train_f1)

    # Validation
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels_all = []

    val_loop = tqdm(val_loader, desc="Validation", leave=False)
    with torch.no_grad():
        for batch in val_loop:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            val_preds.extend(preds.cpu().numpy())
            val_labels_all.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_f1 = f1_score(val_labels_all, val_preds, average='macro', zero_division=0)
    val_precision = precision_score(val_labels_all, val_preds, average='macro', zero_division=0)
    val_recall = recall_score(val_labels_all, val_preds, average='macro', zero_division=0)
    val_acc = accuracy_score(val_labels_all, val_preds)

    history['val_loss'].append(avg_val_loss)
    history['val_f1'].append(val_f1)
    history['val_precision'].append(val_precision)
    history['val_recall'].append(val_recall)
    history['val_accuracy'].append(val_acc)

    print(f"Epoch {epoch+1} | "
          f"Train Loss: {avg_train_loss:.4f} | "
          f"Train F1: {train_f1:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | "
          f"Val F1: {val_f1:.4f} | "
          f"Val Precision: {val_precision:.4f} | "
          f"Val Recall: {val_recall:.4f} | "
          f"Val Acc: {val_acc:.4f}")

    # Checkpoint on a strict improvement of validation macro-F1
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        print(f"  -> Chemin vers nouveau meilleur modèle {best_model_path} (Val F1: {best_val_f1:.4f})")



In [None]:

# Save the per-epoch training history of the "with metadata" run as JSON.
# NOTE(review): the file name says "f2-base" while captions_path and
# best_model_path in this file point to the fine-tuned captions/model — confirm
# the intended file name.
with open('/content/drive/MyDrive/data_Odil/Metrics/f2-base_xlm-Roberta_metadata_training.json', 'w') as json_file:
    json.dump(history, json_file, indent=4) 

In [None]:
# Evaluate the best checkpoint on the validation loader
model.load_state_dict(torch.load(best_model_path))
model.eval()

all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        labels = batch['labels'].to(device)
        inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != 'labels'}

        outputs = model(**inputs)
        logits = outputs.logits
        preds = logits.argmax(dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Macro- and micro-averaged F1 / precision / recall, then plain accuracy
f2base_roberta_metadata_metrics = {
    f"{moyenne}_{nom}": scorer(all_labels, all_preds, average=moyenne, zero_division=0)
    for moyenne in ('macro', 'micro')
    for nom, scorer in (('f1', f1_score), ('precision', precision_score), ('recall', recall_score))
}
f2base_roberta_metadata_metrics["accuracy"] = accuracy_score(all_labels, all_preds)


In [None]:
# Save the evaluation metrics of the "with metadata" run as JSON.
# The previous code dumped f2_roberta_metrics (the captions-only results) into
# this file; the metrics computed for this run are f2base_roberta_metadata_metrics.
with open('/content/drive/MyDrive/data_Odil/Metrics/f2-base_xlm-Roberta_metadata_evaluation.json', 'w') as json_file:
    json.dump(f2base_roberta_metadata_metrics, json_file, indent=4)