In [None]:
!pip install --upgrade transformers accelerate

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.2
    Uninstalling transformers-4.57.2:
      Successfully uninstalled transformers-4.57.2
Successfully installed transformers-4.57.3


In [None]:
!pip install transformers



In [29]:
import pandas as pd
import os

def load_and_flatten_tsv(path: str) -> str:
    try:
        df_tsv = pd.read_csv(path, sep="\t")
    except FileNotFoundError:
        return f"[Fichier introuvable: {path}]"

    # Conversion du tableau en texte : chaque ligne devient une phrase
    rows = []
    for _, row in df_tsv.iterrows():
        # Format : "col1: val1 | col2: val2 | ..."
        row_text = " | ".join(f"{col}: {row[col]}" for col in df_tsv.columns)
        rows.append(row_text)

    # Contexte final : concaténation
    return "\n".join(rows)


# --- Chargement du DataFrame principal ---
df = pd.read_csv("subset_labeled.tsv", sep='\t')  # ou ton fichier principal

# Le répertoire racine du dataset (où se trouvent les TSV)
#base_dir = os.path.dirname("train.csv")  # à adapter si nécessaire

# --- Traitement de la colonne "context" ---
flattened_contexts = []

for rel_path in df["context"]:
    tsv_path = rel_path.replace(".csv", ".tsv")
    tsv_path = os.path.abspath(tsv_path)

    text_context = load_and_flatten_tsv(tsv_path)
    flattened_contexts.append(text_context)

df["context"] = flattened_contexts
print("file read\n")
# disply first row

file read



'Ceremony: 2011 Edison Awards | Award: Edison Award | Category: Best Romantic Movie | Name: nan | Outcome: Won\nCeremony: 2011 Vijay Music Awards | Award: Vijay Music Award | Category: Best Folk Song of the Year 2010 | Name: Adida Nayandiya | Outcome: Nominated\nCeremony: 2011 Vijay Music Awards | Award: Vijay Music Award | Category: Popular Melody of the Year 2010 | Name: Idhu Varai | Outcome: Nominated\nCeremony: 2011 Vijay Music Awards | Award: Vijay Music Award | Category: Popular Duet of the Year 2010 | Name: Andrea Jeremiah & Ajeesh for Idhu Varai | Outcome: Won\nCeremony: 2011 Vijay Music Awards | Award: Vijay Music Award | Category: Popular Female Singer of the Year 2010 | Name: Andrea Jeremiah for Idhu Varai | Outcome: Nominated\nCeremony: 2011 Vijay Music Awards | Award: Vijay Music Award | Category: Best Debut Male Playback Singer (Jury) | Name: Ajeesh | Outcome: Nominated\nCeremony: 5th Vijay Awards | Award: Vijay Award | Category: Vijay Award for Best Supporting Actor | Na

In [35]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification
)


df = pd.read_csv("subset_labeled.tsv", sep="\t")
print("Fichier principal chargé :", df.shape)


def load_and_flatten_context(path):
    if pd.isna(path):
        return ""

    path = path.strip()

    if not os.path.exists(path):
        print(f"[WARNING] Chemin introuvable : {path}")
        return ""

    try:
        ctx_df = pd.read_csv(path, sep="\t")
        # On aplatit en "col1: val1 ; col2: val2 ; ..."
        flattened_rows = []
        for _, row in ctx_df.iterrows():
            txt = " ; ".join([f"{col}: {row[col]}" for col in ctx_df.columns])
            flattened_rows.append(txt)

        return " || ".join(flattened_rows)

    except Exception as e:
        print(f"[ERROR] Impossible de lire {path}: {e}")
        return ""

print("Aplatissement des contextes…")
df["context"] = df["context"].str.replace(".csv", ".tsv")
df["flat_context"] = df["context"].apply(load_and_flatten_context)
df["full_text"] = df["utterance"] + " | CONTEXT: " + df["flat_context"]

print("Contexte aplati ajouté au dataframe.\n")



train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

print("Train:", train_df.shape, "Test:", test_df.shape)


class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df["full_text"].tolist()
        self.labels = df["label"].astype("category")
        self.label_ids = self.labels.cat.codes
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = dict(enumerate(self.labels.cat.categories))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.label_ids.iloc[idx], dtype=torch.long)
        return item



tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_ds = TextDataset(train_df, tokenizer)
test_ds = TextDataset(test_df, tokenizer)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=8)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(train_df["label"].unique()),
    problem_type="single_label_classification"
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)



# Entraînement et eval

EPOCHS = 40

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0

    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"[Epoch {epoch+1}] Loss moyen = {epoch_loss/len(train_loader):.4f}")


model.eval()
preds, gold = [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch["labels"]
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        logits = outputs.logits.cpu()

        preds.extend(logits.argmax(dim=1).tolist())
        gold.extend(labels.tolist())

label_names = train_df["label"].astype("category").cat.categories
print("\n=== Classification Report ===")
print(classification_report(gold, preds, target_names=label_names))

model.save_pretrained("semantic_classifier_distilbert")
tokenizer.save_pretrained("semantic_classifier_distilbert")

Fichier principal chargé : (100, 5)
Aplatissement des contextes…
Contexte aplati ajouté au dataframe.

Train: (80, 7) Test: (20, 7)
Device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Epoch 1] Loss moyen = 1.9269
[Epoch 2] Loss moyen = 1.7804
[Epoch 3] Loss moyen = 1.5857
[Epoch 4] Loss moyen = 1.3377
[Epoch 5] Loss moyen = 1.0113
[Epoch 6] Loss moyen = 0.6938
[Epoch 7] Loss moyen = 0.4478
[Epoch 8] Loss moyen = 0.2799
[Epoch 9] Loss moyen = 0.1583
[Epoch 10] Loss moyen = 0.0940
[Epoch 11] Loss moyen = 0.0707
[Epoch 12] Loss moyen = 0.0551
[Epoch 13] Loss moyen = 0.0418
[Epoch 14] Loss moyen = 0.0356
[Epoch 15] Loss moyen = 0.0305
[Epoch 16] Loss moyen = 0.0274
[Epoch 17] Loss moyen = 0.0239
[Epoch 18] Loss moyen = 0.0217
[Epoch 19] Loss moyen = 0.0197
[Epoch 20] Loss moyen = 0.0171
[Epoch 21] Loss moyen = 0.0162
[Epoch 22] Loss moyen = 0.0143
[Epoch 23] Loss moyen = 0.0133
[Epoch 24] Loss moyen = 0.0121
[Epoch 25] Loss moyen = 0.0118
[Epoch 26] Loss moyen = 0.0110
[Epoch 27] Loss moyen = 0.0103
[Epoch 28] Loss moyen = 0.0101
[Epoch 29] Loss moyen = 0.0095
[Epoch 30] Loss moyen = 0.0085
[Epoch 31] Loss moyen = 0.0084
[Epoch 32] Loss moyen = 0.0076
[Epoch 33] Loss m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Modèle sauvegardé dans semantic_classifier_distilbert/
