In [1]:
# Set to True only for the first run: it enables the cell guarded by
# `if FIRST_RUN:`, which rebuilds the sampled files subset_training.tsv and
# subset_testing.tsv from the raw dataset. Leave it False to skip that work.
FIRST_RUN = False

In [2]:
import os
import pandas as pd
from tqdm import tqdm
import random

def _table_to_text(csv_path):
    """Serialise one table CSV into the flat text form used for annotation.

    The header and every data row are rendered with cells joined by " @ ";
    the header line ends with "\n", each data line ends with " \n", and the
    lines are joined with a single space.

    Raises whatever ``pd.read_csv`` raises (missing file, empty file, ...).
    """
    table_df = pd.read_csv(csv_path, dtype=str, engine="python",
                           on_bad_lines="skip").fillna("")

    table_lines = [" @ ".join(table_df.columns) + "\n"]
    # itertuples(index=False, name=None) yields plain tuples of the cell
    # strings, in column order, without building a Series per row.
    table_lines.extend(
        " @ ".join(cells) + " \n"
        for cells in table_df.itertuples(index=False, name=None)
    )
    return " ".join(table_lines)


def build_texts_from_tsv_list(tsv_paths, base_dir="../../WikiTableQuestions/WikiTableQuestions"):
    """Build one annotation record per question listed in the given TSV files.

    Parameters
    ----------
    tsv_paths : iterable of str
        File names resolved against ``<base_dir>/data``. Each file must provide
        the columns ``id``, ``utterance``, ``context`` and ``targetValue``.
    base_dir : str
        Dataset root. The ``context`` column of every question is a table path
        relative to this directory.

    Returns
    -------
    list of dict
        One dict per question whose table could be loaded, with the keys
        ``id``, ``utterance``, ``context``, ``targetValue`` (raw TSV values)
        and ``table`` (the serialised table text).

    Questions whose table cannot be loaded are reported on stdout and skipped.
    """
    all_rows = []
    # Several questions can point at the same table, so each CSV is parsed and
    # serialised only once per call. Only successes are cached: a failing table
    # is retried (and reported) for every question that references it.
    table_text_by_path = {}

    for tsv_path in tsv_paths:
        df = pd.read_csv(
            os.path.join(base_dir, "data", tsv_path),
            sep="\t",
            dtype=str,
            engine="python",
            on_bad_lines="skip"
        ).fillna("")

        for _, row in tqdm(df.iterrows(), total=len(df),
                           desc=f"Construction des textes ({tsv_path})"):

            utterance = row["utterance"]
            context_path = row["context"].strip()
            full_path = os.path.join(base_dir, context_path)

            try:
                if full_path not in table_text_by_path:
                    table_text_by_path[full_path] = _table_to_text(full_path)
                full_text = table_text_by_path[full_path]

                # Keep the raw TSV columns alongside the table text so the
                # records can be annotated later.
                all_rows.append({
                    "id": row["id"],
                    "utterance": utterance,
                    "context": row["context"],
                    "targetValue": row["targetValue"],
                    "table": full_text
                })

            except Exception as e:
                # Best effort: report the problem and move on to the next question.
                print(f"Erreur lors du chargement de {full_path}: {e}")
                continue

    print(f"{len(all_rows)} exemples valides construits.")
    return all_rows


In [None]:
if FIRST_RUN:
    # Draw fixed-size random subsets of the questions and write them out as
    # TSV files. A dedicated, seeded generator makes the draw reproducible, so
    # the files can be regenerated identically after they have been labelled.
    SUBSET_SIZE = 200
    sampler = random.Random(42)

    train_rows = build_texts_from_tsv_list(["training.tsv"])
    # min(...) avoids the ValueError random.sample raises when fewer than
    # SUBSET_SIZE rows could be built.
    subset_train = sampler.sample(train_rows, min(SUBSET_SIZE, len(train_rows)))
    df_train = pd.DataFrame(subset_train)
    df_train.to_csv("subset_training.tsv", sep="\t", index=False)

    # NOTE(review): the random-split-*-dev files look like they are carved out
    # of the training data; if so this pool overlaps training.tsv — confirm
    # before treating subset_testing.tsv as held-out data.
    test_rows = build_texts_from_tsv_list([
        "random-split-1-dev.tsv", "random-split-2-dev.tsv",
        "random-split-3-dev.tsv", "random-split-4-dev.tsv",
        "random-split-5-dev.tsv"
    ])
    # The same question id may occur in several dev splits; keep one record
    # per id so the sample cannot contain the same question twice.
    test_rows = list({record["id"]: record for record in test_rows}.values())
    subset_test = sampler.sample(test_rows, min(SUBSET_SIZE, len(test_rows)))
    df_test = pd.DataFrame(subset_test)
    df_test.to_csv("subset_testing.tsv", sep="\t", index=False)

Construction des textes (training.tsv): 100%|██████████| 14111/14111 [02:36<00:00, 89.89it/s] 


14111 exemples valides construits.


Construction des textes (random-split-1-dev.tsv): 100%|██████████| 2806/2806 [00:31<00:00, 88.61it/s] 
Construction des textes (random-split-2-dev.tsv): 100%|██████████| 2833/2833 [00:32<00:00, 86.65it/s] 
Construction des textes (random-split-3-dev.tsv): 100%|██████████| 2830/2830 [00:33<00:00, 84.06it/s] 
Construction des textes (random-split-4-dev.tsv): 100%|██████████| 2820/2820 [00:32<00:00, 87.96it/s] 
Construction des textes (random-split-5-dev.tsv): 100%|██████████| 2825/2825 [00:31<00:00, 88.44it/s] 

14114 exemples valides construits.





In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Baseline classifier: TF-IDF n-grams + logistic regression on the labelled subset.
df = pd.read_csv("subset_labeled.tsv", sep="\t")

# Stratified 80/20 split so every label keeps the same proportion in both parts.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

# The model input is the question text followed by the "context" column.
# NOTE(review): "context" presumably holds the table path rather than table
# content — confirm that it is meant to be a feature.
X_train = train_df["utterance"]+" - "+train_df["context"]
y_train = train_df["label"]

X_test = test_df["utterance"]+" - "+test_df["context"]
y_test = test_df["label"]

# Word bigrams and trigrams only; unigrams are excluded by ngram_range=(2,3).
# NOTE(review): with this few training rows, confirm that dropping unigrams is intended.
tfidf = TfidfVectorizer(stop_words=None, ngram_range=(2,3))
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

clf = LogisticRegression(max_iter=3000)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)
# zero_division=0 reports 0.00 for labels that are never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred, digits=2, zero_division=0))
print(confusion_matrix(y_test, y_pred))
# The vectoriser and classifier are persisted together as a (tfidf, clf) tuple.
joblib.dump((tfidf, clf), "semantic_classifier_tfidf.joblib")


test size: (20, 1635)
              precision    recall  f1-score   support

         AGG       0.21      0.60      0.32         5
        ARTH       0.00      0.00      0.00         3
        COMP       0.00      0.00      0.00         3
      LOOKUP       0.00      0.00      0.00         4
        Next       0.00      0.00      0.00         1
       SUPER       1.00      0.25      0.40         4

    accuracy                           0.20        20
   macro avg       0.20      0.14      0.12        20
weighted avg       0.25      0.20      0.16        20

[[3 0 0 2 0 0]
 [3 0 0 0 0 0]
 [2 0 0 1 0 0]
 [3 1 0 0 0 0]
 [1 0 0 0 0 0]
 [2 0 0 1 0 1]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


['semantic_classifier_tfidf.joblib']

In [14]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW

class TextDataset(Dataset):
    """Torch dataset that tokenises "<utterance> - <context>" and encodes labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``utterance``, ``context`` and ``label``.
    tokenizer : callable
        Called as ``tokenizer(text, truncation=True, padding="max_length",
        max_length=max_len, return_tensors="pt")``; must return a mapping of
        tensors with a leading batch dimension of size 1.
    max_len : int
        Length every example is padded or truncated to.
    label_categories : sequence, optional
        Ordered label vocabulary. When given, label ids are positions in this
        sequence, so several datasets (e.g. train and test) share one encoding
        even if one of them lacks a class. When omitted, the vocabulary is
        derived from ``df["label"]`` alone.

    Raises
    ------
    ValueError
        If ``label_categories`` is given and ``df`` has a label outside it.
    """

    def __init__(self, df, tokenizer, max_len=256, label_categories=None):
        self.texts = (df["utterance"]+" - "+df["context"]).tolist()

        if label_categories is None:
            self.labels = df["label"].astype("category")
        else:
            categories = list(label_categories)
            unknown = set(df["label"].dropna().unique()) - set(categories)
            if unknown:
                raise ValueError(
                    f"labels not present in label_categories: {sorted(map(str, unknown))}"
                )
            self.labels = df["label"].astype(pd.CategoricalDtype(categories=categories))

        self.label_ids = self.labels.cat.codes
        self.tokenizer = tokenizer
        self.max_len = max_len
        # id -> label name, for decoding predictions.
        self.label_map = dict(enumerate(self.labels.cat.categories))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Drop the batch dimension the tokenizer adds; the DataLoader re-batches.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        # Category codes are small integers (int8 for few classes); build the
        # label explicitly as int64, the dtype classification losses expect.
        item["labels"] = torch.tensor(int(self.label_ids.iloc[idx]), dtype=torch.long)
        return item


# ---------------------------------------------------------
# 1) Load the single hand-labelled file (about 100 examples)
df = pd.read_csv("subset_labeled.tsv", sep="\t")

# Fix the label vocabulary once, on the full frame. Both splits then carry the
# same categorical dtype, so each dataset derives identical label ids even if a
# class happens to be missing from one split.
df = df.assign(label=df["label"].astype("category"))
label_names = [str(name) for name in df["label"].cat.categories]

# Seed torch so the classifier-head initialisation and batch shuffling repeat.
torch.manual_seed(42)

# 2) Internal train/test split (20% test)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42,
                                     stratify=df["label"])

# ---------------------------------------------------------
# 3) Data preparation
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_ds = TextDataset(train_df, tokenizer)
test_ds = TextDataset(test_df, tokenizer)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=8)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------------------------------------
# 4) Model
# id2label / label2id are stored in the model config so the saved checkpoint
# reports the real label names instead of generic LABEL_<n> placeholders.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
).to(device)

# torch.optim.AdamW replaces the AdamW re-exported by transformers, which is
# deprecated in favour of the PyTorch implementation.
# NOTE(review): eps=1e-6 and weight_decay=0.0 are meant to mirror the previous
# optimiser's defaults — confirm against the installed transformers version.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# ---------------------------------------------------------
# 5) Training
for epoch in range(5):  # 5 epochs, chosen for this very small dataset
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # The loss needs int64 class ids; this is a no-op if they already are.
        batch["labels"] = batch["labels"].long()
        outputs = model(**batch)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean loss over the epoch rather than the last batch only.
    print(f"[Epoch {epoch+1}] Loss = {running_loss / max(len(train_loader), 1):.4f}")

# ---------------------------------------------------------
# 6) Evaluation
model.eval()
preds, gold = [], []

with torch.no_grad():
    for batch in test_loader:
        gold.extend(batch["labels"].tolist())
        # Labels are not needed for inference; pass the model inputs only.
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        logits = model(**inputs).logits.cpu()
        preds.extend(logits.argmax(dim=1).tolist())

# labels= pins the report to the full vocabulary so target_names always lines
# up, even when a class is absent from both the gold and predicted ids.
print(classification_report(gold, preds,
                            labels=list(range(len(label_names))),
                            target_names=label_names,
                            zero_division=0))

# ---------------------------------------------------------
# 7) Save model and tokenizer side by side
model.save_pretrained("semantic_classifier_distilbert")
tokenizer.save_pretrained("semantic_classifier_distilbert")


ImportError: DLL load failed while importing _C: Le module spécifié est introuvable.