This notebook is meant to be run on Google Colab.
The Trainer parameters are set for an A100 GPU.

In [None]:
!pip install -q transformers datasets accelerate

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import csv
from functools import partial

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Switch Weights & Biases off through its environment variables.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "WANDB_MODE": "offline",
        "WANDB_PROJECT": "disabled",
    }
)

# Pick the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [None]:
from pathlib import Path
from google.colab import drive

# Mount Google Drive; data and training artifacts are read from / written to it.
drive.mount('/content/drive')

# Folder layout of the project on the mounted Drive.
DRIVE_ROOT = Path('/content/drive/MyDrive')
PROJECT_DIR = DRIVE_ROOT.joinpath('deberta_training')
DATA_DIR = PROJECT_DIR.joinpath('data')
ARTIFACTS_DIR = PROJECT_DIR.joinpath('artifacts')
CHECKPOINT_DIR = ARTIFACTS_DIR.joinpath('checkpoints')
BEST_MODEL_DIR = ARTIFACTS_DIR.joinpath('best_model')
PREDICTIONS_DIR = ARTIFACTS_DIR.joinpath('predictions')
LOGS_DIR = ARTIFACTS_DIR.joinpath('logs')

# Create every folder up front; folders that already exist are left as they are.
for directory in (
    PROJECT_DIR,
    DATA_DIR,
    ARTIFACTS_DIR,
    CHECKPOINT_DIR,
    BEST_MODEL_DIR,
    PREDICTIONS_DIR,
    LOGS_DIR,
):
    directory.mkdir(parents=True, exist_ok=True)

print('Location on Drive', PROJECT_DIR)
print('Place the files tweets_train.csv and tweets_val.csv in:', DATA_DIR)

# Load Datasets

In [None]:
# Locations of the training and validation CSV files inside DATA_DIR.
train_path, val_path = (
    DATA_DIR / file_name for file_name in ('tweets_train.csv', 'tweets_val.csv')
)

# Read both splits into pandas DataFrames.
train_df, val_df = pd.read_csv(train_path), pd.read_csv(val_path)

# Only the "text" and "label" columns are handed over to the Hugging Face datasets.
_KEPT_COLUMNS = ["text", "label"]
train_set = Dataset.from_pandas(train_df[_KEPT_COLUMNS])
val_set = Dataset.from_pandas(val_df[_KEPT_COLUMNS])

# Tokenize

In [None]:
from datasets import load_from_disk

# On-disk locations of the cached tokenized splits.
TOKENIZED_TRAIN_DIR = DATA_DIR / "train_tokenized"
TOKENIZED_VAL_DIR   = DATA_DIR / "val_tokenized"

max_length = 128

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large", use_fast=True)

# Tokenizer call with truncation to `max_length` tokens already bound.
tokenize_fct = partial(tokenizer, truncation=True, max_length=max_length)


def _tokenize_split(split):
    """Tokenize the "text" column of `split`, drop that column and switch to torch format."""
    tokenized = split.map(lambda batch: tokenize_fct(batch["text"]), batched=True)
    tokenized = tokenized.remove_columns(["text"])
    tokenized.set_format("torch")
    return tokenized


_cache_available = TOKENIZED_TRAIN_DIR.exists() and TOKENIZED_VAL_DIR.exists()

if not _cache_available:
    # No cached tokenization found: tokenize both splits and store them for later runs.
    print("Tokenizing datasets for DeBERTa...\n")

    train_tokenized = _tokenize_split(train_set)
    val_tokenized = _tokenize_split(val_set)

    train_tokenized.save_to_disk(TOKENIZED_TRAIN_DIR)
    val_tokenized.save_to_disk(TOKENIZED_VAL_DIR)
else:
    # Reuse the tokenized datasets saved by a previous run to save time.
    train_tokenized = load_from_disk(TOKENIZED_TRAIN_DIR)
    val_tokenized = load_from_disk(TOKENIZED_VAL_DIR)

train_tokenized, val_tokenized

# Metrics Functions


In [None]:
def accuracy_fn(preds, labels):
    """Return the fraction of predictions that equal their label.

    Both arguments are converted to NumPy arrays first, so plain Python
    sequences are compared element by element (comparing two lists with
    `==` would otherwise yield a single bool).

    Args:
        preds: Predicted class ids (array-like).
        labels: Reference class ids (array-like, same shape as `preds`).

    Returns:
        The mean of the element-wise comparison `labels == preds`.
    """
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    return np.mean(labels == preds)

def f1_score(preds, labels):
    """Compute the F1 score of the positive class (label value 1).

    Both arguments are converted to NumPy arrays first, so plain Python
    sequences are handled element by element.

    Args:
        preds: Predicted class ids (array-like).
        labels: Reference class ids (array-like, same shape as `preds`).

    Returns:
        2 * precision * recall / (precision + recall), or 0.0 when there are
        no predicted positives, no actual positives, or precision + recall
        is zero.
    """
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Confusion counts for the positive class; the true positives are shared
    # by precision and recall, so they are computed once.
    tp = np.sum((labels == 1) & (preds == 1))
    fp = np.sum((labels != 1) & (preds == 1))
    fn = np.sum((labels == 1) & (preds != 1))

    # Guard the two divisions below against an empty denominator.
    if tp + fp == 0 or tp + fn == 0:
        return 0.0

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

# Class name -> class id mapping, and the inverse lookup derived from it.
label2id = dict(NEG=0, POS=1)
id2label = dict((class_id, class_name) for class_name, class_id in label2id.items())

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the metrics reported during evaluation.

    Args:
        eval_pred: Two-element iterable holding the logits and the labels.

    Returns:
        Dict with the keys "accuracy" and "f1", computed on the arg-max
        (last axis) of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_fn(predicted, gold)
    metrics["f1"] = f1_score(predicted, gold)
    return metrics


# DeBERTa Training

In [None]:
from datasets import load_from_disk
from transformers import DataCollatorWithPadding

# Pads every batch on the fly; padded lengths are rounded up to a multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

# Allow TF32 for CUDA matmuls and for cuDNN.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# Read the tokenized splits back from their on-disk copies.
train_tokenized, val_tokenized = (
    load_from_disk(DATA_DIR / folder_name)
    for folder_name in ("train_tokenized", "val_tokenized")
)

# Two-class sequence classifier on top of DeBERTa-v3-large, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    'microsoft/deberta-v3-large',
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)
model.config.problem_type = "single_label_classification"

# The Trainer receives the checkpoint folder as a plain string.
output_dir = str(CHECKPOINT_DIR)

training_args = TrainingArguments(
    output_dir=output_dir,

    # Step-based schedule: evaluate every 10k steps, checkpoint every 20k, log every 200.
    eval_strategy='steps',
    eval_steps=10000,
    save_strategy='steps',
    save_steps=20000,
    logging_steps=200,
    report_to='none',
    disable_tqdm=True,

    # Model selection: keep the checkpoint with the highest validation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,

    # Optimisation hyper-parameters.
    optim="adamw_torch_fused",
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    group_by_length=False,

    # Numeric precision (the notebook header states the settings target an A100).
    bf16=True,
    tf32=True,

    # Data loading.
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)

import inspect

# Newer transformers releases deprecate `Trainer(tokenizer=...)` in favour of
# `processing_class=...`. The install cell does not pin a version, so pick the
# keyword that the installed Trainer actually declares.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Bundle model, arguments, data, collator and metrics into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

from transformers.trainer_utils import get_last_checkpoint

print("Starting DeBERTa training...\n")

# Only resume when the folder really holds a `checkpoint-*` directory. A folder
# that merely contains other files would make `resume_from_checkpoint=True`
# fail because no valid checkpoint can be found.
last_checkpoint = get_last_checkpoint(str(CHECKPOINT_DIR))
if last_checkpoint is not None:
    print("Use last checkpoint to restart")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Start from anew")
    trainer.train()


# Save best model


In [None]:
from google.colab import files

import shutil
import tempfile

# Save the trainer's model and the tokenizer on the Drive
best_model_dir = BEST_MODEL_DIR
best_model_dir.mkdir(parents=True, exist_ok=True)

trainer.save_model(str(best_model_dir))
tokenizer.save_pretrained(str(best_model_dir))

# `files.download` works on a single file, not on a folder, so pack the model
# folder into a zip archive first. The archive goes to the local temp folder,
# which keeps a second copy of the model off the Drive.
archive_base = os.path.join(tempfile.gettempdir(), "best_model")
archive_path = shutil.make_archive(archive_base, "zip", root_dir=str(best_model_dir))

# Download the zipped model
files.download(archive_path)

# Prediction

In [None]:
test_path = DATA_DIR / "test_data.txt"

# One tweet per line; lines that are empty or whitespace-only are skipped and
# the trailing newline of every kept line is removed.
tweets = []
with open(test_path, "r", encoding="utf-8") as handle:
    for raw_line in handle:
        if raw_line.strip() == "":
            continue
        tweets.append(raw_line.rstrip("\n"))

# Ids run from 1 to the number of kept tweets, in file order.
indices = np.arange(1, len(tweets) + 1)
test_dataset = Dataset.from_dict({"text": tweets})

In [None]:
def _tokenize_test_batch(batch):
    """Run the shared tokenizer call on the "text" column of one batch."""
    return tokenize_fct(batch["text"])


# Same preparation as for the training data: tokenize, drop the raw text, use torch format.
test_tokenized = test_dataset.map(_tokenize_test_batch, batched=True).remove_columns(["text"])
test_tokenized.set_format("torch")

In [None]:
# Put the model in evaluation mode on the selected device.
model.eval()
model.to(device)

# Iterate over the test set in its original order, padding each batch with the collator.
dataloader = DataLoader(
    test_tokenized,
    collate_fn=data_collator,
    batch_size=32,
    shuffle=False,
)

all_preds = []
with torch.no_grad():
    for features in dataloader:
        # Move every tensor of the batch to the device before the forward pass.
        on_device = {name: tensor.to(device) for name, tensor in features.items()}
        batch_logits = model(**on_device).logits
        # The predicted class is the index of the largest logit.
        batch_preds = torch.argmax(batch_logits, dim=-1)
        all_preds.append(batch_preds.cpu().numpy())

# Join the per-batch predictions into one array.
all_preds = np.concatenate(all_preds, axis=0)

In [None]:
# Map the class ids {0, 1} onto the submission labels {-1, 1}.
y_pred = 2 * all_preds - 1

# Refuse to write a file that contains anything other than -1 / 1.
if not np.isin(y_pred, (-1, 1)).all():
    raise ValueError("y_pred can only contain values -1, 1")

submission = pd.DataFrame({'Id': indices, 'Prediction': y_pred})
csv_name = PREDICTIONS_DIR / 'deberta_submission.csv'
fieldnames = ["Id", "Prediction"]

# Write a header row followed by one "Id,Prediction" row per tweet, as plain ints.
with open(csv_name, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        {"Id": int(tweet_id), "Prediction": int(label)}
        for tweet_id, label in zip(indices, y_pred)
    )

In [None]:
from google.colab import files

# Download the submission CSV (the `csv_name` path inside PREDICTIONS_DIR)
# through Colab's `files` helper; the path is passed as a plain string.
files.download(str(csv_name))