# Etude du Dataset SILICONE


In [1]:
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, Trainer, TrainingArguments
from tqdm import tqdm
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset
from tasknet import Adapter
import numpy as np
import evaluate
from sklearn.metrics import accuracy_score

from CustomTrainer import CustomTrainer

# Chargement du Dataset SILICONE

In [2]:
# Load SILICONE dataset in its Dyda_da config (only 4 possible dialog acts)

silicone_dyda = load_dataset('silicone', 'dyda_da')
dyda_train = silicone_dyda['train']
dyda_valid = silicone_dyda['validation']
dyda_test = silicone_dyda['test']

Found cached dataset silicone (C:/Users/robin/.cache/huggingface/datasets/silicone/dyda_da/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
dyda_train

Dataset({
    features: ['Utterance', 'Dialogue_Act', 'Dialogue_ID', 'Label', 'Idx'],
    num_rows: 87170
})

In [4]:
num_labels = 4
labels = ['commissive', 'directive', 'inform', 'question']
id2label = {0: 'commissive',
            1: 'directive', 
            2: 'inform', 
            3: 'question'
}
label2id = {value: key for key, value in id2label.items()}

In [5]:
dyda_train_df = pd.DataFrame.from_dict(dyda_train[:])
dyda_test_df = pd.DataFrame.from_dict(dyda_test[:])
dyda_test_df.head()

Unnamed: 0,Utterance,Dialogue_Act,Dialogue_ID,Label,Idx
0,"hey man , you wanna buy some weed ?",directive,1,1,0
1,some what ?,question,1,3,1
2,"weed ! you know ? pot , ganja , mary jane some...",directive,1,1,2
3,"oh , umm , no thanks .",commissive,1,0,3
4,i also have blow if you prefer to do a few lin...,directive,1,1,4


# Chargement du modèle préentraîné

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'microsoft/deberta-v3-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name, 
    ignore_mismatched_sizes=True,
    num_labels=num_labels, 
    #id2label=id2label, label2id=label2id
).to(device)

classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a Be

# First Preprocessing

In [7]:
def tokenize_function(example):
    return tokenizer(example["Utterance"], truncation=True, max_length=128)

valid_tkz = dyda_valid.map(tokenize_function, batched=True)
train_tkz = dyda_train.map(tokenize_function, batched=True)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/88 [00:00<?, ?ba/s]

In [8]:
train_tkz

Dataset({
    features: ['Utterance', 'Dialogue_Act', 'Dialogue_ID', 'Label', 'Idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 87170
})

In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

samples = train_tkz[:8]
samples = {k: v for k, v in samples.items() if k not in ["Utterance", "Dialogue_Act", "Idx", "Dialogue_ID"]}
[len(x) for x in samples["input_ids"]]

[16, 16, 14, 27, 24, 23, 32, 31]

In [10]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'Label': torch.Size([8]),
 'input_ids': torch.Size([8, 32]),
 'token_type_ids': torch.Size([8, 32]),
 'attention_mask': torch.Size([8, 32])}

In [11]:
train_sample = train_tkz[:200]
train_sample = {k: v for k, v in train_sample.items() if k not in ["Utterance", "Dialogue_Act", "Idx", "Dialogue_ID"]}

valid_sample = valid_tkz[:100]
valid_sample = {k: v for k, v in valid_sample.items() if k not in ["Utterance", "Dialogue_Act", "Idx", "Dialogue_ID"]}

# Preprocessing V2

In [19]:
# build smaller dataset

dyda_train_sample = dyda_train[:1000]
dyda_valid_sample = dyda_valid[:200]
dyda_test_sample = dyda_test[:200]

In [20]:
train_inputs = tokenizer(dyda_train_sample['Utterance'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
valid_inputs = tokenizer(dyda_valid_sample['Utterance'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
test_inputs = tokenizer(dyda_test_sample['Utterance'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')

#train_labels = tokenizer(dyda_train['Dialogue_Act'], padding='max_length', truncation=True, max_length=512, return_tensors='pt')
#valid_labels = tokenizer(dyda_valid['Dialogue_Act'], padding='max_length', truncation=True, max_length=512, return_tensors='pt')

In [21]:
train_inputs

{'input_ids': tensor([[   1,  504,  366,  ...,    0,    0,    0],
        [   1,  274,  391,  ...,    0,    0,    0],
        [   1,  339,  333,  ...,    0,    0,    0],
        ...,
        [   1, 9520,  358,  ...,    0,    0,    0],
        [   1,  278,  521,  ...,    0,    0,    0],
        [   1, 6359,  366,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [22]:
# Create a TensorDataset from the input data and labels
train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_inputs['token_type_ids'], torch.tensor(dyda_train_sample['Label']))
valid_dataset = TensorDataset(valid_inputs['input_ids'], valid_inputs['attention_mask'], valid_inputs['token_type_ids'], torch.tensor(dyda_valid_sample['Label']))
test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_inputs['token_type_ids'], torch.tensor(dyda_test_sample['Label']))

# Create DataLoader objects for the training and validation sets
#train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
#valid_loader = DataLoader(valid_dataset, batch_size=16)

# Training

In [23]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mnli")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [24]:
# define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=50,
    save_total_limit=2,
    save_steps=50
)

# define the trainer object
trainer = CustomTrainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,                     # training arguments, defined above
    train_dataset=train_dataset,            # training dataset
    eval_dataset=valid_dataset,               # evaluation dataset
    #compute_metrics=lambda pred, labels: {"accuracy": accuracy_score(labels, pred.argmax(axis=1))},
    compute_metrics=compute_metrics,
    #data_collator=data_collator,
    #tokenizer=tokenizer,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]),
                                'attention_mask': torch.stack([item[1] for item in data]),
                                'token_type_ids': torch.stack([item[2] for item in data]),
                                'labels': torch.tensor([item[3] for item in data])},
)

# start the training process
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 189
  Number of trainable parameters = 184425220


Labels found
tensor(1.2075, grad_fn=<NllLossBackward0>)


Step,Training Loss,Validation Loss,Accuracy
50,1.0343,1.085936,0.565
100,0.4518,0.760842,0.665
150,0.4289,0.801556,0.66


Labels found
tensor(1.1248, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2208, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2197, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2241, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1884, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1349, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1553, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1643, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1742, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2716, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1728, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1627, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2228, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2157, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2128, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2007, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1487, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1692, grad_fn=<NllLossBack

***** Running Evaluation *****
  Num examples = 200
  Batch size = 64


Labels found
tensor(1.0044)
Labels found
tensor(1.1832)
Labels found
tensor(1.0786)
Labels found
tensor(1.0187)


Saving model checkpoint to ./results\checkpoint-50
Configuration saved in ./results\checkpoint-50\config.json
Model weights saved in ./results\checkpoint-50\pytorch_model.bin
Deleting older checkpoint [results\checkpoint-1200] due to args.save_total_limit


Labels found
tensor(0.8618, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.0041, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.1532, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.0028, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.0012, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.7952, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.8400, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.9843, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.7260, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.8961, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.8270, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.9467, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.6696, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.8248, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.7335, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.0076, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.9922, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.7179, grad_fn=<NllLossBack

***** Running Evaluation *****
  Num examples = 200
  Batch size = 64


Labels found
tensor(0.6781)
Labels found
tensor(0.9090)
Labels found
tensor(0.7284)
Labels found
tensor(0.4964)


Saving model checkpoint to ./results\checkpoint-100
Configuration saved in ./results\checkpoint-100\config.json
Model weights saved in ./results\checkpoint-100\pytorch_model.bin
Deleting older checkpoint [results\checkpoint-1250] due to args.save_total_limit


Labels found
tensor(0.5680, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.5245, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4876, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4085, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4166, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.3103, grad_fn=<NllLossBackward0>)
Labels found
tensor(1.2012, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.3892, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.3816, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4084, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.2083, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4001, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.3152, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4890, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.2406, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.2422, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.2331, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.6227, grad_fn=<NllLossBack

***** Running Evaluation *****
  Num examples = 200
  Batch size = 64


Labels found
tensor(0.8738)
Labels found
tensor(0.8790)
Labels found
tensor(0.6974)
Labels found
tensor(0.4378)


Saving model checkpoint to ./results\checkpoint-150
Configuration saved in ./results\checkpoint-150\config.json
Model weights saved in ./results\checkpoint-150\pytorch_model.bin
Deleting older checkpoint [results\checkpoint-50] due to args.save_total_limit


Labels found
tensor(0.7996, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.2914, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.6523, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.3515, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.1466, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.3344, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.6816, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4684, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.2647, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4515, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.2130, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.1637, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4327, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.4467, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.5302, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.6924, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.1585, grad_fn=<NllLossBackward0>)
Labels found
tensor(0.3812, grad_fn=<NllLossBack



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=189, training_loss=0.6893611547177431, metrics={'train_runtime': 2274.09, 'train_samples_per_second': 1.319, 'train_steps_per_second': 0.083, 'total_flos': 197340374016000.0, 'train_loss': 0.6893611547177431, 'epoch': 3.0})

# Prediction

In [25]:
predictions = trainer.predict(test_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

preds = np.argmax(predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 200
  Batch size = 64


Labels found
tensor(0.4939)


Labels found
tensor(0.7058)
Labels found
tensor(1.0218)
Labels found
tensor(0.2191)
(200, 4) (200,)


In [26]:
accuracy = evaluate.load("accuracy")
accuracy.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.63}

On passe de 0.31 à 0.38 d'accuracy en finetunant sur 200 utterances

# Evaluation

In [27]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 64


Labels found
tensor(0.6135)


Labels found
tensor(0.7651)
Labels found
tensor(0.6277)
Labels found
tensor(0.4445)


{'eval_loss': 0.659791111946106,
 'eval_accuracy': 0.68,
 'eval_runtime': 42.0975,
 'eval_samples_per_second': 4.751,
 'eval_steps_per_second': 0.095,
 'epoch': 3.0}

In [15]:
# Prediction

#model_inputs_test = tokenizer(dyda_test_sample['Utterance'], padding='max_length', truncation=True, max_length=512, return_tensors='pt')
model_inputs_test = test_inputs['input_ids'].tolist()
preds = []
for input in model_inputs_test:
    pred = model(torch.tensor([input]))
    preds.append(pred)
#preds = model(model_inputs_test)

In [16]:
print(preds[:5])

[SequenceClassifierOutput(loss=None, logits=tensor([[-0.1522, -0.1729,  0.0872,  0.0323]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=None, logits=tensor([[-0.2559, -0.1825, -0.1750,  0.0066]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=None, logits=tensor([[-0.3809, -0.1858, -0.0521,  0.0344]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=None, logits=tensor([[-0.3262, -0.0846, -0.0612, -0.0484]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None), SequenceClassifierOutput(loss=None, logits=tensor([[-0.2262, -0.1908,  0.0267, -0.0462]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]


In [1]:
# Evaluation

trainer = CustomTrainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,                     # training arguments, defined above
    train_dataset=train_dataset,            # training dataset
    eval_dataset=valid_dataset,               # evaluation dataset
    #compute_metrics=lambda pred, labels: {"accuracy": accuracy_score(labels, pred.argmax(axis=1))},
    #data_collator=data_collator,
    #tokenizer=tokenizer,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]),
                                'attention_mask': torch.stack([item[1] for item in data]),
                                'token_type_ids': torch.stack([item[2] for item in data]),
                                'labels': torch.tensor([item[3] for item in data])},
)

trainer.predict(test_dataset)

NameError: name 'CustomTrainer' is not defined

In [None]:
# Model explainability (confusion matrix) ?