In [1]:
# Configuration: replace both placeholder strings before running the notebook.
# _local is appended to sys.path in the next cell; Teacher_path is the location
# the teacher model is loaded from.
_local: str = "Path that points to this repository's clone"
Teacher_path: str = "Path that points to the teacher model"

In [2]:
# Setup: make the repository's own packages importable from this notebook
import sys

# Guard the append so that re-running this cell does not keep stacking
# duplicate copies of the same entry onto sys.path.
if _local not in sys.path:
    sys.path.append(_local)

In [None]:
# Load the teacher model from Teacher_path (the placeholder set in the first cell).
# The resulting object is later handed to the distillation model as its `teacher`.
from transformers import CamembertForSequenceClassification
Teacher_model = CamembertForSequenceClassification.from_pretrained(Teacher_path)

In [None]:
# Load the dataset
from dataset.tanda_dataset import TandaDataset

# A dataset can be described as a list of paragraphs, each paragraph being a
# dict formatted like the sample below (the sample is for illustration only).
# NOTE(review): answer_ids presumably index into 'passages' — confirm against TandaDataset.
sample_paragraph = {
    'questions': ['Is this a question?'],
    'passages': [
        'Yes, this is a question.',
        'This passage is not an answer, but it can be used to form a tanda couple.',
    ],
    'answer_ids': [0],
}

# Alternatively, build the dataset from a SQuAD-like file, such as Piaf or FQuAD
Dataset = TandaDataset.from_squad_like('A path to a squad-like file')

In [None]:
# Split the dataset into training, validation and test subsets

# A fixed seed is passed to both splits for reproducibility's sake
Seed: int = 42

# First split: Train will be 60% of the Dataset's total size
Train, ValAndTest = Dataset.split(0.6, Seed)
# Second split: the remainder is halved, so Val and Test will each be 20% of
# the Dataset's total size
Val, Test = ValAndTest.split(0.5, Seed)

In [None]:
# Tokenize the dataset
# Tokenization is always done beforehand

from transformers import CamembertTokenizer

# Load the tokenizer once and share it between the three splits instead of
# reloading the same 'camembert-base' files for every call.
Tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# NOTE(review): the fit() call further down passes max_length=128 while the
# splits are tokenized here with max_length=256 — confirm this is intended.
for split in (Train, Val, Test):
    split.tokenize(Tokenizer, max_length=256)

In [None]:
# Actual distillation model

from models.distilCamembertForSequenceClassification import DistilledCamembertForSequenceClassification

# Softmax temperature handed to the distillation model (look up
# "softmax temperature" for background). A value of 1 is a valid choice;
# the higher the temperature, the lower the models' confidences.
SoftmaxTemperature = 2

DistilledCamembert = DistilledCamembertForSequenceClassification(
    teacher=Teacher_model,
    temperature=SoftmaxTemperature,
)

# Move the model to the GPU.
# NOTE(review): presumably this requires a CUDA device to be available — confirm.
DistilledCamembert.cuda()

In [None]:
# Logger used to log the model's training; it is passed to fit() below

from logger import Logger as LoggerClass

# For more information on the available metrics, look up the parse function in metrics.__init__
Logger = LoggerClass(metric_descriptions=['train loss', 'train accuracy', 'train f1'])

In [None]:
# Fitting method: trains the distilled model with the objects built in the cells above

import torch

DistilledCamembert.fit(
    # Classic torch optimizer
    optimizer=torch.optim.AdamW(params=DistilledCamembert.parameters(), lr=1e-5),
    # Logger
    logger=Logger,
    # Number of epochs
    epochs=10,
    # Training dataset
    training_dataset=Train,
    # Validation dataset (optional)
    validation_dataset=Val,
    # Forces the datasets to yield 50% of couples labeled 0 and 50% labeled 1
    force_balance=True,
    # Batch size
    batch_size=8,
    # Frequency at which to do backpropagation; if =2 then there is backpropagation every 2 batches
    backpropagation_frequency=1,
    # Whether or not to run a validation before the first epoch
    pre_training_validation=False,
    # Maximum length of each input
    # NOTE(review): the splits were tokenized with max_length=256 — confirm 128 is intended here
    max_length=128,
    # Display progress at the end of each epoch or not
    verbose=True,
    # You can set a new temperature if it hasn't been done yet.
    # Reuse SoftmaxTemperature (defined with the model) rather than repeating
    # the literal, so the two values cannot drift apart.
    temperature=SoftmaxTemperature,
)