In [32]:
# Load all the necessary libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import AutoTokenizer, FlaubertForSequenceClassification
from sklearn import metrics

In [6]:
# Function to open data
def open_data(name):
    """Read a CSV file into a DataFrame, using its first column as the index.

    Args:
        name: Path (or any other source accepted by ``pd.read_csv``) of the
            CSV file to load.

    Returns:
        The parsed ``pandas.DataFrame``; column 0 of the file becomes the index.
    """
    frame = pd.read_csv(name, index_col=0)
    return frame

In [9]:
# Read the training set into a DataFrame.
# The string below is a placeholder: replace it with the path to your training CSV.
train_data = open_data("Path to you train_set")

In [13]:
# Define pretrained tokenizer and model
# Both are loaded from the same checkpoint name so they stay in sync.

model_name = 'flaubert/flaubert_base_cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=11 sizes the sequence-classification head for 11 classes.
model = FlaubertForSequenceClassification.from_pretrained(model_name, num_labels=11)

Some weights of the model checkpoint at flaubert/flaubert_base_cased were not used when initializing FlaubertForSequenceClassification: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing FlaubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_base_cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [None]:
# Preprocess data
# Pull the raw sentences and their labels out of the training frame.
X = list(train_data["Phrases"])
y = list(train_data["Labels"])
# Hold out 20% for validation. random_state is pinned so the split (and
# therefore every validation metric) is reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
# Tokenize each split; sequences are padded to the longest one in the split
# and truncated at 512 tokens.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [14]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from field name (it must contain ``"input_ids"``)
            to a per-example sequence, as returned by the tokenizer.
        labels: Optional per-example labels (list, numpy array, ...). When
            omitted, items carry only the encoding fields.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``"labels"`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test against None explicitly instead of relying on truthiness: a
        # numpy array of labels raises "truth value of an array is ambiguous"
        # under a bare `if self.labels:`. The length check keeps the previous
        # behaviour for an empty label list (no "labels" key).
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` field."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized train/validation splits together with their labels.
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [None]:
# Sanity check: display the class of the training dataset object.
type(train_dataset)

In [22]:
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(pred, labels=None):
    """Compute accuracy and weighted precision/recall/F1 from raw class scores.

    Supports two call styles:

    * ``compute_metrics(eval_pred)`` -- the form used by ``Trainer``, which
      passes one object bundling the predictions and the label ids.
    * ``compute_metrics(scores, labels)`` -- explicit scores and gold labels.

    Args:
        pred: Either a 2-D array of per-class scores (one row per example), or
            a bundle exposing ``predictions``/``label_ids`` (or unpacking to
            ``(scores, labels)``) when ``labels`` is omitted.
        labels: Gold class ids. Leave as ``None`` when ``pred`` is a bundle.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"``.
    """
    if labels is None:
        # Trainer invokes the callback with a single argument; without this
        # branch evaluation fails with a missing-positional-argument TypeError.
        if hasattr(pred, "predictions"):
            pred, labels = pred.predictions, pred.label_ids
        else:
            pred, labels = pred

    # Turn per-class scores into predicted class ids.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred,  average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Define Trainer
# Training configuration: evaluate every 500 steps, checkpoint every 1000
# steps, batches of 8 per device, and an upper bound of 100 epochs.
args = TrainingArguments(
    output_dir="output",  # checkpoints are written under this directory
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,  # upper bound; the early-stopping callback below can end training sooner
    save_steps=1000,
    seed=0,
    load_best_model_at_end=True,
)
# The Trainer ties together the model, the train/validation datasets, the
# metric function, and an early-stopping callback with a patience of 5.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)


In [None]:
# Fine-tune the pre-trained model with the Trainer configured above.
trainer.train()

In [11]:
# Load your test data.
# The string below is a placeholder: replace it with the path to your test CSV.
test_data = open_data("Path to your test_set")

In [23]:
# Display the test frame to check its columns (text-fr, intent-fr, labels).
test_data

Unnamed: 0,text-fr,intent-fr,labels
423,pouvez-vous me procurer une table pour 5 à joh...,réservation_de_restaurant,0
10707,obtenez-moi une chambre d’hôtel assez grande p...,réserver_un_hôtel,9
10696,J’ai besoin d’obtenir une chambre d’hôtel asse...,réserver_un_hôtel,9
7079,quand mon vol atterrira-t-il,état_du_vol,5
2309,Je vais bientôt en Allemagne ai-je besoin d’un...,visa_international,4
...,...,...,...
5166,"quels hôtels sont à proximité, ny, ny pour le ...",réserver_un_hôtel,9
1671,annuler la réservation du dîner de ce soir,annuler_la_réservation,2
14504,J’ai besoin d’un vol de denver international à...,réserver_un_vol,7
14835,puis-je louer une voiture de l’aéroport de dfw...,location_de_voiture,8


In [24]:
# ----- 3. Predict -----#
# Load test data
# Sentences come from the "text-fr" column and gold class ids from "labels".
X_test = list(test_data["text-fr"])
y_test = list(test_data["labels"])
# Same tokenizer settings as for the training data: pad to the longest
# sequence and truncate at 512 tokens.
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

In [25]:
# Create torch dataset
# No labels are passed, so each item holds only the tokenizer fields.
test_dataset = Dataset(X_test_tokenized)

In [26]:
# Load the trained model from a saved checkpoint directory.
# NOTE(review): "checkpoint-1000" is a fixed training step. With
# load_best_model_at_end=True and early stopping, the best checkpoint may be a
# different step (see trainer.state.best_model_checkpoint) -- confirm before
# reporting results.
model_path = "output/checkpoint-1000"
model = FlaubertForSequenceClassification.from_pretrained(model_path, num_labels=11)

In [27]:
# Define test trainer
# Only the model is passed, so every other Trainer setting is left at its default.
test_trainer = Trainer(model)

In [28]:
# Make prediction
# predict() returns a 3-tuple; only the first element (the raw predictions) is
# kept, the other two are discarded.
raw_pred, _, _ = test_trainer.predict(test_dataset)

In [31]:
# Get metrics
# Score the raw predictions against the gold test labels; the cell displays the
# returned dict (accuracy, precision, recall, f1).
compute_metrics(raw_pred, y_test)