# Projet PLDAC
- Fichier pour exécuter les modèles de langages de Hugging Face à fine-tuner
- A exécuter sur Google Colab de préférence

## Imports and Functions

In [None]:
# Exécuter ce code, puis redémarrer la session

%pip install accelerate -U

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os

# Datasets and functions
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Pré-modèles
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# -------------------------------------------
# TRANSFORMATION DATAFRAME

# Transformation de la valeur en texte
def text_type_select(value, feature, select=0) :
    """ Retourne un type de texte selon la sélection
    0. <value> <feature>
    1. <value> <unit> of <name> (avec feature = (<name>, <unit>))

    Args:
        - `value` (float): la valeur du feature
        - `feature` (str): le nom de feature
        - `select` (int, optional): Type de texte à sélectionner. Defaults to 0.

    Returns:
        str: Texte retourné
    """
    if select == 0 :
        return str(value) + " " + str(feature)
    if select == 1 :
        name, unit = feature
        return str(value) + " " + str(unit) + " of " + str(name)

def row_text_type_select(Xi:pd.DataFrame, yi:pd.DataFrame, feature_names, has_unit_vector, subject_name = "subject", select=0) :
  # generate text type
    value_feature_text_list = [text_type_select(value, feature, select=1) if has_unit_vector[i] \
                               else text_type_select(value, feature, select=0) \
                               for i, (value, feature) in enumerate(zip(Xi, feature_names))]

    # generate text
    text = "The " + subject_name + " with "
    for vf in value_feature_text_list[:-1] :
        text += vf + ", "
    text = text[:-2]
    text += " and " + value_feature_text_list[-1]
    if select == 0 :
      text += " is " + str(yi)
    return text

# Transformation d'une ligne en texte
def df_row_to_text(Xi:pd.DataFrame, yi:pd.DataFrame, feature_names, subject_name = "subject", has_unit=False, select=0) :
    """ Pour une ligne d'un DataFrame, génère un texte expliquant la ligne

    Args:
        - `Xi` (pd.DataFrame): Un exemple de features
        - `yi` (pd.DataFrame): Un label
        - `feature_names` (List[str]): une liste de colonnes
        - `subject_name` (str, optional): Le nom du sujet pour l'affichage. Defaults to "subject".
        - `has_unit` (bool, optional): _description_. Defaults to True.

    Returns:
        str: Le texte généré
    """

    # features name - unit
    if has_unit :
        try :
            regex_unit_pattern = r'([\w ]+)\s+\((\w+)\)$'
            feature_names = [re.search(regex_unit_pattern, feature).groups() for feature in feature_names]
            has_unit_vector = [True if len(tuple) == 2 else False for tuple in feature_names]
        except :
            has_unit_vector = [False for _ in feature_names]
    else :
        has_unit_vector = [False for _ in feature_names]

    return row_text_type_select(Xi, yi, feature_names, has_unit_vector, subject_name = subject_name, select=select)

# Liste de textes
def df_texts_list(X:pd.DataFrame, y:pd.DataFrame, **kwargs) :
    """ Génère la liste de textes à partir des features X et des labels y

    Args:
        - `X` (pd.DataFrame): features
        - `y` (pd.DataFrame): labels
        - `subject_name` (str, optional): _description_. Defaults to "subject".
        - `has_unit` (bool, optional): _description_. Defaults to False.

    Returns:
        List[str]: une liste de textes des lignes du dataset
    """
    return [df_row_to_text(X.iloc[i], y.iloc[i], X.columns, **kwargs) for i in range(len(X))]

# DataFrame Texte (Main)
def data_to_df_text(data, target=None, feature_names=None, target_names=None, **kwargs) :
    """

    Args:
        - `data` (Dict | pd.DataFrame): Si `target` n'est pas spécifié, alors un dictionnaire contenant
        'data', 'target', 'feature_names' et 'target_names', sinon un DataFrame de features
        - `target` (pd.DataFrame, optional): Un DataFrame de labels. Defaults to None.
        - `feature_names` (List[str], optional): Une liste de noms de features. Defaults to None.
        - `target_names` (List[str], optional): Une liste de noms de labels. Defaults to None.

    Returns:
        pd.Series: liste de textes transformés en pd.Series
    """
    if target is None :
        if 'feature_names' in data :
            X = pd.DataFrame(data['data'], columns=data['feature_names'])
        else :
            X = pd.DataFrame(data['data'])
        y = pd.Series(data['target'])
        if 'target_names' in data :
            y = y.map({i:label for i, label in enumerate(data['target_names'])})
    else :
        if feature_names is not None :
            X = pd.DataFrame(data, columns=feature_names)
        else :
            X = pd.DataFrame(data)
        y = pd.Series(target)
        if target_names is not None :
            y = y.map({i:label for i, label in enumerate(target_names)})
    return pd.Series(df_texts_list(X, y, **kwargs))

# DataFrame Display (Main)
def display_df(df:pd.DataFrame) :
    """ Permet de voir le DataFrame en entier

    Args:
        - `df` (pd.DataFrame): Le DataFrame à afficher
    """
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_colwidth', None)
    print(df)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_colwidth')

# -------------------------------------------
# TRAINING

# Training (Main) à améliorer (comme ajouter des métriques)
def train_clf(clf, X_train, y_train) :
    """_summary_

    Args:
        - `clf` (_type_): classifieur à entraîner
        - `X_train` (pd.DataFrame): features
        - `y_train` (pd.DataFrame): labels
    """
    clf.fit(X_train, y_train)
    print(clf.predict(X_train))

# -------------------------------------------
# TRAITEMENT

# Récupérer la liste de features et le nom de label de la question
def question_to_list(q) :
    """ Décompose la question en liste de features et détecte le nom type de label

    Args:
        - `q` (str): La question sous le format : "When we have <feature_1>=<value_1>,
        <feature_2>=<value_2>, ..., <feature_n>=<value_n>, what should be the <label_type_name> ?"

    Returns:
        List[str], str: une liste de features et un nom type de label
    """
    pattern = r'be (.*) \?'
    match = re.search(pattern, q)
    a_label = match.group(1)

    pattern = r'have (.*), what'
    match = re.search(pattern, q.strip())
    a_features = match.group(1)
    a_list_features = a_features.split(',')
    a_list_features = [feature.strip() for feature in a_list_features]

    return a_list_features, a_label


# Transformation d'une ligne de Dataset en question (pour pouvoir tester et vérifier)
def df_row_to_question(X:pd.DataFrame, feature_names, subject="subject") :
    # Question --> When we have sepal length (cm)=0.3, sepal width (cm)=0.4, petal length (cm)=0.1, petal width (cm)=0.2, what should be the iris type ?
    # Réponse --> When we have sepal length (cm)=0.3, sepal width (cm)=0.4, petal length (cm)=0.1, petal width (cm)=0.2, the iris type is setosa

    list_res = []
    for index, row in X.iterrows() :
        res = "When we have "
        # print("----------------")
        for feature, value in zip(feature_names, row) :
            # print(feature, ":", value)
            res += feature + "=" + str(value) + ", "
        res += "what should be the " + subject + " ?"
        list_res.append(res)
    return list_res

# Transformer en DataFrame pour faire passer dans le calcul de prédiction
def question_to_df(q) :
    """ Transforme la question en DataFrame permettant de mieux analyser (faire passer en predict d'un classifieur)

    Args:
        - `q` (str): La question sous le format : "When we have <feature_1>=<value_1>,
        <feature_2>=<value_2>, ..., <feature_n>=<value_n>, what should be the <label_type_name> ?"

    Returns:
        pd.DataFrame: un DataFrame correspondant à la question
    """
    a_list_features, _ = question_to_list(q)
    a_list_features_split = [feature.split('=') for feature in a_list_features]
    a_features_names = [feature[0] for feature in a_list_features_split]
    try :
        a_features_values = [float(feature[1]) for feature in a_list_features_split]
    except :
        a_features_values = [feature[1] for feature in a_list_features_split]
    return pd.DataFrame(data=np.array([a_features_values]), columns=a_features_names)

# Prédiction de la réponse
def q_df_to_answer(clf, df) :
    """ Prédit le label grâce au DataFrame généré par la question

    Args:
        - `clf` (_type_): un classifieur
        - `df` (pd.DataFrame): un DataFrame

    Returns:
        str: le label prédit
    """
    return clf.predict(df)[0]

# Format de q et a en texte
def answer_to_text(q, a) :
    """ Permet de rendre les questions et le label réponse dans un format qui donne la réponse

    Args:
        - `q` (str): question
        - `a` (str): label réponse

    Returns:
        str: le format de la réponse que le modèle doit donner
    """
    _, a_label = question_to_list(q)
    pattern = r'what .* \?'
    qa = re.sub(pattern, '', q.strip())
    return qa.strip() + " " + a_label + " is " + str(a)

# Réponse du programme à partir de la question
def traitement_question(clf, q) :
    """ Traitement de la question `q` qui utilise un classifieur `clf` pour générer une réponse

    Args:
        - `clf` (_type_): classifieur
        - `q` (str): question

    Returns:
        str: la réponse à la question `q`
    """
    q_df = question_to_df(q)
    a = q_df_to_answer(clf, q_df)
    return answer_to_text(q, a)

In [3]:
# Fine-tuning du modèle
def train_premodel(model_name="gpt2", dataset_path="./iris_dataset.txt", save_path="./fine-tuned-gpt2") :
    """ Permet de fine-tuner un modèle pré-entraîné de hugging face, ici c'est plus spécifiquement GPT-2

    Args:
        - `model_name` (str, optional): le nom du modèle de hugging face. Defaults to "gpt2".
        - `dataset_path` (str, optional): le chemin d'accès du dataset. Defaults to "./iris_dataset.txt".
        - `save_path` (str, optional): le chemin d'accès pour sauvegarder le modèle fine-tuné. Defaults to "./fine-tuned-gpt2".
    """

    # Load pre-trained GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    dataset = TextDataset(tokenizer=tokenizer, file_path=dataset_path, block_size=128)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=save_path,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        save_total_limit=2,
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    model.save_pretrained(save_path)

# Récupérer le model et le tokenizer
def get_model_tokenizer(model_path="./fine-tuned-gpt2", tokenizer_path="gpt2") :
    """ Récupérer le modèle et le tokenizer

    Args:
        - `model_path` (str, optional): le chemin d'accès du modèle fine-tuné. Defaults to "./fine-tuned-gpt2".
        - `tokenizer_path` (str, optional): le chemin d'accès du tokenizer. Defaults to "gpt2".

    Returns:
        _type_: le model et le tokenizer
    """
    # Load fine-tuned GPT-2 model and tokenizer
    model_path = model_path
    tokenizer_path = tokenizer_path
    model = GPT2LMHeadModel.from_pretrained(model_path, local_files_only=True)
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return model, tokenizer

# Tester le modèle
def test_premodel(model, tokenizer, prompt="The flower with", max_length=100, temperature=1, top_k=1) :
    """ Teste le modèle fine-tuné

    Args:
        - `model` (_type_): le modèle
        - `tokenizer` (_type_): le tokenizer
        - `prompt` (str, optional): le prompt avec lequel le modèle va générer un texte. Defaults to "The flower with".
        - `max_length` (int, optional): longueur maximale de la réponse. Defaults to 100.
        - `temperature` (int, optional): température de la réponse. Defaults to 1.
        - `top_k` (int, optional): le top k de la réponse. Defaults to 1.
    """

    # Generate text samples
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=temperature, top_k=top_k) # top_k=50

    # Decode generated output
    generated_texts = tokenizer.batch_decode(output, skip_special_tokens=True)

    # Print generated texts
    print(generated_texts[0])

## 1. Préparation du Dataset

In [4]:
# 1. Chargement du Dataset Iris

iris = datasets.load_iris()
X = pd.DataFrame(iris['data'], columns=iris['feature_names'])
y = pd.Series(iris['target']).map({i:label for i, label in enumerate(iris['target_names'])})

In [5]:
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [6]:
# 2.3. Création d'un nouveau DF

iris_text_df = data_to_df_text(iris, subject_name="flower", has_unit=True)
iris_text_df.head(10)

0    The flower with 5.1 cm of sepal length, 3.5 cm...
1    The flower with 4.9 cm of sepal length, 3.0 cm...
2    The flower with 4.7 cm of sepal length, 3.2 cm...
3    The flower with 4.6 cm of sepal length, 3.1 cm...
4    The flower with 5.0 cm of sepal length, 3.6 cm...
5    The flower with 5.4 cm of sepal length, 3.9 cm...
6    The flower with 4.6 cm of sepal length, 3.4 cm...
7    The flower with 5.0 cm of sepal length, 3.4 cm...
8    The flower with 4.4 cm of sepal length, 2.9 cm...
9    The flower with 4.9 cm of sepal length, 3.1 cm...
dtype: object

## 2. Avec modèles de langage pré-entraînés

### GPT-2

#### Tests sans Fine-tuning

In [22]:
# Test GPT-2 without fine-tuning

# ---------------------------------
# First row : The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a setosa.
# Row 51 : The flower with 7.0 cm of sepal length, 3.2 cm of sepal width, 4.7 cm of petal length and 1.4 cm of petal width is a versicolor.
# Row 101 : The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica.
# ---------------------------------

# Récupérer le modèle
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [23]:
test_premodel(model, tokenizer, "Once upon a time", max_length=150)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger.


In [24]:
test_premodel(model, tokenizer, "The flower with", max_length=150)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The flower with the red flower is the same as the one with the blue flower.

The flower with the red flower is the same as the one with the blue flower. The flower with the green flower is the same as the one with the red flower.

The flower with the green flower is the same as the one with the red flower. The flower with the yellow flower is the same as the one with the yellow flower.

The flower with the yellow flower is the same as the one with the yellow flower. The flower with the purple flower is the same as the one with the purple flower.

The flower with the purple flower is the same as the one with the purple flower. The flower with the red flower


In [25]:
test_premodel(model, tokenizer, "The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a ", max_length=300)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a ichthyosaur.

The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a ichthyosaur. The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a ichthyosaur.

The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a ichthyosaur. The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a ichthyosaur.

The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a ichthyosaur. The flower with 6.3 cm of sepal length, 3.3 cm of se


In [26]:
test_premodel(model, tokenizer, "The iris with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a ", max_length=150)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The iris with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a vernal iris. The iris is a small, round, oval shaped iris with a diameter of about 1.5 cm. The iris is a small, round, oval shaped iris with a diameter of about 1.5 cm. The iris is a small, round, oval shaped iris with a diameter of about 1.5 cm. The iris is a small, round, oval shaped iris with a diameter of about 1.5 cm. The iris is a small, round, oval shaped


In [27]:
test_premodel(model, tokenizer, "The iris with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because", max_length=150)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The iris with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because it is a virginica with a very high degree of sepal length.

The iris with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because it is a virginica with a very high degree of sepal length. The iris with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length


#### Fine-tuning

In [32]:
directory = 'ft_models'
if not os.path.exists(directory) :
    os.mkdir(path=directory)

In [33]:
# 4. Ecriture dans un fichier dataset.txt
# Parce qu'on fait passer un fichier de texte pour le training et non un dataset avec un texte et un label

iris_text_df = data_to_df_text(iris, subject_name="flower", has_unit=True, select=0)
with open('ft_models/iris_dataset.txt', 'w') as file :
    for text in iris_text_df :
        file.write(text + '.\n')

In [34]:
# Définition des paths et création de nouveaux dossiers

try :
    with open('ft_models/counter.txt', 'r') as file :
        counter = file.read().strip()
except :
    with open('ft_models/counter.txt', 'w') as file :
        file.write('1')
        counter = '1'

with open('ft_models/counter.txt', 'w') as file :
    file.write(str(int(counter)+1))

folder = './ft_models/'
folder_model = 'model' + counter + '/'
model_name = 'gpt2'
dataset_path = 'iris_dataset.txt'
save_path = 'fine-tuned-gpt2'

path = os.path.join(folder, folder_model)
os.mkdir(path)

print("Counter :", counter, "| Update the counter file to overwrite old models folders")
print("Directory '% s' created" % (folder + folder_model))
print("model_name : %s" % model_name)
print("dataset_path : %s" % folder + dataset_path)
print("save_path : %s" % folder + folder_model + save_path)

Counter : 2 | Update the counter file to overwrite old models folders
Directory './ft_models/model2/' created
model_name : gpt2
dataset_path : ./ft_models/iris_dataset.txt
save_path : ./ft_models/model2/fine-tuned-gpt2


In [35]:
# 5. Hugging Face GPT-2 Training (mettre un temps pour calculer le temps du training)
# 15 min sur le dataset iris

train_premodel(
    model_name = model_name,
    dataset_path = folder + dataset_path,
    save_path = folder + folder_model + save_path
)

Step,Training Loss


#### Tests avec Fine-tuning

In [36]:
# 6.1. Hugging Face GPT-2 Testing

# counter = '1' # Vaut mieux directement modifier le fichier 'counter.txt' mais comme ça, ça fonctionne aussi

folder = './ft_models/'
folder_model = 'model' + counter + '/'
model_name = 'gpt2'
dataset_path = 'iris_dataset.txt'
save_path = 'fine-tuned-gpt2'

model, tokenizer = get_model_tokenizer(
    model_path = folder + folder_model + save_path,
    tokenizer_path = model_name
)

In [37]:
# 6.2. Hugging Face GPT-2 Testing

# ---------------------------------
# First row : The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a setosa.
# Row 51 : The flower with 7.0 cm of sepal length, 3.2 cm of sepal width, 4.7 cm of petal length and 1.4 cm of petal width is a versicolor.
# Row 101 : The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica.
# ---------------------------------

In [38]:
# Notre base de données commence par cette phrase

test_premodel(model, tokenizer, "The flower with", max_length=300)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a flower with a fl

In [39]:
# Sample from Test set

test_premodel(model, tokenizer, "The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is", max_length=300)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of pet


In [40]:
# Sample from Test set

test_premodel(model, tokenizer, "The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is virginica because ", max_length=300)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because  it has a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because  it has a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because  it has a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because  it has a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because  it has a flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is a virginica because  it has a flower with 6.3 cm of sepal length, 3.3 cm of se


In [41]:
# Sample from Train set

test_premodel(model, tokenizer, "The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is", max_length=300)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of pet


### BERT

#### Mask-Filling

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load the pre-trained BERT
model_name = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Input text
input_text = "The flower with 7.0 cm of sepal length, 3.2 cm of sepal width, 4.7 cm of petal length and 1.4 cm of petal width is a [MASK]."
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Inference
with torch.no_grad():
  outputs = model(input_ids)

# Get the predicted logits
masked_index = input_ids[0].tolist().index(tokenizer.mask_token_id)
masked_logits = outputs.logits[0, masked_index]

top_k = 10
top_k_indices = masked_logits.topk(top_k).indices.tolist()
predicted_tokens = [tokenizer.convert_ids_to_tokens(index) for index in top_k_indices]

print("Predicted tokens for the masked position:", predicted_tokens)

#### Preparations

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
iris_text_df = data_to_df_text(iris, subject_name="flower", has_unit=True, select=0)
X = list(iris_text_df)
y = list(iris['target'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
display_df(iris_text_df)

0          The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is setosa
1          The flower with 4.9 cm of sepal length, 3.0 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is setosa
2          The flower with 4.7 cm of sepal length, 3.2 cm of sepal width, 1.3 cm of petal length and 0.2 cm of petal width is setosa
3          The flower with 4.6 cm of sepal length, 3.1 cm of sepal width, 1.5 cm of petal length and 0.2 cm of petal width is setosa
4          The flower with 5.0 cm of sepal length, 3.6 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is setosa
5          The flower with 5.4 cm of sepal length, 3.9 cm of sepal width, 1.7 cm of petal length and 0.4 cm of petal width is setosa
6          The flower with 4.6 cm of sepal length, 3.4 cm of sepal width, 1.4 cm of petal length and 0.3 cm of petal width is setosa
7          The flower with 5.0 cm of sepal length, 3.4 cm of sepal wi

In [10]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [11]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [12]:
from sklearn import metrics

def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = metrics.accuracy_score(y_true=labels, y_pred=pred)
    recall = metrics.recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = metrics.precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = metrics.f1_score(y_true=labels, y_pred=pred, average='micro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

#### Fine-tuning

In [13]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [14]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=15, training_loss=1.0538679758707683, metrics={'train_runtime': 127.0202, 'train_samples_per_second': 0.945, 'train_steps_per_second': 0.118, 'total_flos': 2775024327600.0, 'train_loss': 1.0538679758707683, 'epoch': 1.0})

In [15]:
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.7977274060249329,
 'eval_accuracy': 1.0,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 4.4884,
 'eval_samples_per_second': 6.684,
 'eval_steps_per_second': 0.891,
 'epoch': 1.0}

In [16]:
np.set_printoptions(suppress=True)

#### Tests

In [17]:
def get_predictions(model, tokenizer, text) :
  inputs = tokenizer(text, padding = True, truncation = True, return_tensors='pt') # .to('cuda')
  outputs = model(**inputs)
  predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
  predictions = predictions.cpu().detach().numpy()
  return predictions

In [18]:
def accuracy_predictions(X, y, model, tokenizer) :
  cpt = 0
  for text, label in zip(X, y) :
    predictions = get_predictions(model, tokenizer, text)[0]
    index_max = predictions.argmax()
    if index_max == label :
      cpt += 1
  return cpt / len(y)

In [19]:
# First row, Expected : Setosa, Got : Setosa
text = "The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width"
print(get_predictions(model, tokenizer, text))

# Row 51, Expected : Versicolor, Got : Setosa
text = "The flower with 7.0 cm of sepal length, 3.2 cm of sepal width, 4.7 cm of petal length and 1.4 cm of petal width"
print(get_predictions(model, tokenizer, text))

# Row 101, Expected : Virginica, Got : Virginica
text = "The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width"
print(get_predictions(model, tokenizer, text))

[[0.29870337 0.36094555 0.34035105]]
[[0.29853427 0.36039737 0.34106833]]
[[0.29880163 0.3596052  0.34159327]]


In [20]:
# Expected : Setosa, Got : Setosa
text = "setosa"
print(get_predictions(model, tokenizer, text))

# Expected : Versicolor, Got : Versicolor
text = "versicolor"
print(get_predictions(model, tokenizer, text))

# Expected : Virginica, Got : Virginica
text = "virginica"
print(get_predictions(model, tokenizer, text))

[[0.41006058 0.2845194  0.30542   ]]
[[0.29544485 0.37491223 0.32964292]]
[[0.31339547 0.3141908  0.37241375]]


In [21]:
iris_text_df = data_to_df_text(iris, subject_name="flower", has_unit=True, select=1)
X = list(iris_text_df)
y = list(iris['target'])

accuracy_predictions(X, y, model, tokenizer)

0.3333333333333333

#### Sauvegarde du modèle

In [None]:
save_path="./fine-tuned-bert-iris"
trainer.save_model(save_path)

#### Chargement du modèle

In [None]:
save_path="./fine-tuned-bert-iris"
model_2 = BertForSequenceClassification.from_pretrained(save_path)

In [None]:
text = "The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is a"
get_predictions(model_2, tokenizer, text)

array([[0.7248513 , 0.09858475, 0.17656392]], dtype=float32)