In [126]:
import os
import re

import numpy as np
import pandas as pd

# Importations datasets
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

# Pré-modèles !
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [86]:
# -------------------------------------------
# TRANSFORMATION DATAFRAME

# Transformation de la valeur en texte
def text_type_select(value, feature, select=0):
    """Render one (value, feature) pair as a short text fragment.

    Two layouts are available, chosen with ``select``:
    0. "<value> <feature>"
    1. "<value> <unit> of <name>", where ``feature`` is a ``(name, unit)`` pair

    Args:
        value: The feature value; it is converted with ``str``.
        feature: The feature name (layout 0) or a ``(name, unit)`` pair (layout 1).
        select (int, optional): Layout to use. Defaults to 0.

    Returns:
        str: The rendered fragment, or ``None`` when ``select`` is neither
        0 nor 1.
    """
    if select == 1:
        name, unit = feature
        pieces = (str(value), " ", str(unit), " of ", str(name))
    elif select == 0:
        pieces = (str(value), " ", str(feature))
    else:
        # Unknown layout: nothing is rendered.
        return None
    return "".join(pieces)
    
# Transformation d'une ligne en texte
def df_row_to_text(Xi:pd.DataFrame, yi:pd.DataFrame, feature_names, subject_name = "subject", has_unit=False) :
    """Build an English sentence describing one sample and its label.

    The sentence has the form
    "The <subject_name> with <f1>, <f2> and <fn> is <yi>".
    With a single feature there is no comma or "and"; with no feature at all
    the "with ..." clause is left out.

    Args:
        Xi: The feature values of one sample, iterated in order (for example
            one DataFrame row or a 1-D array).
        yi: The label of the sample; it is converted with ``str``.
        feature_names: The feature names, in the same order as ``Xi``.
        subject_name (str, optional): Noun used for the sample in the
            sentence. Defaults to "subject".
        has_unit (bool, optional): When True, a feature name written as
            "<name> (<unit>)" is rendered as "<value> <unit> of <name>".
            Names that do not follow this pattern (or are not strings) keep
            the plain "<value> <feature>" layout. Defaults to False.

    Returns:
        str: The generated sentence.
    """
    # "<name> (<unit>)" at the end of the feature name, e.g. "sepal length (cm)".
    regex_unit_pattern = r'([\w ]+)\s+\((\w+)\)$'

    # One text fragment per feature; the unit layout is decided feature by
    # feature so that one name without a unit does not disable it for all.
    value_feature_text_list = []
    for value, feature in zip(Xi, feature_names) :
        unit_match = None
        if has_unit and isinstance(feature, str) :
            unit_match = re.search(regex_unit_pattern, feature)
        if unit_match is not None :
            value_feature_text_list.append(text_type_select(value, unit_match.groups(), select=1))
        else :
            value_feature_text_list.append(text_type_select(value, feature, select=0))

    # Assemble the sentence: "a, b and c" for several features, "a" for one.
    text = "The " + subject_name
    if value_feature_text_list :
        if len(value_feature_text_list) == 1 :
            description = value_feature_text_list[0]
        else :
            description = ", ".join(value_feature_text_list[:-1]) + " and " + value_feature_text_list[-1]
        text += " with " + description
    text += " is " + str(yi) # a / an

    return text

# Liste de textes
def df_texts_list(X:pd.DataFrame, y:pd.DataFrame, **kwargs) :
    """Generate one descriptive sentence per row of ``X``.

    Args:
        X (pd.DataFrame): The samples, one per row; the column labels are
            used as feature names.
        y: The labels, read positionally with ``.iloc`` so that row ``i`` of
            ``X`` is paired with entry ``i`` of ``y``.
        **kwargs: Forwarded unchanged to ``df_row_to_text`` (for example
            ``subject_name`` and ``has_unit``).

    Returns:
        list: The sentences, in row order.
    """
    texts = []
    for position in range(len(X)) :
        sample = X.iloc[position]
        label = y.iloc[position]
        texts.append(df_row_to_text(sample, label, X.columns, **kwargs))
    return texts

# DataFrame Texte (Main)
def data_to_df_text(data, **kwargs) :
    """Convert a dataset mapping into a Series of descriptive sentences.

    Args:
        data: Mapping with a 'data' entry (the samples) and a 'target' entry
            (the labels). An optional 'feature_names' entry provides the
            column names, and an optional 'target_names' entry maps the
            integer position of each name to the label text.
        **kwargs: Forwarded unchanged to ``df_texts_list``.

    Returns:
        pd.Series: One sentence per sample.
    """
    # columns=None is the DataFrame default, i.e. automatic column labels.
    column_names = data['feature_names'] if 'feature_names' in data else None
    features = pd.DataFrame(data['data'], columns=column_names)

    labels = pd.Series(data['target'])
    if 'target_names' in data :
        labels = labels.map(dict(enumerate(data['target_names'])))

    return pd.Series(df_texts_list(features, labels, **kwargs))

# DataFrame Display (Main)
def display_df(df:pd.DataFrame) :
    """Print ``df`` in full, without row or column-width truncation.

    The pandas display options are changed only for the duration of the
    print: ``option_context`` restores the values that were active before the
    call, even when printing raises.

    Args:
        df: The pandas object to print.
    """
    with pd.option_context('display.max_rows', None, 'display.max_colwidth', None) :
        print(df)

# -------------------------------------------
# TRAINING

# Training (Main)
def train_clf(clf, X_train, y_train) :
    """Fit ``clf`` and print its predictions on the training samples.

    Args:
        clf: An estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: The training samples.
        y_train: The training labels.
    """
    clf.fit(X_train, y_train)
    training_predictions = clf.predict(X_train)
    print(training_predictions)

# -------------------------------------------
# TRAITEMENT

# Récupérer la liste de features et le nom de label de la question
def question_to_list(q) :
    """Extract the feature assignments and the label name from a question.

    The question is expected to contain "have <features>, what" and
    "be <label> ?" (with a space before the question mark), for example
    "When we have a=1, b=2, what should be the type ?".

    Args:
        q (str): The question text.

    Returns:
        tuple: ``(features, label)`` where ``features`` is the list of
        comma-separated, whitespace-stripped chunks found after "have", and
        ``label`` is the text captured between "be " and " ?".

    Raises:
        AttributeError: If one of the two patterns is not found in ``q``.
    """
    label_match = re.search(r'be (.*) \?', q)
    a_label = label_match.group(1)

    features_match = re.search(r'have (.*), what', q.strip())
    raw_features = features_match.group(1)

    return [chunk.strip() for chunk in raw_features.split(',')], a_label

# Transformer en DataFrame pour faire passer dans le calcul de prédiction
def question_to_df(q) :
    """Turn the "<name>=<value>" pairs of a question into a one-row DataFrame.

    Whitespace around the "=" sign is ignored, so "width = 0.4" and
    "width=0.4" give the same column name and value.

    Args:
        q (str): The question text, in the format parsed by
            ``question_to_list``.

    Returns:
        pd.DataFrame: A single row whose columns are the feature names. The
        values are floats when every value parses as a float; otherwise all
        values are kept as strings.

    Raises:
        IndexError: If a feature chunk contains no "=" sign.
    """
    a_list_features, _ = question_to_list(q)
    a_list_features_split = [feature.split('=') for feature in a_list_features]
    a_features_names = [feature[0].strip() for feature in a_list_features_split]
    raw_values = [feature[1].strip() for feature in a_list_features_split]
    try :
        a_features_values = [float(value) for value in raw_values]
    except ValueError :
        # At least one value is not numeric: keep every value as text.
        a_features_values = raw_values
    return pd.DataFrame(data=np.array([a_features_values]), columns=a_features_names)

# Prédiction de la réponse
def q_df_to_answer(clf, df) :
    """Return the first prediction of ``clf`` for the samples in ``df``.

    Args:
        clf: An estimator exposing ``predict``.
        df: The samples passed to ``clf.predict``.

    Returns:
        The element at index 0 of the prediction result.
    """
    predictions = clf.predict(df)
    first_prediction = predictions[0]
    return first_prediction

# Format the predicted answer as a sentence
def answer_to_text(q, a) :
    """Rewrite a question as a statement that ends with the answer.

    The "what ... ?" part of the question is removed and replaced by
    "<label> is <a>", where the label is the one extracted by
    ``question_to_list``.

    Args:
        q (str): The question text.
        a: The answer; it is converted with ``str``.

    Returns:
        str: The statement.
    """
    _, label = question_to_list(q)
    statement = re.sub(r'what .* \?', '', q.strip()).strip()
    return "".join([statement, " ", label, " is ", str(a)])

# Réponse du programme à partir de la question
def traitement_question(clf, q) :
    """Answer a question in text form using a fitted classifier.

    The question is converted to a one-row DataFrame, passed to the
    classifier, and the prediction is inserted back into the question text.

    Args:
        clf: A fitted estimator exposing ``predict``.
        q (str): The question text.

    Returns:
        str: The question rewritten as a statement containing the prediction.
    """
    predicted_label = q_df_to_answer(clf, question_to_df(q))
    return answer_to_text(q, predicted_label)

In [121]:
# Fine-tuning du modèle
def train_premodel(model_name="gpt2", dataset_path="./iris_dataset.txt", save_path="./fine-tuned-gpt2",
                   block_size=128, num_train_epochs=3, per_device_train_batch_size=4) :
    """Fine-tune a GPT-2 language model on a text file and save the result.

    Args:
        model_name (str, optional): Name or path of the pre-trained model and
            tokenizer to start from. Defaults to "gpt2".
        dataset_path (str, optional): Path of the text file used as training
            data. Defaults to "./iris_dataset.txt".
        save_path (str, optional): Folder used both as the trainer output
            directory and as the destination of the final model. Defaults to
            "./fine-tuned-gpt2".
        block_size (int, optional): ``block_size`` given to ``TextDataset``.
            Defaults to 128.
        num_train_epochs (int, optional): Number of training epochs.
            Defaults to 3.
        per_device_train_batch_size (int, optional): Batch size per device.
            Defaults to 4.
    """

    # Load pre-trained GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    dataset = TextDataset(tokenizer=tokenizer, file_path=dataset_path, block_size=block_size)
    # mlm=False: the collator is asked not to apply masked-language-model masking.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=save_path,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=10_000,
        save_total_limit=2,
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    # Only the model is saved here; the tokenizer is not written to save_path.
    model.save_pretrained(save_path)

# Récupérer le model et le tokenizer
def get_model_tokenizer(model_path="./fine-tuned-gpt2", tokenizer_path="gpt2") :
    """Load a fine-tuned GPT-2 model and a GPT-2 tokenizer.

    Args:
        model_path (str, optional): Folder of the fine-tuned model; it is
            loaded with ``local_files_only=True``. Defaults to
            "./fine-tuned-gpt2".
        tokenizer_path (str, optional): Name or path of the tokenizer.
            Defaults to "gpt2".

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # The model is loaded first, then the tokenizer.
    return (
        GPT2LMHeadModel.from_pretrained(model_path, local_files_only=True),
        GPT2Tokenizer.from_pretrained(tokenizer_path),
    )

# Tester le modèle
def test_premodel(model, tokenizer, prompt="The flower with", max_length=100, temperature=1, top_k=1) :

    # Generate text samples
    prompt = "The flower with"
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=temperature, top_k=top_k) # top_k=50

    # Decode generated output
    generated_texts = tokenizer.batch_decode(output, skip_special_tokens=True)

    # Print generated texts
    for i, text in enumerate(generated_texts):
        print(f"Generated Text {i+1}: {text}")

## 1. Préparation du Dataset

In [70]:
# 1. Chargement du Dataset Iris

iris = datasets.load_iris()
print(list(iris))  # keys available in the loaded dataset
print(iris['feature_names'])
print(iris['target_names'])

# Integer class code -> species name, used to get readable labels.
species_by_code = dict(enumerate(iris['target_names']))
X = pd.DataFrame(iris['data'], columns=iris['feature_names'])
y = pd.Series(iris['target']).map(species_by_code)

['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


## 2. Fabrication du Dataset de textes

In [71]:
# 2.1. Transformation d'une ligne

row = 145
# The raw arrays are used here, so the label is the integer class code
# rather than the species name. Both layouts are shown: without and with units.
for with_unit in (False, True) :
    print(df_row_to_text(
        iris['data'][row],
        iris['target'][row],
        iris['feature_names'],
        subject_name="flower",
        has_unit=with_unit,
    ))

The flower with 6.7 sepal length (cm), 3.0 sepal width (cm), 5.2 petal length (cm) and 2.3 petal width (cm) is 2
The flower with 6.7 cm of sepal length, 3.0 cm of sepal width, 5.2 cm of petal length and 2.3 cm of petal width is 2


In [72]:
# 2.2. Conversion des samples en texte

iris_text_list = df_texts_list(X, y, subject_name="flower", has_unit=True)

# Preview the first five and the last five sentences.
preview = iris_text_list[:5] + iris_text_list[-5:]
for text in preview :
    print(text)

The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is setosa
The flower with 4.9 cm of sepal length, 3.0 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is setosa
The flower with 4.7 cm of sepal length, 3.2 cm of sepal width, 1.3 cm of petal length and 0.2 cm of petal width is setosa
The flower with 4.6 cm of sepal length, 3.1 cm of sepal width, 1.5 cm of petal length and 0.2 cm of petal width is setosa
The flower with 5.0 cm of sepal length, 3.6 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is setosa
The flower with 6.7 cm of sepal length, 3.0 cm of sepal width, 5.2 cm of petal length and 2.3 cm of petal width is virginica
The flower with 6.3 cm of sepal length, 2.5 cm of sepal width, 5.0 cm of petal length and 1.9 cm of petal width is virginica
The flower with 6.5 cm of sepal length, 3.0 cm of sepal width, 5.2 cm of petal length and 2.0 cm of petal width is virginica
The flower with

In [87]:
# 2.3. Création d'un nouveau DF

# Same sentences as above, built directly from the dataset object and
# returned as a pandas Series.
iris_text_df = data_to_df_text(iris, has_unit=True, subject_name="flower")
iris_text_df.head(n=10)

0    The flower with 5.1 cm of sepal length, 3.5 cm...
1    The flower with 4.9 cm of sepal length, 3.0 cm...
2    The flower with 4.7 cm of sepal length, 3.2 cm...
3    The flower with 4.6 cm of sepal length, 3.1 cm...
4    The flower with 5.0 cm of sepal length, 3.6 cm...
5    The flower with 5.4 cm of sepal length, 3.9 cm...
6    The flower with 4.6 cm of sepal length, 3.4 cm...
7    The flower with 5.0 cm of sepal length, 3.4 cm...
8    The flower with 4.4 cm of sepal length, 2.9 cm...
9    The flower with 4.9 cm of sepal length, 3.1 cm...
dtype: object

In [97]:
# Autres Exemples

# (loader, subject name) pairs for a few other classification datasets.
example_datasets = [
    (datasets.load_digits, "number"),
    # (datasets.load_diabetes, "diabetes type"),  # Is regression problem
    (datasets.load_wine, "wine"),
    (datasets.load_breast_cancer, "breast cancer type"),
]

# Show the first sentences generated for each dataset.
for load_dataset, subject in example_datasets :
    display_df(data_to_df_text(load_dataset(), subject_name=subject, has_unit=False).head())

0        The number with 0.0 pixel_0_0, 0.0 pixel_0_1, 5.0 pixel_0_2, 13.0 pixel_0_3, 9.0 pixel_0_4, 1.0 pixel_0_5, 0.0 pixel_0_6, 0.0 pixel_0_7, 0.0 pixel_1_0, 0.0 pixel_1_1, 13.0 pixel_1_2, 15.0 pixel_1_3, 10.0 pixel_1_4, 15.0 pixel_1_5, 5.0 pixel_1_6, 0.0 pixel_1_7, 0.0 pixel_2_0, 3.0 pixel_2_1, 15.0 pixel_2_2, 2.0 pixel_2_3, 0.0 pixel_2_4, 11.0 pixel_2_5, 8.0 pixel_2_6, 0.0 pixel_2_7, 0.0 pixel_3_0, 4.0 pixel_3_1, 12.0 pixel_3_2, 0.0 pixel_3_3, 0.0 pixel_3_4, 8.0 pixel_3_5, 8.0 pixel_3_6, 0.0 pixel_3_7, 0.0 pixel_4_0, 5.0 pixel_4_1, 8.0 pixel_4_2, 0.0 pixel_4_3, 0.0 pixel_4_4, 9.0 pixel_4_5, 8.0 pixel_4_6, 0.0 pixel_4_7, 0.0 pixel_5_0, 4.0 pixel_5_1, 11.0 pixel_5_2, 0.0 pixel_5_3, 1.0 pixel_5_4, 12.0 pixel_5_5, 7.0 pixel_5_6, 0.0 pixel_5_7, 0.0 pixel_6_0, 2.0 pixel_6_1, 14.0 pixel_6_2, 5.0 pixel_6_3, 10.0 pixel_6_4, 12.0 pixel_6_5, 0.0 pixel_6_6, 0.0 pixel_6_7, 0.0 pixel_7_0, 0.0 pixel_7_1, 6.0 pixel_7_2, 13.0 pixel_7_3, 10.0 pixel_7_4, 0.0 pixel_7_5, 0.0 pixel_7_6 and 0.0 pixel_7_

## 3. Avec modèles classiques et datasets de base

In [75]:
# 3. Modèle : DecisionTreeClassifier

# DecisionTreeClassifier is imported in the notebook's import cell.
# A fixed random_state keeps the fitted tree identical from one run to the next.
clf = DecisionTreeClassifier(random_state=0)

train_clf(clf, X, y)

['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'v

In [76]:
# One-row DataFrame (the slice keeps the column names) used as a sanity check.
# It has its own name because `q` holds the question string in the next cell.
first_sample = X.iloc[0:1]
print(first_sample)
clf.predict(first_sample)

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2


array(['setosa'], dtype=object)

In [77]:
# Question in the format expected by question_to_list:
# "... have <name>=<value>, ..., what should be <label> ?"
q = "When we have sepal length (cm)=0.3, sepal width (cm)=0.4, petal length (cm)=0.1, petal width (cm)=0.2, what should be the iris type ?"
answer_text = traitement_question(clf, q)

print("Question -->", q)
print("Réponse -->", answer_text)

Question --> When we have sepal length (cm)=0.3, sepal width (cm)=0.4, petal length (cm)=0.1, petal width (cm)=0.2, what should be the iris type ?
Réponse --> When we have sepal length (cm)=0.3, sepal width (cm)=0.4, petal length (cm)=0.1, petal width (cm)=0.2, the iris type is setosa


## 4. Avec modèles de langage pré-entraînés (GPT-2)

In [100]:
# 4. Ecriture dans un fichier dataset.txt
# Parce qu'on fait passer un fichier de texte pour le training et non un dataset avec un texte et un label

# Make sure the target folder exists before writing into it.
os.makedirs('ft_models', exist_ok=True)

# One sentence per line, each terminated by a full stop. The encoding is
# explicit so the file does not depend on the platform default.
with open('ft_models/iris_dataset.txt', 'w', encoding='utf-8') as file :
    for text in iris_text_df :
        file.write(text + '.\n')

In [114]:
# Root folder holding the dataset file, the run counter and one sub-folder per run.
folder = './ft_models/'
counter_file = os.path.join(folder, 'counter.txt')
os.makedirs(folder, exist_ok=True)

# Read the current run number; a missing or empty counter file counts as run 0.
if os.path.exists(counter_file) :
    with open(counter_file, 'r') as file :
        counter = file.read().strip() or '0'
else :
    counter = '0'
print(counter)
# Store the next run number so that a later execution gets a fresh folder.
with open(counter_file, 'w') as file :
    file.write(str(int(counter)+1))

folder_model = 'model' + counter + '/'
model_name = 'gpt2'
dataset_path = 'iris_dataset.txt'
save_path = 'fine-tuned-gpt2'

# os.mkdir fails if the run folder already exists, so an earlier run is
# never silently reused.
path = os.path.join(folder, folder_model)
os.mkdir(path)

# The concatenations are parenthesised: "%" binds tighter than "+".
print("Directory '% s' created" % (folder + folder_model))
print("model_name : %s" % model_name)
print("dataset_path : %s" % (folder + dataset_path))
print("save_path : %s" % (folder + folder_model + save_path))

2
Directory './ft_models/model2/' created
model_name : gpt2
dataset_path : ./ft_models/iris_dataset.txt
save_path : ./ft_models/model2/fine-tuned-gpt2


In [None]:
# 5. Hugging Face GPT-2 Training (mettre un temps pour calculer le temps du training)
# 15 min sur le dataset iris

# Fine-tune on the text file written above and save into this run's folder.
train_premodel(
    model_name=model_name,
    dataset_path=f"{folder}{dataset_path}",
    save_path=f"{folder}{folder_model}{save_path}",
)

In [123]:
# 6.1. Hugging Face GPT-2 Testing

# Reload the model saved for this run; the tokenizer comes from the base model name.
model, tokenizer = get_model_tokenizer(
    model_path=f"{folder}{folder_model}{save_path}",
    tokenizer_path=model_name,
)

loading configuration file ./ft_models/model2/fine-tuned-gpt2\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.14.1",
  "use_cache": true,
  "vocab_size": 502

All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at ./ft_models/model2/fine-tuned-gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\sunse/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\sunse/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolv

In [125]:
# 6.2. Hugging Face GPT-2 Testing

# ---------------------------------
# Reference sentences from the training file:
# Row 1   : The flower with 5.1 cm of sepal length, 3.5 cm of sepal width, 1.4 cm of petal length and 0.2 cm of petal width is setosa.
# Row 51  : The flower with 7.0 cm of sepal length, 3.2 cm of sepal width, 4.7 cm of petal length and 1.4 cm of petal width is versicolor.
# Row 101 : The flower with 6.3 cm of sepal length, 3.3 cm of sepal width, 6.0 cm of petal length and 2.5 cm of petal width is virginica.
# ---------------------------------

test_premodel(model, tokenizer, prompt="The flower with")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text 1: The flower with 5.4 cm of sepal length, 3.0 cm of sepal width, 1.5 cm of petal length and 1.5 cm of petal width is versicolor.
The flower with 5.0 cm of sepal length, 3.0 cm of sepal width, 1.5 cm of petal length and 1.5 cm of petal width is versicolor.
The flower with 6.0 cm of sepal length,


## 5. A suivre

In [None]:

# 7. Follow Paper protocols

# 8. Fix errors