<a href="https://colab.research.google.com/github/somosnlp/recursos/blob/main/hackathon_2024/taller_distilabel_y_argilla.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Argilla y distilabel, herramientas para crear modelos como Notus

In [None]:
%pip install argilla distilabel[vllm,argilla] -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.3/254.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for ftfy (setup.py) ... [?25l[?25hdone


## Carga del dataset de ejemplo

En esta sección prepararemos un pequeño dataset de ejemplo a partir del cual crearemos nuestros datasets para SFT y DPO. Cogeremos 500 filas del dataset de la Wikipedia en español y les aplicaremos una pequeña limpieza.

In [None]:
from datasets import Dataset, load_dataset

In [None]:
# Open the "20231101.es" configuration of wikimedia/wikipedia (train split) in streaming mode.
wikipedia = load_dataset("wikimedia/wikipedia", "20231101.es", split="train", streaming=True)

In [None]:
# Rename the "text" column to "content" and shuffle the stream.
# NOTE(review): no seed is passed to shuffle(), so the sampled articles may differ between runs.
wikipedia = wikipedia.rename_column("text", "content").shuffle()

In [None]:
def is_disambiguation_article(text: str) -> bool:
    """Tell whether ``text`` contains one of the disambiguation phrases.

    Args:
        text: Article text to inspect.

    Returns:
        ``True`` if any of the known Spanish disambiguation phrases occurs
        anywhere in ``text`` (case-sensitive substring match), else ``False``.
    """
    markers = (
        "puede hacer referencia",
        "hace referencia a",
        "puede referirse",
    )
    return any(marker in text for marker in markers)

In [None]:
def article_length_between_boundaries(text: str, min_length: int, max_length: int) -> bool:
    """Check that the length of ``text`` lies in ``[min_length, max_length]``.

    Args:
        text: Article text whose length (in characters) is measured.
        min_length: Inclusive lower bound.
        max_length: Inclusive upper bound.

    Returns:
        ``True`` when ``min_length <= len(text) <= max_length``.
    """
    return min_length <= len(text) <= max_length

In [None]:
def clean_text(text: str) -> str:
    """Cut ``text`` at the first trailing-section heading that appears.

    The headings are applied one after another, so the result is the text that
    precedes the earliest occurrence of any of them.

    Args:
        text: Raw article text.

    Returns:
        The text truncated before "Enlaces externos", "Referencias" or
        "Bibliografía" (each preceded by a blank line); unchanged if none occurs.
    """
    trailing_sections = ("\n\nEnlaces externos", "\n\nReferencias", "\n\nBibliografía")
    cleaned = text
    for heading in trailing_sections:
        cleaned, _, _ = cleaned.partition(heading)
    return cleaned

In [None]:
def get_subset(
    dataset,
    article_min_length: int,
    article_max_length: int,
    num_articles: int
) -> list[dict]:
    """Collect up to ``num_articles`` cleaned articles from ``dataset``.

    Rows are taken in iteration order. A row is kept when its ``"content"`` is
    not a disambiguation article and its length lies within the given bounds;
    the kept copy has its ``"content"`` passed through ``clean_text``.

    Args:
        dataset: Iterable of dict-like rows, each with a ``"content"`` key.
        article_min_length: Inclusive minimum length of the raw content.
        article_max_length: Inclusive maximum length of the raw content.
        num_articles: Maximum number of rows to return.

    Returns:
        A list with at most ``num_articles`` rows (empty if ``num_articles``
        is not positive or nothing matches).
    """
    subset = []
    # Nothing to collect: avoid consuming the (possibly streamed) dataset.
    if num_articles <= 0:
        return subset

    # Iterate the dataset that was passed in, not a module-level global.
    for row in dataset:
        text = row["content"]
        if is_disambiguation_article(text):
            continue
        if not article_length_between_boundaries(text, article_min_length, article_max_length):
            continue
        # Store a copy so the caller's row object is left untouched.
        subset.append({**row, "content": clean_text(text)})
        if len(subset) >= num_articles:
            break
    return subset

In [None]:
# Ask for 500 articles whose raw text is between 512 and 1024 characters long.
subset = get_subset(
    wikipedia,
    article_min_length=512,
    article_max_length=1024,
    num_articles=500
)

In [None]:
# Build a `Dataset` from the list of selected rows.
dataset = Dataset.from_list(subset)

## Dataset para SFT utilizando Genstruct 7B

En esta sección generaremos el dataset para SFT utilizando [distilabel](https://github.com/argilla-io/distilabel) y Genstruct 7B. Además, subiremos el dataset generado al Hugging Face Hub 🤗 y a Argilla para realizar una inspección manual de los datos generados.

[Genstruct 7B](https://huggingface.co/NousResearch/Genstruct-7B) es una LLM creada por Nous Research que sirve para generar instrucciones y sus correspondientes respuestas a partir de un corpus.


Esta LLM espera un prompt con el siguiente formato:

```plain
[[[Title]]] célula
[[[Content]]] En el ámbito de la biología, es la unidad más pequeña que puede vivir por sí sola. Forma todos los organismos vivos y los tejidos del cuerpo. Las tres partes principales de la célula son la membrana celular, el núcleo y el citoplasma. La membrana celular rodea la célula y controla las sustancias que entran y salen. Dentro de la célula está el núcleo que contiene el nucléolo y la mayor parte del ADN celular, además es donde se produce casi todo el ARN. El citoplasma es el líquido del interior de la célula que contiene otros elementos diminutos con funciones específicas, como el aparato de Golgi, las mitocondrias y el retículo endoplasmático. En el citoplasma ocurren la mayoría de las reacciones químicas, y también es donde se elaboran la mayoría de las proteínas. El cuerpo humano tiene más de 30 billones de células.

[[[User]]]
```

y generará algo similar a esto:

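```plain
 ¿Cuáles son las tres partes principales de la célula?
[[[Assistant]]] Las tres partes principales de la célula son la membrana celular, el núcleo y el citoplasma.
```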

In [None]:
from vllm import LLM, SamplingParams

### `Task` para utilizar Genstruct con `distilabel`

In [None]:
import re
from typing import Any, Optional
from dataclasses import dataclass
from distilabel.tasks.text_generation.base import TextGenerationTask
from distilabel.tasks.prompt import Prompt
from jinja2 import Template

In [None]:
# Jinja2 source for the Genstruct prompt. It renders the title, the content and
# the system prompt, then any previous messages as "[[[Role]]] text" turns, and
# ends with the tag of the next speaker: "[[[Assistant]]]" when the last message
# has role 'user', "[[[User]]]" otherwise (including when there are no messages).
GENSTRUCT_TEMPLATE = """[[[Title]]] {{ title }}
[[[Content]]] {{ content }}

{{ system_prompt }}

{% if messages -%}{% for message in messages %}[[[{{ message.role.capitalize() }}]]] {{ message.message }}\n\n{% endfor %}{% set last_role = messages[-1].role %}{% if last_role == 'user' %}[[[Assistant]]]{% else %}[[[User]]]{% endif %}
{%- else -%}
[[[User]]]{% endif %}
"""

In [None]:
# Matches one "[[[User]]]" / "[[[Assistant]]]" turn: group 1 is the role tag and
# group 2 the text up to the next role tag or the end of the output.
PARSE_OUTPUT_REGEX = re.compile(r"\[\[\[(User|Assistant)\]\]\](.*?)(?=\[\[\[User\]\]\]|\[\[\[Assistant\]\]\]|$)", re.DOTALL)

@dataclass
class GenstructTask(TextGenerationTask):
    """Text-generation task that builds Genstruct prompts and parses their output."""

    @property
    def template(self) -> Template:
        """Jinja2 template compiled from ``GENSTRUCT_TEMPLATE``."""
        return Template(GENSTRUCT_TEMPLATE)

    def generate_prompt(
        self, title: str, content: str, messages: Optional[list[dict[str, str]]] = None, **_: Any
    ) -> str:
        """Render the prompt for one row.

        Args:
            title: Article title.
            content: Article text.
            messages: Previous turns, each with ``role`` and ``message`` keys,
                or ``None``/empty to start a new conversation.
            **_: Extra keyword arguments are accepted and ignored.

        Returns:
            The rendered prompt string.
        """
        render_context = {
            "title": title,
            "content": content,
            "system_prompt": self.system_prompt,
            "messages": messages,
        }
        return self.template.render(**render_context)

    @property
    def input_args_names(self) -> list[str]:
        """Names of the input columns consumed by ``generate_prompt``."""
        return ["title", "content", "messages"]

    @property
    def output_args_names(self) -> list[str]:
        """Names of the output columns produced by ``parse_output``."""
        return ["conversation"]

    def parse_output(self, output: str) -> dict[str, list[str]]:
        """Split a raw generation into the stripped text of each turn.

        When the output contains an assistant tag but does not begin with a
        user tag, a user tag is prepended so the leading text is captured as
        the first turn.

        Args:
            output: Raw text returned by the model.

        Returns:
            ``{"conversation": [...]}`` with one string per matched turn, in
            order; the list is empty when no role tag is matched.
        """
        starts_with_user = output.lstrip().startswith("[[[User]]]")
        if not starts_with_user and "[[[Assistant]]]" in output:
            output = "[[[User]]]" + output

        turns = [found.group(2).strip() for found in PARSE_OUTPUT_REGEX.finditer(output)]
        return {"conversation": turns}

In [None]:
from distilabel.pipeline import Pipeline
from distilabel.llm import vLLM

In [None]:
# Create the vLLM engine for the "gabrielmbmb/Genstruct-7B-AWQ" checkpoint.
model = LLM(model="gabrielmbmb/Genstruct-7B-AWQ")

INFO 03-19 10:05:47 llm_engine.py:87] Initializing an LLM engine with config: model='gabrielmbmb/Genstruct-7B-AWQ', tokenizer='gabrielmbmb/Genstruct-7B-AWQ', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 03-19 10:05:53 weight_utils.py:163] Using model weights format ['*.safetensors']
INFO 03-19 10:06:19 llm_engine.py:357] # GPU blocks: 2120, # CPU blocks: 2048
INFO 03-19 10:06:22 model_runner.py:684] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-19 10:06:22 model_runner.py:688] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory

In [None]:
# The system prompt is inserted by GENSTRUCT_TEMPLATE between the article content
# and the conversation turns.
genstruct_task = GenstructTask(
    system_prompt="The following is an interaction between a user that only knows Spanish and an Spanish AI assistant that is related to the above text.",
)

In [None]:
# Wrap the engine and the task in distilabel's vLLM class.
# prompt_formatting_fn is an identity function: it returns its argument unchanged.
generator = vLLM(
    model=model,
    task=genstruct_task,
    max_new_tokens=512,
    temperature=0.7,
    prompt_formatting_fn=lambda x: x
)

In [None]:
# Pipeline configured with the generator only.
pipeline = Pipeline(generator=generator)

In [None]:
# Keep only "title" and "content", and add a "messages" column holding an empty list for every row.
input_dataset = dataset.select_columns(["title", "content"]).add_column("messages", [[]] * len(dataset))

In [None]:
def generate_sft_dataset(dataset: Dataset) -> Dataset:
    """Run the module-level ``pipeline`` over ``dataset`` in batches of 50.

    Args:
        dataset: Input rows to generate from.

    Returns:
        Whatever ``pipeline.generate`` returns for ``dataset``.
    """
    # Use the dataset that was passed in rather than the global `input_dataset`,
    # so later iterations actually run on their own input.
    return pipeline.generate(dataset=dataset, batch_size=50)

In [None]:
# First generation pass, called with the input dataset whose "messages" are empty.
distiset_iter_0 = generate_sft_dataset(input_dataset)

INFO:distilabel:Executing dry-run...
INFO:distilabel:Processing batch 1 of 1...
INFO:distilabel:Calling generator for batch 1...
  prompts = self._generate_prompts(inputs, default_format=None)


Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:distilabel:Dry-run executed with no issues. Starting the actual generation...


Output()

INFO:distilabel:Processing batch 1 of 10...
INFO:distilabel:Calling generator for batch 1...


INFO:distilabel:Processing batch 2 of 10...
INFO:distilabel:Calling generator for batch 2...
INFO:distilabel:Processing batch 3 of 10...
INFO:distilabel:Calling generator for batch 3...
INFO:distilabel:Processing batch 4 of 10...
INFO:distilabel:Calling generator for batch 4...
INFO:distilabel:Processing batch 5 of 10...
INFO:distilabel:Calling generator for batch 5...
INFO:distilabel:Processing batch 6 of 10...
INFO:distilabel:Calling generator for batch 6...
INFO:distilabel:Processing batch 7 of 10...
INFO:distilabel:Calling generator for batch 7...
INFO:distilabel:Processing batch 8 of 10...
INFO:distilabel:Calling generator for batch 8...
INFO:distilabel:Processing batch 9 of 10...
INFO:distilabel:Calling generator for batch 9...
INFO:distilabel:Processing batch 10 of 10...
INFO:distilabel:Calling generator for batch 10...


Flattening the indices:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

INFO:distilabel:Checkpoint saved to disk: /content/ckpt.
INFO:distilabel:Final dataset saved at /content/ckpt


In [None]:
def process_results(row: dict) -> dict:
    """Append the first generated conversation of ``row`` to its messages.

    The turns of ``row["conversation"][0]`` are added after the existing
    ``row["messages"]``, alternating roles and starting with ``"user"``.

    Args:
        row: Mapping with ``"title"``, ``"content"``, ``"messages"`` (list of
            ``{"role", "message"}`` dicts, or ``None``) and ``"conversation"``
            (list of generations, each a list of strings, or ``None``).

    Returns:
        A dict with ``"title"``, ``"content"`` and the extended ``"messages"``.
        When there is no conversation to add, the messages are returned as is.
    """
    # Work on a copy so the list stored in `row` is not mutated in place.
    messages = list(row["messages"] or [])

    # Only the first generation is used; tolerate a missing or empty one
    # instead of raising IndexError / TypeError.
    generations = row["conversation"] or []
    first_conversation = generations[0] if generations else None

    for i, message in enumerate(first_conversation or []):
        messages.append({
            "role": "user" if i % 2 == 0 else "assistant",
            "message": message
        })

    return {
        "title": row["title"],
        "content": row["content"],
        "messages": messages
    }

In [None]:
# Apply process_results to every row of the first-pass output.
input_dataset_iter_1 = distiset_iter_0.map(process_results)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

### Subir el dataset a Argilla

In [None]:
import argilla as rg

In [None]:
# Connect the Argilla client to the workshop server.
# NOTE(review): the API key is hard-coded in the notebook — consider reading it from an environment variable or secret.
rg.init(api_url="https://gabrielmbmb-somosnlp-workshop.hf.space", api_key="owner.apikey")



In [None]:
def create_chat_html(messages):
    """Render chat messages as a sequence of HTML "bubbles".

    User messages are aligned to the right with a blue background; every other
    role is aligned to the left with a grey background.

    Args:
        messages: Iterable of dicts with a ``'role'`` and a ``'message'`` key.

    Returns:
        One HTML string with a flex ``<div>`` per message, in order; an empty
        string when ``messages`` is empty.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    bubbles = []
    for message in messages:
        is_user = message['role'] == 'user'
        align = 'right' if is_user else 'left'
        background = '#c2e3f7' if is_user else '#f5f5f5'

        # The text comes from a model, so escape &, < and > to keep it from
        # being interpreted as markup inside the bubble.
        content = escape(str(message['message']), quote=False)

        bubbles.append(
            f'<div style="display: flex; justify-content: {align}; margin: 10px;">'
            f'<div style="background-color: {background}; padding: 10px; border-radius: 10px; max-width: 70%; word-wrap: break-word;">{content}</div>'
            '</div>'
        )

    # Join once instead of growing a string with repeated +=.
    return "".join(bubbles)

In [None]:
def create_genstruct_dataset_in_argilla() -> rg.FeedbackDataset:
    """Build an empty Argilla ``FeedbackDataset`` for reviewing conversations.

    The dataset has three text fields (``title``, ``content`` and a
    markdown-enabled ``conversation``) and two questions: a 1-5 ``score``
    rating and a yes/no ``use_context`` label.

    Returns:
        The configured dataset, with no records.
    """
    title_field = rg.TextField(name="title", title="Título")
    content_field = rg.TextField(name="content", title="Contenido")
    conversation_field = rg.TextField(name="conversation", title="Conversación", use_markdown=True)

    score_question = rg.RatingQuestion(name="score", title="Puntuación", values=[1,2,3,4,5])
    context_question = rg.LabelQuestion(
        name="use_context",
        title="¿Se ha utilizado la información de contexto en la conversación generada?",
        labels={"yes": "✅ Sí", "no": "❌ No"}
    )

    return rg.FeedbackDataset(
        fields=[title_field, content_field, conversation_field],
        questions=[score_question, context_question]
    )


In [None]:
def create_argilla_dataset(dataset: Dataset) -> rg.FeedbackDataset:
    """Create an Argilla dataset with one record per row of ``dataset``.

    Args:
        dataset: Rows with ``"title"``, ``"content"`` and ``"messages"``; the
            messages are rendered to HTML with ``create_chat_html``.

    Returns:
        The ``FeedbackDataset`` from ``create_genstruct_dataset_in_argilla``
        with all records added.
    """
    feedback_dataset = create_genstruct_dataset_in_argilla()
    feedback_records = [
        rg.FeedbackRecord(
            fields={
                "title": row["title"],
                "content": row["content"],
                "conversation": create_chat_html(row["messages"])
            }
        )
        for row in dataset
    ]
    feedback_dataset.add_records(feedback_records)
    return feedback_dataset

In [None]:
# Build the Argilla dataset from the processed first-pass rows.
rg_dataset_iter_0 = create_argilla_dataset(input_dataset_iter_1)

In [None]:
# Upload the dataset to the "admin" workspace under the name "wikipedia_genstruct_iter_0".
rg_dataset_iter_0.push_to_argilla(name="wikipedia_genstruct_iter_0", workspace="admin")

Output()

INFO:argilla.client.feedback.dataset.local.mixins:✓ Dataset succesfully pushed to Argilla
INFO:argilla.client.feedback.dataset.local.mixins:RemoteFeedbackDataset(
   id=92018a4b-bddb-44ec-a573-a09708cb3218
   name=wikipedia_genstruct_iter_0
   workspace=Workspace(id=d9d35e36-08d1-47fd-b8b7-2aacb01da517, name=admin, inserted_at=2024-03-19 08:53:36.809218, updated_at=2024-03-19 08:53:36.809218)
   url=https://gabrielmbmb-somosnlp-workshop.hf.space/dataset/92018a4b-bddb-44ec-a573-a09708cb3218/annotation-mode
   fields=[RemoteTextField(id=UUID('ca52aefc-49f4-4b82-ac7d-7138ab285090'), client=None, name='title', title='Título', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('6d175920-5756-4b1d-bb49-bf77a3c2798a'), client=None, name='content', title='Contenido', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('55633996-d015-48a4-b5df-4016b11c00e6'), client=None, name='conversation', title='Conversación', required=True, type='text', use_markdo

RemoteFeedbackDataset(
   id=92018a4b-bddb-44ec-a573-a09708cb3218
   name=wikipedia_genstruct_iter_0
   workspace=Workspace(id=d9d35e36-08d1-47fd-b8b7-2aacb01da517, name=admin, inserted_at=2024-03-19 08:53:36.809218, updated_at=2024-03-19 08:53:36.809218)
   url=https://gabrielmbmb-somosnlp-workshop.hf.space/dataset/92018a4b-bddb-44ec-a573-a09708cb3218/annotation-mode
   fields=[RemoteTextField(id=UUID('ca52aefc-49f4-4b82-ac7d-7138ab285090'), client=None, name='title', title='Título', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('6d175920-5756-4b1d-bb49-bf77a3c2798a'), client=None, name='content', title='Contenido', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('55633996-d015-48a4-b5df-4016b11c00e6'), client=None, name='conversation', title='Conversación', required=True, type='text', use_markdown=True)]
   questions=[RemoteRatingQuestion(id=UUID('bb5125b4-c3f7-4199-a206-858ec882d132'), client=None, name='score', title='Puntuación',

### Segunda iteración para conversaciones más largas

Utilizando la lista de mensajes generados en la primera ejecución, crearemos un nuevo dataset repitiendo el proceso e incluyendo dichos mensajes para generar una conversación con varios turnos, que nos puede servir para hacer fine-tuning de una LLM que funcione mejor en chats.

In [None]:
# Second generation pass, called with the rows that carry the first-pass messages.
distiset_iter_1 = generate_sft_dataset(input_dataset_iter_1)

INFO:distilabel:Executing dry-run...
INFO:distilabel:Processing batch 1 of 1...
INFO:distilabel:Calling generator for batch 1...
  prompts = self._generate_prompts(inputs, default_format=None)


Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:distilabel:Dry-run executed with no issues. Starting the actual generation...
INFO:distilabel:Processing batch 1 of 10...
INFO:distilabel:Calling generator for batch 1...


INFO:distilabel:Processing batch 2 of 10...
INFO:distilabel:Calling generator for batch 2...
INFO:distilabel:Processing batch 3 of 10...
INFO:distilabel:Calling generator for batch 3...
INFO:distilabel:Processing batch 4 of 10...
INFO:distilabel:Calling generator for batch 4...
INFO:distilabel:Processing batch 5 of 10...
INFO:distilabel:Calling generator for batch 5...
INFO:distilabel:Processing batch 6 of 10...
INFO:distilabel:Calling generator for batch 6...
INFO:distilabel:Processing batch 7 of 10...
INFO:distilabel:Calling generator for batch 7...
INFO:distilabel:Processing batch 8 of 10...
INFO:distilabel:Calling generator for batch 8...
INFO:distilabel:Processing batch 9 of 10...
INFO:distilabel:Calling generator for batch 9...
INFO:distilabel:Processing batch 10 of 10...
INFO:distilabel:Calling generator for batch 10...


Flattening the indices:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

INFO:distilabel:Checkpoint saved to disk: /content/ckpt.
INFO:distilabel:Final dataset saved at /content/ckpt


In [None]:
# Apply process_results to every row of the second-pass output.
input_dataset_iter_2 = distiset_iter_1.map(process_results)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Use the processed rows (whose "messages" include the newly generated turns),
# as done for the first iteration, rather than the raw pipeline output.
rg_dataset_iter_1 = create_argilla_dataset(input_dataset_iter_2)

In [None]:
# Upload the dataset to the "admin" workspace under the name "wikipedia_genstruct_iter_1".
rg_dataset_iter_1.push_to_argilla(name="wikipedia_genstruct_iter_1", workspace="admin")

Output()

INFO:argilla.client.feedback.dataset.local.mixins:✓ Dataset succesfully pushed to Argilla
INFO:argilla.client.feedback.dataset.local.mixins:RemoteFeedbackDataset(
   id=ac3cefcb-1127-4dd8-9994-deb29769f750
   name=wikipedia_genstruct_iter_1
   workspace=Workspace(id=d9d35e36-08d1-47fd-b8b7-2aacb01da517, name=admin, inserted_at=2024-03-19 08:53:36.809218, updated_at=2024-03-19 08:53:36.809218)
   url=https://gabrielmbmb-somosnlp-workshop.hf.space/dataset/ac3cefcb-1127-4dd8-9994-deb29769f750/annotation-mode
   fields=[RemoteTextField(id=UUID('972263e7-bc3c-4d51-adb9-79bd8e0795d2'), client=None, name='title', title='Título', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('8609bd0d-c9f3-4664-a7ac-2411053424e7'), client=None, name='content', title='Contenido', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('712d3f51-3e25-4640-b85d-42809e66a5af'), client=None, name='conversation', title='Conversación', required=True, type='text', use_markdo

RemoteFeedbackDataset(
   id=ac3cefcb-1127-4dd8-9994-deb29769f750
   name=wikipedia_genstruct_iter_1
   workspace=Workspace(id=d9d35e36-08d1-47fd-b8b7-2aacb01da517, name=admin, inserted_at=2024-03-19 08:53:36.809218, updated_at=2024-03-19 08:53:36.809218)
   url=https://gabrielmbmb-somosnlp-workshop.hf.space/dataset/ac3cefcb-1127-4dd8-9994-deb29769f750/annotation-mode
   fields=[RemoteTextField(id=UUID('972263e7-bc3c-4d51-adb9-79bd8e0795d2'), client=None, name='title', title='Título', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('8609bd0d-c9f3-4664-a7ac-2411053424e7'), client=None, name='content', title='Contenido', required=True, type='text', use_markdown=False), RemoteTextField(id=UUID('712d3f51-3e25-4640-b85d-42809e66a5af'), client=None, name='conversation', title='Conversación', required=True, type='text', use_markdown=True)]
   questions=[RemoteRatingQuestion(id=UUID('e02f1b1a-7376-4879-865a-935292ac6694'), client=None, name='score', title='Puntuación',

## Dataset para DPO

En esta sección, de nuevo utilizando `distilabel`, generaremos un dataset que se puede utilizar para hacer fine-tune de una LLM mediante DPO. Para ello, primero generaremos un dataset con varias respuestas para la misma instrucción, y después utilizando una LLM más potente y la tarea de `UltraFeedback`, le pediremos que nos valore las respuestas en base a la instrucción.

In [None]:
from distilabel.llm import InferenceEndpointsLLM
from distilabel.tasks import UltraFeedbackTask

In [None]:
# Take the first 10 rows of the input dataset.
input_dataset_for_dpo = input_dataset.select(range(10))

In [None]:
# Request 3 generations per row, with a batch size of 10.
distiset_responses = pipeline.generate(
    dataset=input_dataset_for_dpo,
    batch_size=10,
    num_generations=3
)

INFO:distilabel:Executing dry-run...
INFO:distilabel:Processing batch 1 of 1...
INFO:distilabel:Calling generator for batch 1...
  prompts = self._generate_prompts(inputs, default_format=None)


Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:distilabel:Dry-run executed with no issues. Starting the actual generation...
INFO:distilabel:Processing batch 1 of 1...
INFO:distilabel:Calling generator for batch 1...


Flattening the indices:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

INFO:distilabel:Checkpoint saved to disk: /content/ckpt.
INFO:distilabel:Final dataset saved at /content/ckpt


In [None]:
def process_dataset_for_ultrafeedback(row: dict) -> dict:
    """Turn the generated conversations of ``row`` into an input/generations pair.

    Only conversations with exactly two messages (instruction, response) are
    used. All their responses are collected, while the instruction is taken
    from the last such conversation.

    Args:
        row: Mapping whose ``"conversation"`` is a list of conversations, each
            a sequence of messages.

    Returns:
        ``{"input": instruction, "generations": [...]}``; the instruction is
        ``""`` and the list is empty when no conversation has two messages.
    """
    pairs = [turns for turns in row["conversation"] if len(turns) == 2]
    # NOTE(review): each pair may carry a different instruction; the last one wins.
    last_instruction = pairs[-1][0] if pairs else ""
    responses = [response for _, response in pairs]

    return {
        "input": last_instruction,
        "generations": responses,
    }

In [None]:
# Apply process_dataset_for_ultrafeedback to every row, adding "input" and "generations".
input_dataset_for_ultrafeedback = distiset_responses.map(process_dataset_for_ultrafeedback)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# UltraFeedback task built with the for_overall_quality() constructor.
ultrafeedback_task = UltraFeedbackTask.for_overall_quality()

In [None]:
# Labelling LLM: Mixtral-8x7B-Instruct reached through InferenceEndpointsLLM, running the UltraFeedback task.
inference_endpoints_llm = InferenceEndpointsLLM(
    endpoint_name_or_model_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    max_new_tokens=2048,
    task=ultrafeedback_task
)

INFO:distilabel:Using Serverless Inference Endpoint


In [None]:
# NOTE: this rebinds the global name `pipeline`; from here on it is the labelling
# pipeline, so generate_sft_dataset (which reads the global) would use it too.
pipeline = Pipeline(labeller=inference_endpoints_llm)

In [None]:
# Run the labelling pipeline over the instruction/generations rows.
dpo_dataset = pipeline.generate(dataset=input_dataset_for_ultrafeedback)

INFO:distilabel:Executing dry-run...
INFO:distilabel:Processing batch 1 of 1...
INFO:distilabel:Calling labeller for batch 1...
  prompts = self._generate_prompts(inputs, default_format=None)


Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

INFO:distilabel:Dry-run executed with no issues. Starting the actual generation...
INFO:distilabel:Processing batch 1 of 10...
INFO:distilabel:Calling labeller for batch 1...


INFO:distilabel:Processing batch 2 of 10...
INFO:distilabel:Calling labeller for batch 2...
INFO:distilabel:Processing batch 3 of 10...
INFO:distilabel:Calling labeller for batch 3...
INFO:distilabel:Processing batch 4 of 10...
INFO:distilabel:Calling labeller for batch 4...
INFO:distilabel:Processing batch 5 of 10...
INFO:distilabel:Calling labeller for batch 5...
INFO:distilabel:Processing batch 6 of 10...
INFO:distilabel:Calling labeller for batch 6...
INFO:distilabel:Processing batch 7 of 10...
INFO:distilabel:Calling labeller for batch 7...
INFO:distilabel:Processing batch 8 of 10...
INFO:distilabel:Calling labeller for batch 8...
INFO:distilabel:Processing batch 9 of 10...
INFO:distilabel:Calling labeller for batch 9...
INFO:distilabel:Processing batch 10 of 10...
INFO:distilabel:Calling labeller for batch 10...


Flattening the indices:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

INFO:distilabel:Checkpoint saved to disk: /content/ckpt.
INFO:distilabel:Final dataset saved at /content/ckpt


In [None]:
# Upload the labelled dataset to the Hugging Face Hub repository "gabrielmbmb/wikipedia_genstruct_dpo".
dpo_dataset.push_to_hub("gabrielmbmb/wikipedia_genstruct_dpo")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/855 [00:00<?, ?B/s]

INFO:distilabel:Pushing task to the hub...


In [None]:
# Convert the labelled dataset to its Argilla representation.
rg_argilla = dpo_dataset.to_argilla()

generations-3
  field required (type=value_error.missing)
  rg_argilla = dpo_dataset.to_argilla()


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/65.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/745 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/34.8M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[38;5;4mℹ The specified spaCy model "en_core_web_md" was not              found
on disk. Downloading...[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Output()

In [None]:
# Upload the dataset to the "admin" workspace under the name "wikipedia_genstruct_dpo".
rg_argilla.push_to_argilla(name="wikipedia_genstruct_dpo", workspace="admin")

Output()

INFO:argilla.client.feedback.dataset.local.mixins:✓ Dataset succesfully pushed to Argilla
INFO:argilla.client.feedback.dataset.local.mixins:RemoteFeedbackDataset(
   id=4fd461ca-99df-4ccd-b693-8eea21bda2da
   name=wikipedia_genstruct_dpo
   workspace=Workspace(id=d9d35e36-08d1-47fd-b8b7-2aacb01da517, name=admin, inserted_at=2024-03-19 08:53:36.809218, updated_at=2024-03-19 08:53:36.809218)
   url=https://gabrielmbmb-somosnlp-workshop.hf.space/dataset/4fd461ca-99df-4ccd-b693-8eea21bda2da/annotation-mode
   fields=[RemoteTextField(id=UUID('f2a72635-b387-4e07-8412-fd145c157899'), client=None, name='input', title='input', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('24653197-c836-462b-81b9-5c4e212c793f'), client=None, name='generations-1', title='generations-1', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('96bdfaa2-5cf8-471c-9854-257c224bd14c'), client=None, name='generations-2', title='generations-2', required=True, type='text', use_

RemoteFeedbackDataset(
   id=4fd461ca-99df-4ccd-b693-8eea21bda2da
   name=wikipedia_genstruct_dpo
   workspace=Workspace(id=d9d35e36-08d1-47fd-b8b7-2aacb01da517, name=admin, inserted_at=2024-03-19 08:53:36.809218, updated_at=2024-03-19 08:53:36.809218)
   url=https://gabrielmbmb-somosnlp-workshop.hf.space/dataset/4fd461ca-99df-4ccd-b693-8eea21bda2da/annotation-mode
   fields=[RemoteTextField(id=UUID('f2a72635-b387-4e07-8412-fd145c157899'), client=None, name='input', title='input', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('24653197-c836-462b-81b9-5c4e212c793f'), client=None, name='generations-1', title='generations-1', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('96bdfaa2-5cf8-471c-9854-257c224bd14c'), client=None, name='generations-2', title='generations-2', required=True, type='text', use_markdown=True), RemoteTextField(id=UUID('a66164ef-7a7e-44e5-ac74-e9ce59436c18'), client=None, name='generations-3', title='generations-3', r