# Project 2: Generating Data For Preference Tuning

This guide builds upon the week 1 notebooks for [getting started](https://colab.research.google.com/drive/1oyFm3hftLneEQlUK19SELlkvUguCFrw_?usp=sharing) and [the project](https://colab.research.google.com/drive/1mje_YUaKGFpgmrACwpFLCyqWuIMN76Pe?usp=sharing). Feel free to take a look at those notebooks to refresh your knowledge.

First, install the required dependencies.

In [None]:
!pip install "distilabel[hf-inference-endpoints]" "huggingface-hub" "textdescriptives" "gliclass" "numpy==1.26.4" -U -q

Next, we can log in to the Hugging Face Hub and configure our token through the login method.

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub; no token is passed
# explicitly here.
login()

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Colab variant of the login step: the token is looked up under the
# 'HF_TOKEN' key via `userdata` and handed to `login` explicitly.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

## 1. Generate a dataset for preference tuning

### Basic dataset

We can generate data with two models of different sizes, assuming that the bigger model produces the better responses.

In [None]:
from distilabel.models.llms.huggingface import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration

# Basic preference pipeline: one seed prompt is turned into an instruction,
# which is then answered twice to obtain a "rejected" and a "chosen" response.
with Pipeline() as pipeline:
    # Single seed row; its "instruction" column is the prompt for the first step.
    data = LoadDataFromDicts(
        data=[{"instruction": "Generate a short question about data science."}]
    )
    # Smaller (1.5B) model, assumed to give the weaker answers.
    llm_worse = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
    )
    # Larger (3B) model, assumed to give the stronger answers.
    llm_better = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct"
    )
    # Overwrites "instruction" with the generated question.
    gen_a = TextGeneration(llm=llm_worse, output_mappings={"generation": "instruction"})
    # The rejected answer has to come from the weaker model; previously both
    # "rejected" and "chosen" were generated by `llm_better`, so the pair
    # carried no size-based preference signal.
    gen_b = TextGeneration(llm=llm_worse, output_mappings={"generation": "rejected"})
    gen_c = TextGeneration(llm=llm_better, output_mappings={"generation": "chosen"})
    data >> gen_a >> gen_b >> gen_c

if __name__ == "__main__":
    username = "uplimit"
    # use_cache=False: always run the pipeline instead of reusing cached results.
    distiset = pipeline.run(use_cache=False)
    distiset.push_to_hub(f"{username}/uplimit-synthetic-data-week-2-basic")

### With response evolution

We can evolve the responses, where an evolved response is likely of better quality and therefore the preferred answer.

In [None]:
from distilabel.models.llms.huggingface import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import EvolQuality, TextGeneration

# Preference pairs via response evolution: the first draft becomes "rejected"
# and its evolved rewrite becomes "chosen".
with Pipeline() as pipeline:
    # A single model both drafts the response and evolves it.
    shared_llm = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct",
    )
    seed_rows = [{"instruction": "Generate a short question about data science."}]
    load_seed = LoadDataFromDicts(data=seed_rows)
    # The draft generation is stored in the "rejected" column.
    draft_step = TextGeneration(
        llm=shared_llm,
        output_mappings={"generation": "rejected"},
    )
    # EvolQuality takes the "rejected" column as its "response" input and
    # writes its "evolved_response" output to the "chosen" column.
    evolve_step = EvolQuality(
        llm=shared_llm,
        num_evolutions=1,
        input_mappings={"response": "rejected"},
        output_mappings={"evolved_response": "chosen"},
    )
    load_seed >> draft_step >> evolve_step

if __name__ == "__main__":
    username = "uplimit"
    target_repo = f"{username}/uplimit-synthetic-data-week-2-with-evol"
    distiset = pipeline.run(use_cache=False)
    distiset.push_to_hub(target_repo, include_script=True)

### With critiques

We can now also generate responses using different models and then evaluate them using an LLM as a judge. Good judges can be found in [allenai/reward-bench](https://huggingface.co/spaces/allenai/reward-bench) or the [AtlaAI/judge-arena](https://huggingface.co/spaces/AtlaAI/judge-arena).

In [None]:
from distilabel.models.llms.huggingface import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import GroupColumns, LoadDataFromDicts
from distilabel.steps.tasks import TextGeneration, UltraFeedback

# Critique pipeline: generate an instruction, answer it with two different
# models, group both answers into one row and let a judge model rate them.
with Pipeline() as pipeline:
    seed = LoadDataFromDicts(
        data=[{"instruction": "Generate a short question about data science."}]
    )

    # Endpoints: two answer generators and one judge.
    llama_llm = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct"
    )
    qwen_llm = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/Qwen/Qwen2.5-1.5B-Instruct"
    )
    judge_llm = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
    )

    # The first step replaces "instruction" with the generated question; the
    # next two answer it and keep the default "generation" output column.
    make_instruction = TextGeneration(
        llm=llama_llm,
        output_mappings={"generation": "instruction"},
    )
    answer_llama = TextGeneration(llm=llama_llm)
    answer_qwen = TextGeneration(llm=qwen_llm)

    # Merge the parallel branches: "generation"/"model_name" from each branch
    # are collected into "generations"/"generation_models".
    merge_answers = GroupColumns(
        name="combine_columns",
        columns=["generation", "model_name"],
        output_columns=["generations", "generation_models"],
        input_batch_size=2,
    )
    rate_answers = UltraFeedback(llm=judge_llm, aspect="overall-rating")

    seed >> make_instruction >> [answer_llama, answer_qwen] >> merge_answers >> rate_answers

if __name__ == "__main__":
    username = "uplimit"
    target_repo = f"{username}/uplimit-synthetic-data-week-2-critique"
    distiset = pipeline.run(use_cache=False)
    distiset.push_to_hub(target_repo)

### With multi-turn

In [None]:
from distilabel.models.llms.huggingface import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps.tasks import ChatGeneration, MagpieGenerator

# Multi-turn preference pipeline: Magpie produces a conversation that ends on
# a user turn, which is then completed by a smaller ("rejected") and a larger
# ("chosen") model.
with Pipeline() as pipeline:
    # Conversation generator, configured with the llama3 Magpie template.
    magpie_llm = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
        tokenizer_id="meta-llama/Llama-3.3-70B-Instruct",
        magpie_pre_query_template="llama3",
        use_magpie_template=True,
        generation_kwargs={"max_new_tokens": 2000},
    )
    # 3B model for the rejected completion, 70B model for the chosen one.
    small_llm = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-3B-Instruct",
        generation_kwargs={"max_new_tokens": 2000},
    )
    large_llm = InferenceEndpointsLLM(
        base_url="https://api-inference.huggingface.co/models/meta-llama/Llama-3.3-70B-Instruct",
        generation_kwargs={"max_new_tokens": 2000},
    )

    make_conversation = MagpieGenerator(
        llm=magpie_llm,
        end_with_user=True,
        num_rows=1,
        n_turns=2,
    )
    # Both chat steps read the "conversation" column as their "messages" input.
    reply_rejected = ChatGeneration(
        llm=small_llm,
        input_mappings={"messages": "conversation"},
        output_mappings={"generation": "rejected"},
    )
    reply_chosen = ChatGeneration(
        llm=large_llm,
        input_mappings={"messages": "conversation"},
        output_mappings={"generation": "chosen"},
    )
    make_conversation >> reply_rejected >> reply_chosen

if __name__ == "__main__":
    target_repo = "uplimit/uplimit-synthetic-data-week-2-with-multi-turn"
    distiset = pipeline.run(use_cache=False)
    distiset.push_to_hub(target_repo, include_script=True)

## 2. Explore and evaluate the generated data

We will now show some basic data exploration on top of the [argilla/distilabel-intel-orca-dpo-pairs](https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs) dataset. This dataset had some original preference pairs and distilabel was used to provide new ratings and evaluate the correctness of the provided pairs.


In [None]:
from datasets import load_dataset

# Load the preference-pair dataset from the Hugging Face Hub.
dataset = load_dataset("argilla/distilabel-intel-orca-dpo-pairs")
# Bare expression so the notebook displays the dataset summary.
dataset

README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/79.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12859 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['system', 'input', 'chosen', 'rejected', 'generations', 'order', 'labelling_model', 'labelling_prompt', 'raw_labelling_response', 'rating', 'rationale', 'status', 'original_chosen', 'original_rejected', 'chosen_score', 'in_gsm8k_train'],
        num_rows: 12859
    })
})

We know that there is a column that includes a rating, which we can use as a simple quality filter, so let's focus on all examples where the `chosen` rating is at least 8.

In [None]:
# Keep only rows whose `chosen_score` is present and at least 8.
dataset_quality = dataset.filter(
    lambda r: r["chosen_score"] is not None and r["chosen_score"] >= 8
)
# Bare expression so the notebook displays the filtered dataset summary.
dataset_quality

Filter:   0%|          | 0/12859 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['system', 'input', 'chosen', 'rejected', 'generations', 'order', 'labelling_model', 'labelling_prompt', 'raw_labelling_response', 'rating', 'rationale', 'status', 'original_chosen', 'original_rejected', 'chosen_score', 'in_gsm8k_train'],
        num_rows: 9499
    })
})

Similarly, we can focus on examples where the difference between chosen and rejected scores is large enough so the model can distinguish between them, while ensuring the difference is not too big.

In [None]:
# Keep rows whose ratings are present and whose spread (max - min) lies
# strictly between 1 and 4. The chained comparison computes the spread once.
dataset_quality_diff = dataset_quality.filter(
    lambda r: r["rating"] is not None
    and 1 < max(r["rating"]) - min(r["rating"]) < 4
)
# Bare expression so the notebook displays the filtered dataset summary.
dataset_quality_diff

Filter:   0%|          | 0/9499 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['system', 'input', 'chosen', 'rejected', 'generations', 'order', 'labelling_model', 'labelling_prompt', 'raw_labelling_response', 'rating', 'rationale', 'status', 'original_chosen', 'original_rejected', 'chosen_score', 'in_gsm8k_train'],
        num_rows: 1468
    })
})

Great, now that we have a good starting subset based on our ratings, we can continue to analyse the quality using some additional metrics. We will be using [`text-descriptives`](https://github.com/HLasse/TextDescriptives) to get some quality metrics for our texts. Specifically, we will focus on `flesch_reading_ease`, which indicates how easy a text is to read, where higher scores indicate easier reading.

In [None]:
import textdescriptives as td


def process_quality(batch):
    """Attach a ``flesch_reading_ease`` column to a batch.

    The "readability" metrics are extracted with textdescriptives from the
    batch's ``chosen`` texts (``lang="en"``), and the ``flesch_reading_ease``
    values of the result are stored on the batch under the same name.

    Args:
        batch: Mapping with a ``chosen`` entry holding the texts to score.

    Returns:
        The same ``batch`` object, with ``flesch_reading_ease`` added.
    """
    readability = td.extract_metrics(
        text=batch["chosen"], lang="en", metrics=["readability"]
    )
    flesch_scores = readability["flesch_reading_ease"].values
    batch["flesch_reading_ease"] = flesch_scores
    return batch


# Apply `process_quality` batch-wise (batched=True) to add the readability column.
dataset_quality_diff_reading = dataset_quality_diff.map(
    process_quality, batched=True
)

Next, we will also add a general domain classification using [gliclass](https://github.com/Knowledgator/GLiClass), which is an optimised zero-shot classification model. To further speed up this process, we will use a [ModernBERT architecture model](https://huggingface.co/knowledgator/gliclass-modern-base-v2.0-init), which is around 4x quicker on CPU.

In [None]:
import torch
from gliclass import GLiClassModel, ZeroShotClassificationPipeline
from transformers import AutoTokenizer

# Model and tokenizer are loaded from the same Hub repository.
repo_id = "knowledgator/gliclass-modern-base-v2.0-init"
model = GLiClassModel.from_pretrained(repo_id)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Prefer the GPU, but fall back to the CPU so the cell also runs on
# runtimes without CUDA instead of failing.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: from here on `pipeline` refers to this classifier, not to the
# distilabel pipelines defined earlier in the notebook.
pipeline = ZeroShotClassificationPipeline(
    model,
    tokenizer,
    device=device,
)

We then define our labels and run our batched inference. We do recommend using GPUs for running this part of the pipeline.

In [None]:
# Candidate domain labels passed to the zero-shot classifier; each input is
# assigned the single label with the highest score.
labels = [
    "business and industrial",
    "books and literature",
    "home and garden",
    "adult",
    "science",
    "food and drink",
    "real estate",
    "news",
    "jobs and education",
    "health",
    "hobbies and leisure",
    "games",
    "beauty and fitness",
    "computers and electronics",
    "arts and entertainment",
    "travel and transportation",
    "finance",
    "law and government",
    "sensitive subjects",
    "autos and vehicles",
    "people and society",
    "sports",
    "shopping",
    "online communities",
    "pets and animals",
    "internet and telecom",
]

def process_domains(batch):
    """Attach a ``domain`` column holding the best-scoring label per input.

    The module-level classifier ``pipeline`` is called on the batch's
    ``input`` texts with the candidate ``labels`` and ``threshold=0``
    (presumably so that no label is filtered out; confirm against the
    gliclass documentation). For each text the label with the highest
    ``score`` is kept; a text with an empty result gets ``None``.

    Args:
        batch: Mapping with an ``input`` entry holding the texts to classify.

    Returns:
        The same ``batch`` object, with ``domain`` added.
    """
    predictions = pipeline(batch["input"], labels, threshold=0)
    domains = []
    for scored_labels in predictions:
        if scored_labels:
            best = max(scored_labels, key=lambda item: item['score'])
            domains.append(best["label"])
        else:
            domains.append(None)
    batch["domain"] = domains
    return batch

# Apply `process_domains` batch-wise (batched=True) to add the domain column.
dataset_quality_diff_reading_domain = dataset_quality_diff_reading.map(
    process_domains, batched=True
)
# Bare expression so the notebook displays the resulting dataset summary.
dataset_quality_diff_reading_domain

## 3. Publish the dataset on Hugging Face

Publish the dataset on the Hugging Face Hub. Make sure you update the ModelCard with all relevant information. Additionally, link the exploratory data analysis and preferably also add an analysis script.

In [None]:
from datasets import Dataset

# Upload the filtered dataset (with the added readability and domain columns)
# to the Hub under this repository id.
dataset_quality_diff_reading_domain.push_to_hub("uplimit/uplimit-synthetic-data-week-2-filtered")

Now, we can go to our dataset page and update the ModelCard with all relevant information. Additionally, we can add an analysis script. A good example for this is the original card attached to [argilla/distilabel-intel-orca-dpo-pairs](https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs).