In [18]:
from distilabel.llms import TransformersLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadHubDataset
from distilabel.steps.tasks import TextGeneration
import torch

# Two-step pipeline: a Hub dataset loader connected to a text-generation task
# whose LLM is a Transformers model read from a local directory.
with Pipeline(
    name="simple-text-generation-pipeline",
    description="A simple text generation pipeline",
) as pipeline:
    # The loader's "text" output column is renamed to "instruction" before it
    # is handed to the next step.
    load_dataset = LoadHubDataset(
        name="load_dataset",
        output_mappings={"text": "instruction"},
    )

    # NOTE(review): the model path is relative, so it resolves against the
    # working directory of whatever process loads the model — confirm.
    local_llm = TransformersLLM(
        model="../../models/",
        device="cuda:0",
        torch_dtype="bfloat16",
    )
    generate_with_hf = TextGeneration(name="generate_with_llama3-8b", llm=local_llm)

    load_dataset.connect(generate_with_hf)

if __name__ == "__main__":
    # Runtime parameters are grouped by the `name` given to each step when the
    # pipeline was declared.
    sampling_options = {
        "temperature": 0.4,
        "max_new_tokens": 1024,
    }
    runtime_parameters = {
        "load_dataset": {
            "repo_id": "skvarre/BibblanSvarar-Questions",
            "split": "train",
        },
        "generate_with_llama3-8b": {
            "llm": {"generation_kwargs": sampling_options},
        },
    }

    distiset = pipeline.run(parameters=runtime_parameters)

In [None]:
import os

from distilabel.pipeline import Pipeline
from distilabel.llms import TransformersLLM
from distilabel.steps.tasks.text_generation import TextGeneration
from distilabel.steps.tasks.self_instruct import SelfInstruct

# Stand-alone SelfInstruct task backed by a Transformers LLM on the first GPU.
self_instruct_llm = TransformersLLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    device="cuda:0",
)

# The task is attached to a throwaway pipeline object because it is being
# exercised directly rather than from inside a `with Pipeline(...)` block.
text_generation = SelfInstruct(
    name="text-generation",
    llm=self_instruct_llm,
    num_instructions=5,
    input_batch_size=8,
    pipeline=Pipeline(name="self-instruct-pipeline"),
)

# Outside of a Pipeline context nothing calls .load() for us, so do it by hand
# before the first call to .process().
text_generation.load()


In [None]:
# Send a one-row batch through the task and keep only the first item that
# `process()` produces.
seed_rows = [{"input": "Write questions in Swedish."}]
result = next(text_generation.process(seed_rows))

In [None]:
# Show the first item obtained from `text_generation.process(...)` in the
# previous cell.
print(result)


In [1]:
from datasets import load_dataset

# Directory holding the files written for the "generate_with_llama3-8b" step of
# the "simple-text-generation-pipeline" run.
# NOTE(review): absolute, machine-specific path (user home and run hash are
# hard-coded) — this cell will not work unchanged on another machine.
generation_output_dir = (
    "/home/edgelab/.cache/distilabel/pipelines/simple-text-generation-pipeline/"
    "60382e526244ed9e97c581db9d7a05192538ec5f/data/generate_with_llama3-8b"
)
dataset = load_dataset(generation_output_dir)

Resolving data files:   0%|          | 0/82 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Select the "train" split of the dataset loaded above.
data = dataset["train"]

# Display the "generation" column as the cell result.
# NOTE(review): every value shown in the captured output is None — presumably
# the generation step produced no text for these rows; check the pipeline run
# logs before relying on this data.
data["generation"]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,