In [18]:
import kfp
import os
import requests

from kfp.dsl import Input, Model, component, Dataset, Output, Artifact
from kfp.dsl import InputPath, OutputPath, pipeline, component, PipelineTask
from kfp.components import load_component_from_file

In [19]:
@component(
    base_image="python:3.11",
    packages_to_install=["accelerate", "transformers[torch]", "transformers", "datasets", "huggingface_hub"]
)
def download_model_hf(model_name: str, dataset_name: str, model_archive: Output[Artifact], data_archive: Output[Artifact]) -> None:
    """Fetch a causal LM and a dataset, tokenize the dataset, and store both as artifacts.

    The model and its tokenizer are written to ``model_archive.path``. The
    dataset is turned into ``prompt``/``completion`` text, tokenized, given a
    ``labels`` column, and written to ``data_archive.path``.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for the model and tokenizer.
        dataset_name: Identifier passed to ``datasets.load_dataset``. Every example
            must provide the ``schema``, ``question`` and ``cypher`` fields.
        model_archive: Output artifact receiving the saved model and tokenizer.
        data_archive: Output artifact receiving the tokenized dataset.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from datasets import load_dataset

    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Without an end-of-sequence marker after the completion the fine-tuned
    # model gets no signal for where an answer ends. Fall back to an empty
    # string when the tokenizer defines no EOS token.
    eos_text = tokenizer.eos_token or ""

    # Format: turn (question + schema) -> cypher
    def data_formatter(example):
        prompt = f"## Schema:\n{example['schema']}\n\n## Question:\n{example['question']}\n\nCypher:\n"
        return {
            "prompt": prompt,
            "completion": example["cypher"]
        }

    # Tokenize prompt + completion (+ EOS) as one sequence. Padding is left to
    # batching time; sequences longer than 512 tokens are truncated.
    def tokenize_function(example):
        return tokenizer(
            example["prompt"] + example["completion"] + eos_text,
            truncation=True,
            padding=False,
            max_length=512
        )

    # Ensure labels = input_ids (common for causal LM). Copy the list so the
    # two columns never share one object.
    def format_for_training(example):
        example["labels"] = list(example["input_ids"])
        return example

    model.save_pretrained(model_archive.path)
    tokenizer.save_pretrained(model_archive.path)

    dataset = load_dataset(dataset_name)
    dataset = dataset.map(data_formatter)
    dataset = dataset.map(tokenize_function)
    dataset = dataset.map(format_for_training)

    dataset.save_to_disk(data_archive.path)

In [20]:
@component(
    base_image="python:3.11",
    packages_to_install=["accelerate", "transformers[torch]", "transformers", "datasets", "huggingface_hub", "peft", "evaluate"]
)
def train_model(model_archive: Input[Artifact], data_archive: Input[Artifact], trained_model_archive: Output[Artifact]) -> None:
    """Fine-tune the archived causal LM with LoRA and save the result.

    Args:
        model_archive: Input artifact holding a saved model. A tokenizer is also
            loaded from this path, so it must have been saved alongside the model.
        data_archive: Input artifact holding a dataset saved with ``save_to_disk``
            that has ``train`` and ``test`` splits.
        trained_model_archive: Output artifact used as the trainer output
            directory and as the destination of the final model and tokenizer.
    """
    import torch
    from datasets import load_from_disk
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        DataCollatorForSeq2Seq,
        Trainer,
        TrainingArguments,
    )
    from peft import get_peft_model, LoraConfig, TaskType

    model = AutoModelForCausalLM.from_pretrained(model_archive.path)

    # The tokenizer is needed to pad batches; some causal-LM tokenizers define
    # no pad token, in which case the EOS token is reused for padding.
    tokenizer = AutoTokenizer.from_pretrained(model_archive.path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    dataset = load_from_disk(data_archive.path)
    dataset_train = dataset["train"].shuffle(seed=47)
    dataset_test = dataset["test"].shuffle(seed=47)

    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],  # adjust based on your model
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    training_args = TrainingArguments(
        output_dir=trained_model_archive.path,
        eval_strategy="epoch",
        push_to_hub=False,
        # fp16 mixed precision is rejected when no CUDA device is present, so
        # only request it when one is available.
        fp16=torch.cuda.is_available(),
    )

    # Examples are expected to be unpadded and of different lengths, which the
    # default collator cannot stack into a batch. This collator pads the inputs
    # with the pad token and pads labels with -100 so padding is ignored by the loss.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        label_pad_token_id=-100,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_test,
        data_collator=data_collator,
    )

    trainer.train()

    # Persist the final weights explicitly: training alone only leaves periodic
    # checkpoints in the output directory, and none at all on a short run.
    trainer.save_model(trained_model_archive.path)
    tokenizer.save_pretrained(trained_model_archive.path)

In [21]:
@pipeline(name='finetune-pipeline')
def finetune_pipeline(model_name: str, dataset_name: str) -> None:
    """Download a model and dataset, then fine-tune the model on that dataset.

    Args:
        model_name: Forwarded to the download step as the model identifier.
        dataset_name: Forwarded to the download step as the dataset identifier.
    """
    # Step 1: produces the 'model_archive' and 'data_archive' artifacts.
    download_task = download_model_hf(
        model_name=model_name,
        dataset_name=dataset_name,
    )

    # Step 2: consumes both artifacts; the data dependency orders the steps.
    train_model(
        model_archive=download_task.outputs['model_archive'],
        data_archive=download_task.outputs['data_archive'],
    )

In [22]:
# No host is given, so the client's default connection settings apply.
client = kfp.Client()

# Values bound to the pipeline's `model_name` and `dataset_name` parameters.
pipeline_arguments = {
    "model_name": "codellama/CodeLlama-7b-hf",
    "dataset_name": "neo4j/text2cypher-2025v1",
}

# Submit the pipeline for execution with step caching enabled.
run = client.create_run_from_pipeline_func(
    finetune_pipeline,
    arguments=pipeline_arguments,
    enable_caching=True,
)