In [None]:
import  digitalhub as dh

# Name of the platform project that owns every function and run defined below.
PROJECT_NAME = "llmpa-test"

# Obtain the project handle used by the following cells to register functions.
project = dh.get_or_create_project(PROJECT_NAME)

## 1. Create and Preprocess dataset

In [None]:
# Pinned package requirements passed along with the function definition.
dataset_requirements = [
    "datasets[audio]==4.1.0",
    "transformers==4.52.0",
]

# Register the dataset-preparation function: handler `create_and_log_dataset`
# from src/dataset_preprocessing.py, declared for the PYTHON3_10 runtime.
func = project.new_function(
    name="create-dataset",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/dataset_preprocessing.py",
    handler="create_and_log_dataset",
    requirements=dataset_requirements,
)

In [None]:
# Launch the dataset-creation function as a job.
#
# The Hugging Face cache variables point at an ABSOLUTE path under the volume's
# mount point ("/shared/data"). A relative "shared/data/huggingface" would be
# resolved against the job's working directory and could end up outside the
# persistent volume.
train_run = func.run(
    action="job",
    parameters={
        "model_id": "openai/whisper-small",
        "dataset_name": "audio-dataset",
        "hf_dataset_name": "mozilla-foundation/common_voice_17_0",
        "language": "Italian",
        "language_code": "it",
    },
    # Secret names only; the token value itself is never written in the notebook.
    secrets=["HF_TOKEN"],
    envs=[
        {"name": "HF_HOME", "value": "/shared/data/huggingface"},
        # NOTE(review): TRANSFORMERS_CACHE is reported as deprecated by recent
        # transformers releases in favour of HF_HOME; kept for compatibility.
        {"name": "TRANSFORMERS_CACHE", "value": "/shared/data/huggingface"},
    ],
    volumes=[
        {
            "volume_type": "persistent_volume_claim",
            "name": "volume-llmpa",
            "mount_path": "/shared/data",
            "spec": {"size": "300Gi"},
        }
    ],
)

## 2. Fine-tuning

In [None]:
# Pinned package requirements for the fine-tuning function.
# Every entry uses the `==` version-matching operator; a single `=` is not a
# valid requirement specifier and would make dependency installation fail.
training_requirements = [
    "datasets[audio]==4.1.0",
    "transformers==4.52.0",
    "torch==2.8.0",
    "accelerate==1.10.1",
    "evaluate==0.4.5",
    "jiwer==4.0.0",
    "tensorboard==2.20.0",
]

# Register the fine-tuning function: handler `train_and_log_model` from
# src/whisper_fine_tuning.py, declared for the PYTHON3_10 runtime.
func = project.new_function(
    name="train-whisper",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/whisper_fine_tuning.py",
    handler="train_and_log_model",
    requirements=training_requirements,
)

In [None]:
# Launch the fine-tuning function as a job using the "1xa100" profile.
#
# The Hugging Face cache variables point at an ABSOLUTE path under the volume's
# mount point ("/shared/data"). A relative "shared/data/huggingface" would be
# resolved against the job's working directory and could end up outside the
# persistent volume.
train_run = func.run(
    action="job",
    parameters={
        "model_id": "openai/whisper-small",
        "model_name": "whisper-ft",
        "artifact_name": "audio-dataset",
        "language": "Italian",
        "language_code": "it",
    },
    profile="1xa100",
    # Secret names only; the token value itself is never written in the notebook.
    secrets=["HF_TOKEN"],
    envs=[
        {"name": "HF_HOME", "value": "/shared/data/huggingface"},
        # NOTE(review): TRANSFORMERS_CACHE is reported as deprecated by recent
        # transformers releases in favour of HF_HOME; kept for compatibility.
        {"name": "TRANSFORMERS_CACHE", "value": "/shared/data/huggingface"},
    ],
    volumes=[
        {
            "volume_type": "persistent_volume_claim",
            "name": "volume-llmpa",
            "mount_path": "/shared/data",
            # NOTE(review): the dataset-creation run requests "300Gi" for the
            # same claim name "volume-llmpa" -- confirm which size is intended.
            "spec": {"size": "100Gi"},
        }
    ],
)