<a href="https://colab.research.google.com/github/soberbichler/DHd_Workshop_2026/blob/main/Job_Huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Running LLM Jobs via Hugging Face

For an explanation of Hugging Face Jobs, see https://huggingface.co/docs/huggingface_hub/guides/jobs



## Requirements for Hugging Face Jobs



*   Hugging Face Pro account - A paid subscription is required to access job creation features
*   Write access token - Generate a token with write permissions from your account settings
*   Valid payment method - Jobs consume compute credits based on usage


## Authentication Setup



*   Create your access token at huggingface.co/settings/tokens (you will be given an API token as part of the workshop)
*   Ensure the token has "Write" permissions enabled


## Prepare your HF Job Script:

This script creates a remote computational job on HuggingFace's infrastructure that loads a language model and answers a question. It uses `run_job` to spin up a GPU-enabled Docker container (PyTorch with CUDA), installs necessary Python packages (transformers, accelerate, etc.), then runs a Python script that loads a chosen model and defines the task.

# Analyze a Dataset



1.   Run the first cell and upload the dataset "Earthquake_Articles_your_name.csv". Add your name to the dataset file name as well as in the code.
2.   Add the narrative event detection prompt
3.   Run the second cell to monitor the job. While the code is running, investigate the code. Can you explain what the code is doing?






In [None]:
from huggingface_hub import run_job, upload_file
from google.colab import files
import os

# Upload CSV
uploaded = files.upload()
local_filename = list(uploaded.keys())[0]
safe_filename = "earthquake_articles_your_name.csv"
os.rename(local_filename, safe_filename)

# Upload to HF
from google.colab import userdata
HF_TOKEN = userdata.get("HF_TOKEN")
upload_file(
    path_or_fileobj=safe_filename,
    path_in_repo=safe_filename,
    repo_id="oberbics/jobs",
    repo_type="dataset",
    token=HF_TOKEN
)
print(f"Dataset uploaded: {safe_filename}")

# Submit job
job = run_job(
    image="pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel",
    command=[
        "bash", "-c",
        f"""
        apt-get update && apt-get install -y wget &&
        pip install -q "transformers>=4.51.0" accelerate bitsandbytes huggingface_hub pandas &&
        wget -O earthquake_articles_your_name.csv https://huggingface.co/datasets/oberbics/jobs/resolve/main/earthquake_articles_your_name.csv &&
        export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True &&
        export HF_TOKEN='{HF_TOKEN}' &&
        python3 -c "
import os, torch, pandas as pd, datetime, re, json
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import upload_file

model_name = 'mistralai/Mistral-Small-24B-Instruct-2501'

SYSTEM_PROMPT = '''Here prompt'''

HF_TOKEN = os.environ.get('HF_TOKEN')

df = pd.read_csv('earthquake_articles_your_name.csv', sep=';')
print(f'Dataset loaded with {{len(df)}} rows')

print('Loading model...')
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN, fix_mistral_regex=True)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    token=HF_TOKEN
)
print('Model loaded successfully!')

def generate_extraction(model, tokenizer, text_to_analyze):
    user_instruction = 'Extract agent/action/patient triples from this text:'
    messages = [
        {{'role': 'system', 'content': SYSTEM_PROMPT}},
        {{'role': 'user', 'content': user_instruction + text_to_analyze[:5500]}}
    ]
    full_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(full_prompt, return_tensors='pt', truncation=True, max_length=5048).to(model.device)
    input_length = inputs['input_ids'].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5000,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )
    generated_tokens = outputs[0][input_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return response

def parse_json_response(response):
    try:
        cleaned = re.sub(r'```json|```', '', response).strip()
        start = cleaned.find('[')
        end = cleaned.rfind(']') + 1
        if start != -1 and end > start:
            cleaned = cleaned[start:end]
        triples = json.loads(cleaned)
        return triples, True
    except Exception as e:
        print(f'  JSON parse error: {{str(e)}}')
        return [], False

results = []
parse_success = 0
parse_failed = 0

for idx, row in df.iterrows():
    text = str(row.get('article_text', ''))
    if pd.isna(text) or text.strip() in ['nan', '']:
        print(f'Row {{idx}}: empty, skipping')
        continue
    print(f'Processing row {{idx}}...')
    try:
        response = generate_extraction(model, tokenizer, text)
        triples, success = parse_json_response(response)

        if success:
            parse_success += 1
            print(f'  Extracted {{len(triples)}} triples')
            result_row = row.to_dict()
            result_row.update({{
                'llm_raw_response': response[:3000],
                'triples_json': json.dumps(triples, ensure_ascii=False),
                'triple_count': len(triples),
                'processed_row_index': idx,
                'model_used': model_name
            }})
            results.append(result_row)
        else:
            parse_failed += 1
            result_row = row.to_dict()
            result_row.update({{
                'llm_raw_response': response[:3000],
                'triples_json': 'PARSE_ERROR',
                'triple_count': 0,
                'processed_row_index': idx,
                'model_used': model_name
            }})
            results.append(result_row)

    except Exception as e:
        print(f'  Error: {{str(e)}}')
        parse_failed += 1
        result_row = row.to_dict()
        result_row.update({{
            'llm_raw_response': 'ERROR',
            'triples_json': f'ERROR: {{str(e)}}',
            'triple_count': 0,
            'processed_row_index': idx,
            'model_used': model_name
        }})
        results.append(result_row)

total = parse_success + parse_failed
if total > 0:
    print('='*50)
    print(f'PARSE SUCCESS: {{parse_success}}/{{total}} ({{parse_success/total*100:.1f}}%) | Failed: {{parse_failed}}/{{total}}')
    print('='*50)

output_df = pd.DataFrame(results)
output_df.to_csv('output_triples.csv', index=False, sep=',', quoting=1)
print(f'Saved {{len(output_df)}} triple rows to CSV')

timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'llm_triples_{{timestamp}}.csv'
try:
    upload_file(
        path_or_fileobj='output_triples.csv',
        path_in_repo=filename,
        repo_id='oberbics/jobs',
        repo_type='dataset',
        token=HF_TOKEN,
        commit_message=f'Mistral agent/action/patient extraction - {{timestamp}}'
    )
    print(f'SUCCESS: https://huggingface.co/datasets/oberbics/jobs/resolve/main/{{filename}}')
except Exception as e:
    print(f'Upload failed: {{str(e)}}')

print(f'Job complete. Processed {{len(df)}} articles -> {{len(output_df)}} triples')
"
        """
    ],
    flavor="a100-large",
    timeout="4h",
    env={"HUGGINGFACE_TOKEN": HF_TOKEN}
)

print(f"Job submitted! ID: {job.id}")
print(f"Monitor at: https://huggingface.co/jobs/oberbics/{job.id}")

In [None]:
from huggingface_hub import inspect_job, fetch_job_logs
import time

# Poll job status until it's done
while True:
    status = inspect_job(job_id=job.id).status.stage
    print(f"Job status: {status}")
    if status in ("COMPLETED", "ERROR"):
        break
    time.sleep(10)

# Fetch logs after completion
print("\n=== Job logs ===")
logs = list(fetch_job_logs(job_id=job.id))
for line in logs:
    print(line)
