In [1]:
!pip3 install torch transformers datasets accelerate

Collecting torch
  Using cached torch-2.6.0-cp313-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting transformers
  Using cached transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate
  Using cached accelerate-1.3.0-py3-none-any.whl.metadata (19 kB)
Using cached torch-2.6.0-cp313-none-macosx_11_0_arm64.whl (66.5 MB)
Using cached transformers-4.48.3-py3-none-any.whl (9.7 MB)
Using cached datasets-3.2.0-py3-none-any.whl (480 kB)
Using cached accelerate-1.3.0-py3-none-any.whl (336 kB)
Installing collected packages: torch, accelerate, transformers, datasets
Successfully installed accelerate-1.3.0 datasets-3.2.0 torch-2.6.0 transformers-4.48.3


In [12]:
import torch
from datasets import DatasetDict, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

# Define model name
MODEL_NAME = "NousResearch/Llama-3.2-1B"

# Define SYSTEM prompt
SYSTEM_PROMPT = """SYSTEM: 
You are a ServiceNow validator for approving change requests. You need to validate the following criteria:

Description:
- Must contain why this change is required.
- How the team will benefit and which team will benefit.
- Must contain start and end date.
- If the timeframe between start and end date is only 1 day, highlight it.

Implementation:
1. Must contain CI pipeline build URL.
2. Must contain Deployment contact person.
3. Must contain CD pipeline URL.
4. Must contain secrets folder path or URL.
5. Taking backup of previous build details is mandatory.

Validation:
1. Must contain health check endpoints for the app.
2. Must contain smoke test scripts location.
3. Must have Service registry URL for the app.

Backout Plan:
1. Must explain how to revert using Kube Helm charts.
2. Must explain how to backup the current build and deploy the previous backup build.
3. Must include the previous build CI URL.

Evidence of Testing:
1. Test reports must be attached for all environments.
2. All tests must be conducted in the lower environment.
3. Must have lower environment sign-off SPOC name.

USER:
"""

# Define training and test data
data_samples = {
    "train": [
        {"text": "{\"description\": \"Change improves CI/CD efficiency. DevOps benefits.\", \"implementation\": {\"ci_pipeline_url\": \"https://ci.example.com/build/1234\", \"deployment_contact\": \"john.doe@example.com\", \"cd_pipeline_url\": \"https://cd.example.com/deploy/5678\", \"secrets_folder\": \"/secrets/app/\", \"backup_details\": \"Backup before deployment.\"}, \"validation\": {\"health_check_endpoint\": \"https://app.example.com/health\", \"smoke_test_scripts\": \"/tests/smoke_tests/\", \"service_registry_url\": \"https://registry.example.com/service\"}, \"backout_plan\": {\"helm_revert\": \"Use helm rollback.\", \"backup_restore\": \"Restore previous backup.\", \"previous_ci_url\": \"https://ci.example.com/build/5678\"}, \"testing_evidence\": {\"test_reports\": \"/reports/test_results.json\", \"test_environment\": \"Lower env tested.\", \"signoff_spoc\": \"jane.doe@example.com\"}}", "label": "Approved"},
        {"text": "{\"description\": \"Security update required.\", \"implementation\": {\"ci_pipeline_url\": \"https://ci.example.com/build/2345\", \"deployment_contact\": \"admin@example.com\"}}", "label": "Rejected"}
    ],
    "test": [
        {"text": "{\"description\": \"Performance improvement for frontend team.\", \"implementation\": {\"ci_pipeline_url\": \"https://ci.example.com/build/3456\", \"deployment_contact\": \"alex@example.com\", \"cd_pipeline_url\": \"https://cd.example.com/deploy/7890\", \"secrets_folder\": \"/secrets/frontend/\", \"backup_details\": \"Backup before deploy.\"}, \"validation\": {\"health_check_endpoint\": \"https://frontend.example.com/health\", \"smoke_test_scripts\": \"/tests/frontend_smoke/\", \"service_registry_url\": \"https://registry.example.com/frontend\"}, \"backout_plan\": {\"helm_revert\": \"Use helm rollback.\", \"backup_restore\": \"Restore previous backup.\", \"previous_ci_url\": \"https://ci.example.com/build/7890\"}, \"testing_evidence\": {\"test_reports\": \"/reports/frontend_results.json\", \"test_environment\": \"Lower env tested.\", \"signoff_spoc\": \"mike@example.com\"}}", "label": "Approved"},
        {"text": "{\"description\": \"Monitoring update.\", \"implementation\": {\"ci_pipeline_url\": \"https://ci.example.com/build/9876\", \"deployment_contact\": \"security@example.com\"}}", "label": "Rejected"}
    ]
}

# Convert to Hugging Face Dataset
dataset = DatasetDict({key: Dataset.from_list(value) for key, value in data_samples.items()})

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token  # Fix padding issue
tokenizer.padding_side = "right"

# Preprocess function
def preprocess_function(examples):
    encoding = tokenizer(
        examples["text"],  # Prepend SYSTEM prompt
        padding="max_length",
        truncation=True,
        max_length=512,  # Increased max_length for Llama compatibility
        return_tensors="pt"
    )
    encoding["label"] = label_mapping[examples["label"]]  # Assign label mapping
    return encoding

# Convert labels to numerical format
label_mapping = {"Approved": 1, "Rejected": 0}

# Tokenize dataset
tokenized_datasets = dataset.map(preprocess_function, batched=False)

# Debugging: Print tokenized output
print("Sample tokenized data:", tokenized_datasets["train"][0])

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2, torch_dtype=torch.float16, device_map="auto"
)

# Fix model padding issue
model.config.pad_token_id = tokenizer.pad_token_id

# Training arguments
training_args = TrainingArguments(
    output_dir="./llama-classifier",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=1,  # Reduce batch size for Llama model compatibility
    per_device_eval_batch_size=1,
    logging_dir="./logs",
    num_train_epochs=3,
    save_total_limit=2
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer
)

# Train model
trainer.train()

# Save model
trainer.save_model("./llama-classifier-model")


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Sample tokenized data: {'text': '{"description": "Change improves CI/CD efficiency. DevOps benefits.", "implementation": {"ci_pipeline_url": "https://ci.example.com/build/1234", "deployment_contact": "john.doe@example.com", "cd_pipeline_url": "https://cd.example.com/deploy/5678", "secrets_folder": "/secrets/app/", "backup_details": "Backup before deployment."}, "validation": {"health_check_endpoint": "https://app.example.com/health", "smoke_test_scripts": "/tests/smoke_tests/", "service_registry_url": "https://registry.example.com/service"}, "backout_plan": {"helm_revert": "Use helm rollback.", "backup_restore": "Restore previous backup.", "previous_ci_url": "https://ci.example.com/build/5678"}, "testing_evidence": {"test_reports": "/reports/test_results.json", "test_environment": "Lower env tested.", "signoff_spoc": "jane.doe@example.com"}}', 'label': 1, 'input_ids': [[128000, 5018, 4789, 794, 330, 4164, 36050, 21351, 14, 6620, 15374, 13, 6168, 40004, 7720, 10684, 330, 14706, 794, 532

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some parameters are on the meta device because they were offloaded to the disk.
  trainer = Trainer(
You shouldn't move a model that is dispatched using accelerate hooks.


RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.