In [2]:
import transformers
from datasets import load_dataset
from torch.optim import Adam
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
import tqdm
import torch

import sys
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory that holds the base checkpoint together with its tokenizer files.
model_path = "/model/saved/"

# The tokenizer is resolved strictly from files already on disk.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
)

# Base causal LM, loaded with 8-bit weights; the {"": 0} device map assigns
# the whole model to device 0.
# NOTE(review): trust_remote_code=True runs Python code shipped with the
# checkpoint -- confirm the contents of model_path come from a trusted source.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map={"": 0},
    load_in_8bit=True,
    trust_remote_code=True,
)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prepare for training

In [12]:
import os 

# Dataset location: the DATA_PATH environment variable takes precedence,
# otherwise the bundled sample file is used.
data_path = os.getenv("DATA_PATH", "../sample-data/k8s-instructions.jsonl")

# Read the file through the `datasets` JSON loader.
data = load_dataset("json", data_files=data_path)

# Bare last expression so the notebook renders the dataset summary.
data

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-6ffb87a3d7edc3ab/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 4
    })
})

In [13]:
# LoRA settings for the adapter trained in this notebook: rank-8 updates on
# the "query_key_value" modules, no bias training.
lora_config2 = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# NOTE(review): both statements below rebind `model`, so re-running this cell
# without reloading the base model would prepare/wrap an already wrapped
# model -- restart the kernel before re-running.

# Run PEFT's int8 preparation helper on the 8-bit base model.
model = prepare_model_for_int8_training(model)

# Attach the LoRA adapter, then report how many parameters are trainable.
model = get_peft_model(model, lora_config2)
model.print_trainable_parameters()



trainable params: 2,359,296 || all params: 6,924,080,000 || trainable%: 0.03407378308742822


In [15]:

def prompt(instruction, output):
    """Render one example in the instruction/response text template.

    The result is a fixed preamble, an "### Instruction:" section containing
    `instruction`, and an "### Response:" section containing `output`, with
    sections separated by a blank line.

    Args:
        instruction: Task description placed under "### Instruction:".
        output: Completion placed under "### Response:".

    Returns:
        The assembled prompt string.
    """
    preamble = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    return f"{preamble}\n\n### Instruction:\n{instruction}\n\n### Response:\n{output}"

In [16]:
# Padding setup. Registering a brand-new '[PAD]' token would give it an id
# that the model's embedding matrix has no row for (the embeddings are never
# resized in this notebook), so reuse the existing EOS token for padding.
# The guard keeps the cell idempotent when it is re-run.
# NOTE(review): the language-modeling collator excludes pad positions from the
# loss; with pad == EOS any EOS tokens in the text would presumably be masked
# as well -- confirm this is acceptable before appending EOS to the examples.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

print(data)

# Tokenize one example at a time: build the full prompt text and truncate it
# to at most 1000 tokens. No padding is requested here because a single
# example has nothing to be padded against; batches are padded by the data
# collator at training time.
data = data.map(
    lambda x: tokenizer(
        prompt(x["prompt"], x["completion"]),
        max_length=1000,
        truncation=True,
    )
)
print("After tokenizing:", data)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 4
    })
})


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

After tokenizing: DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4
    })
})


In [22]:
# Optimisation settings. With batch size 1 and 4 accumulation steps, each
# optimizer step consumes 4 examples.
# NOTE(review): on the 4-row sample dataset this run logs only 2 optimizer
# steps, which equals warmup_steps -- presumably these values are meant for a
# larger dataset; confirm.
training_args = transformers.TrainingArguments(
    output_dir="./outputs",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=2,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=1,
)

# Collator configured for causal language modeling (mlm=False).
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=collator,
)

# silence the warnings. Please re-enable for inference!
model.config.use_cache = False
trainer.train()

# Persist the trained model to the output directory.
trainer.save_model("/model/trained")

Step,Training Loss
1,0.9619
2,0.9645


In [23]:
# List the contents of /model/trained, the directory passed to trainer.save_model().
! ls -lash /model/trained

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 9.1M
4.0K drwxr-xr-x 1 root root 4.0K Jun 19 02:57 .
8.0K drwxr-xr-x 1 root root 4.0K Jun 19 02:32 ..
4.0K -rw-r--r-- 1 root root   27 Jun 19 03:00 README.md
4.0K -rw-r--r-- 1 root root  406 Jun 19 03:00 adapter_config.json
9.1M -rw-r--r-- 1 root root 9.1M Jun 19 03:00 adapter_model.bin
4.0K -rw-r--r-- 1 root root 3.9K Jun 19 03:00 training_args.bin


In [26]:
! ls -lash /model/trainer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 9.1M
4.0K drwxr-xr-x 2 root root 4.0K Jun 19 03:04 .
8.0K drwxr-xr-x 1 root root 4.0K Jun 19 03:04 ..
4.0K -rw-r--r-- 1 root root   27 Jun 19 03:04 README.md
4.0K -rw-r--r-- 1 root root  406 Jun 19 03:04 adapter_config.json
9.1M -rw-r--r-- 1 root root 9.1M Jun 19 03:04 adapter_model.bin
4.0K -rw-r--r-- 1 root root 3.9K Jun 19 03:04 training_args.bin


In [27]:
! cat /model/trainer/adapter_config.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
{
  "base_model_name_or_path": "/model/saved/",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "lora_alpha": 32,
  "lora_dropout": 0.05,
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 8,
  "revision": null,
  "target_modules": [
    "query_key_value"
  ],
  "task_type": "CAUSAL_LM"
}

In [28]:
# Inference smoke test, currently disabled: every statement below is commented out.
# If re-enabled it would tokenize `text`, move the tensors to `device`, drop
# `token_type_ids`, generate up to 200 new tokens and print the decoded result.
# NOTE(review): the >>QUESTION<< / >>ANSWER<< markers differ from the
# "### Instruction:" / "### Response:" template that prompt() builds for
# training -- confirm which format is intended.
# NOTE(review): no do_sample=True is passed to generate(), so top_p / top_k /
# temperature presumably have no effect -- confirm.
# NOTE(review): the training cell sets model.config.use_cache = False and asks
# for it to be re-enabled before inference.
#text = ">>QUESTION<<\nWrite the YAML files to deploy a docker registry on K8s\n>>ANSWER<<\n"
#device = "cuda:0"

# inputs = tokenizer(text, return_tensors="pt").to(device)
# inputs.pop("token_type_ids")
# outputs = model.generate(**inputs, max_new_tokens=200, top_p=0.95, top_k=50, temperature=0.8)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



Write the YAML files to deploy a docker registry on K8s

To deploy a docker registry on k8s, you can follow these steps:

1. Install the docker registry on the k8s cluster. You can use the official docker registry image or install it manually.

2. Create a Kubernetes deployment for the registry. You can use the following YAML file to create the deployment:

```
apiVersion: v1
kind: Deployment
metadata:
  name: registry
spec:
  containers:
  - name: registry
    image: docker.io/registry:3.1.1
    ports:
    - containerPort: 5000
  volumes:
  - name: registry-data
    path: /data/registry
```

This YAML file creates a deployment for the registry and maps the container port 5000 to the host port 5000.

3. Create a Kubernetes service for the registry. You can use the following YAML file to create the service:

```
apiVersion: v1
kind: Service
metadata:
  name: registry
spec:
  selector:
    app: registry
  ports:
  - name: registry
    port: 5000
  type: ClusterIP
```

This YAML file cre

In [1]:
# Show the top-level contents of /model.
# NOTE(review): this cell carries execution count 1, i.e. it was run in a
# different kernel session than the cells above.
! ls /model

logs  requirements.txt	sample-data  save  saved  scripts  src	trained
