In [2]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

base_model = "ministral/Ministral-3b-instruct"

base = AutoModelForCausalLM.from_pretrained(base_model)
lora_model = PeftModel.from_pretrained(base, "./results")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
import torch 

device = "cuda" if torch.cuda.is_available() else "cpu"
lora_model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-13): 14 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear

In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("tatsu-lab/alpaca", split="train").shuffle(seed=42).select(range(2000))

def make_prompt(x):
    prompt = "<|im_start|>user\n"
    prompt += x["instruction"]

    if x["input"].strip():
        prompt += "\n" + x["input"]
    
    prompt += "\n<|im_end|>\n<|im_start|>assistant\n"
    return prompt

def tokenize_func(x):
    text_tokens = tokenizer(
        x["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )
    prompt = make_prompt(x)
    prompt_tokens = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors="pt"
    )
    return {
        "prompt": prompt,
        "text_input_ids": text_tokens["input_ids"],
        "text_attention_mask": text_tokens["attention_mask"],
        "prompt_input_ids": prompt_tokens["input_ids"],
        "prompt_attention_mask": prompt_tokens["attention_mask"]
    }

In [7]:
dataset = dataset.train_test_split(test_size=0.025, seed=42)
train = dataset['train']
val = dataset['test']

val = val.map(tokenize_func, batched=False)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [8]:
from torch.utils.data import DataLoader
import math 

lora_model.eval()
total_loss = 0.0
count = 0
val_loader = DataLoader(val, batch_size=1)

for batch in val_loader:
    print(count)
    input_ids = torch.tensor(batch['text_input_ids'], device=device)
    attention_mask = torch.tensor(batch['text_attention_mask'], device=device)

    with torch.no_grad():
        outputs = lora_model(input_ids, attention_mask=attention_mask, labels=input_ids)
    
    total_loss += outputs.loss.item()
    count += 1

ppl = math.exp(total_loss / count)
print(f"Perplexity: {ppl}")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
Perplexity: 12523.647017364376


In [11]:
for batch in val_loader:
    input_ids = torch.tensor(batch['prompt_input_ids'], device=device)
    attention_mask = torch.tensor(batch['prompt_attention_mask'], device=device)
    outputs = lora_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        do_sample=True,
    )
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"PROMPT: {batch['prompt'][0]}", end = "\n\n")
    print(f"Model output: {decoded_output}", end = "\n\n")
    print(f"Expected output: {batch['output'][0]}", end = "\n\n")

    break 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


PROMPT: <|im_start|>user
Describe a process to deploy a web app using an Amazon EC2 instance.
<|im_end|>
<|im_start|>assistant


Model output: <|im_start|>user
Describe a process to deploy a web app using an Amazon EC2 instance.
<|im_end|>
<|im_start|>assistant

According to the instructions provided in the instructions provided, you will need to set up the S3 cluster. Next, the next step is to create an EC2 instance. Include your address using Amazon's pre-existing service account key and configure your environment variables. Finally, you can use the App Access token to access the application and make necessary changes. If you haven't done so, allow the applications to allow their users to install an App Access token, which you can access via your application source-source connection. Once you have allowed, proceed to launch the application, to enable any options, or let the app have your access token. Once the app is launched, you can access it using one of the other tools available.