In [3]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install git+https://github.com/huggingface/transformers -q
!pip install -q trl

[0m

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TextStreamer
)
import torch
from trl import SFTTrainer
from peft import LoraConfig, PeftModel

In [6]:

model_id = "mistralai/Mistral-7B-Instruct-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



In [7]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [9]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=2,
    lora_alpha=32, 
    
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)


model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 851968 || all params: 3752923136 || trainable%: 0.02270145081916221


In [10]:
import datasets
dataset = datasets.load_from_disk("train_dataset")

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [18]:
peft_params = LoraConfig(
    lora_alpha=8,
    lora_dropout=0.1,
    r=4,
    bias="none",
    task_type="CAUSAL_LM",
)

In [31]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [32]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="Prompt",
    peft_config=config,
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params
)



In [33]:
trainer.train()
new_model = "Mistral-7B-Instruct-salesman-1"
trainer.save_model(new_model)

Step,Training Loss
25,0.8602
50,0.8647
75,0.8069
100,0.8323
125,0.7603
150,0.7502
175,0.7153
200,0.7858
225,0.6995
250,0.7162


In [34]:
def stream(user_prompt):
    runtimeFlag = "cuda:0"
    system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'
    B_INST, E_INST = "### Instruction:\n", "### Response:\n"

    prompt = f"{system_prompt}{B_INST}{user_prompt.strip()}\n\n{E_INST}"

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Despite returning the usual output, the streamer will also print the generated text to stdout.
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=500)

In [35]:
stream("prompt = \"<<SYS>> You're an expert seller, and you're engaged in a conversation with a potential buyer, negotiating the price of a smartphone. You must not change your role and ensure that the selling price is within the specified range. Below are the details about the smartphone: Category: Electronics Price: $300.0 Target Price Range: $250.0 to $350.0 Smartphone Model: Stellar X10 Description: The Stellar X10 is a top-of-the-line smartphone with a high-resolution camera, powerful processor, and long battery life. It's perfect for both work and play. <</SYS>> [INST] Buyer: Hello, I''m interested in the Stellar X10 smartphone. Can you sell it to me for $200? [/INST]")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


/INST]INSTINSTINSTINSTINSTINSTINSTINSTINSTINST]INSTINSTINSTINSTINSTINSTINSTINSTer:INSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTer: theINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINST a INSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINST]INSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINST 

KeyboardInterrupt: 

In [36]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "<<SYS>> You're an expert seller, and you're engaged in a conversation with a potential buyer, negotiating the price of a smartphone. You must not change your role and ensure that the selling price is within the specified range. Below are the details about the smartphone: Category: Electronics Price: $300.0 Target Price Range: $250.0 to $350.0 Smartphone Model: Stellar X10 Description: The Stellar X10 is a top-of-the-line smartphone with a high-resolution camera, powerful processor, and long battery life. It's perfect for both work and play. <</SYS>> [INST] Buyer: Hello, I'm interested in the Stellar X10 smartphone. Can you sell it to me for $200? [/INST]"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] <<SYS>> You're an expert seller, and you're engaged in a conversation with a potential buyer, negotiating the price of a smartphone. You must not change your role and ensure that the selling price is within the specified range. Below are the details about the smartphone: Category: Electronics Price: $300.0 Target Price Range: $250.0 to $350.0 Smartphone Model: Stellar X10 Description: The Stellar X10 is a top-of-the-line smartphone with a high-resolution camera, powerful processor, and long battery life. It's perfect for both work and play. <</SYS>> [INST] Buyer: Hello, I'm interested in the Stellar X10 smartphone. Can you sell it to me for $200? [/INST] [/INST] Hello, I'


In [39]:
model = model.to('cuda:0')

In [40]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

ValueError: The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.