In [1]:
!pip install -q -U trl transformers accelerate peft einops
!pip install -q datasets bitsandbytes scipy

[0m

In [2]:
import torch
from transformers import pipeline, logging, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer
from datasets import load_dataset
import datasets
import pandas as pd
import scipy



In [3]:
## 1 - Loading Model
model_name = "microsoft/phi-2"
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
## 2 - Loading Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
## 3 - Print model to check which layers to use for LORA
print(model)

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (Wqkv): Linear4bit(in_features=2560, out_features=7680, bias=True)
          (out_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (mlp): MLP(
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
          (act): NewGELUActivation()
        )
      )

In [6]:
## 4 - LORA config

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    r = lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "Wqkv",
        "out_proj",
        "fc1",
        "fc2"
    ]
)

In [7]:
# 5 - Training arguments using HG

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 700
warmup_ratio = 0.05
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir = output_dir,
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    optim = optim,
    save_steps = save_steps,
    logging_steps = logging_steps,
    learning_rate = learning_rate,
    fp16=True,
    max_grad_norm= max_grad_norm,
    max_steps = max_steps,
    warmup_ratio = warmup_ratio,
    lr_scheduler_type = lr_scheduler_type,
    #gradient_checkpointing=True,
)

In [8]:
# 6 - Load dataset
dataset_name = "OpenAssistant/oasst1"
dataset = load_dataset(dataset_name, split="train")
df_data = dataset.to_pandas()
df_data.head(5)

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
0,6ab24d72-0181-4594-a9cd-deaf170242fb,,c3fe8c76-fc30-4fa7-b7f8-c492f5967d18,2023-02-05T14:23:50.983374+00:00,Can you write a short introduction about the r...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00044308538781479, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,6ab24d72-0181-4594-a9cd-deaf170242fb,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T13:50:44.657083+00:00,"""Monopsony"" refers to a market structure where...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00026396565954200923, 'severe_t...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [3...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,c8e83833-ecbc-44fe-b6db-735228c25a1c,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T18:48:49.391686+00:00,Now explain it to a dog,prompter,en,3,True,False,,False,,"{'toxicity': 0.03648477792739868, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,6ab24d72-0181-4594-a9cd-deaf170242fb,49ddcb0d-6588-43bd-858d-19c577f12e7b,2023-02-06T13:37:56.044680+00:00,Monopsony is a market structure in which there...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0008866374846547842, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,343ee2d4-87ae-41fd-a768-bdd65959dc4a,e10e99a0-38ac-4b07-bf5d-4427696e4e0d,2023-02-06T18:52:51.428543+00:00,How can one fight back when a monospony had be...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0009362137061543763, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."


In [9]:
# 7 - Create training data
assistances = df_data[(df_data.role=="assistant") & (df_data["rank"] == 0.0)]
#assistances = assistances[assistances.lang == "en"]
prompters = df_data[(df_data.role=="prompter")]
prompters = prompters.set_index("message_id")
# traverse all responses to get associated prompts
assemble_text_training = []
for _,row in assistances.iterrows():
  prompt_text = prompters.loc[row.parent_id,'text'] # get the prompt
  new_row = "### Human: " + prompt_text + "### Assistant: " + row['text']
  assemble_text_training.append(new_row)
assistances['prompt_response'] = assemble_text_training

# convert into HG dataset
oa_dataset = datasets.Dataset.from_pandas(assistances)
oa_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assistances['prompt_response'] = assemble_text_training


Dataset({
    features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels', 'prompt_response', '__index_level_0__'],
    num_rows: 17972
})

In [10]:
# 7 - Trainer RL
max_seq_length = 512
trainer = SFTTrainer(
    model = model,
    train_dataset = oa_dataset,
    peft_config = peft_config,
    dataset_text_field="prompt_response",
    max_seq_length=max_seq_length,
    tokenizer = tokenizer,
    args = training_arguments
)

Map:   0%|          | 0/17972 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
# 8 - keep normalisation layer to 32 for stable training
for name,module in trainer.model.named_modules():
  if "norm" in name:
    module = module.to(torch.float32)

In [12]:
# 9 - train the model
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.802
20,1.6981
30,1.6637
40,1.5537
50,1.5362
60,1.6402
70,1.6503
80,1.7586
90,1.6728
100,1.5074


Checkpoint destination directory ./results/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-200 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-300 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-400 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=700, training_loss=1.5953210966927664, metrics={'train_runtime': 2771.5165, 'train_samples_per_second': 4.041, 'train_steps_per_second': 0.253, 'total_flos': 8.391498926321664e+16, 'train_loss': 1.5953210966927664, 'epoch': 0.62})

In [13]:
# 10 - test the model
prompt = "What is large language model?"
pipe = pipeline(task="text-generation",model=model,tokenizer=tokenizer,max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
#print(result)
print(result[0]['generated_text'])



<s>[INST] What is large language model? [/INST]>

Large language models (LLMs) are a type of artificial intelligence (AI) model that can generate human-like text based on a given prompt. They are trained on large amounts of text data and can be used for a variety of applications, such as language translation, text summarization, and text generation.

In this chapter, we will explore the concept of large language models, their applications, and how to use them in Python.

## What is a large language model?

A large language model is a type of AI model that is trained on a large amount of text data. The model learns to predict the next word in a sentence based on the previous words, and it can generate text that is similar to the training data.

The size of the model is what makes it a "large" language model. These models are trained on massive amounts of text data, and they


In [None]:
prompt = "What is QLora that stands for Quantization and Low-Rank Adapters"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=2000)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])