# **Step 1: Install All the Required Packages**

In [2]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.36.0 trl==0.4.7

[0m

# **Step 2: Import All the Required Libraries**

In [3]:
!pip install pyarrow

[0m

In [4]:
!pip install safetensors==0.4.1

Collecting safetensors==0.4.1
  Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: safetensors
  Attempting uninstall: safetensors
    Found existing installation: safetensors 0.4.2
    Uninstalling safetensors-0.4.2:
      Successfully uninstalled safetensors-0.4.2
Successfully installed safetensors-0.4.1
[0m

In [5]:
import os
import torch
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [6]:
!pip install bitsandbytes

[0m

In [7]:
# Central configuration cell: every tunable value used by the later cells
# (model/dataset names, LoRA, bitsandbytes, TrainingArguments and SFT settings).

# The model that you want to train from the Hugging Face hub
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# The instruction dataset to use (a CSV file path, loaded with the 'csv' dataset builder)
dataset_path = 'train.csv'

# Fine-tuned model name (also used as the save path for the trained adapter)
new_model = "mistral-cambridge-cefr-part1-v2"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension (passed to LoraConfig as `r`)
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
# NOTE(review): these values feed `bnb_config`, but the model-loading cell has
# `quantization_config=bnb_config` commented out — confirm whether 4-bit loading
# is actually intended.
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype, resolved with getattr)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 5

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
# NOTE(review): not passed to the TrainingArguments(...) call in this notebook.
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE(review): not passed to the TrainingArguments(...) call in this notebook,
# so this flag currently has no effect — confirm whether that is intended.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 50

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use (None is passed through to SFTTrainer unchanged)
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# device_map = {"": 2}


1. First of all, we want to load the dataset we defined. Here, our dataset is already preprocessed but, usually, this is where you would reformat the prompt, filter out bad text, combine multiple datasets, etc.


2. Then, we’re configuring bitsandbytes for 4-bit quantization.


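3. Next, we're loading the Mistral-7B-Instruct model on a GPU with the corresponding tokenizer. (Note: the `quantization_config` argument is currently commented out in the loading cell, so the 4-bit configuration above is not applied to the model.)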


4. Finally, we're loading configurations for QLoRA, regular training parameters, and passing everything to the SFTTrainer. The training can finally start!

In [8]:
from trl.trainer import SFTTrainer
from peft import LoraConfig

In [9]:
from datasets import load_dataset
# Read the instruction dataset from the CSV file named in the configuration cell.
dataset = load_dataset("csv", data_files=dataset_path, split="train")

# Turn the configured dtype name (e.g. "float16") into the matching torch dtype.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# bitsandbytes quantization settings, assembled from the configuration cell.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# When float16 compute is configured together with 4-bit loading, print a hint if
# the GPU's CUDA compute capability (major version >= 8) indicates bf16 support.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

Generating train split: 0 examples [00:00, ? examples/s]

Your GPU supports bfloat16: accelerate training with bf16=True


In [10]:
# Load the base checkpoint named in the configuration cell with device_map="auto".
# The `quantization_config=bnb_config` argument is left disabled here, so the
# bitsandbytes configuration built earlier is not applied to this load.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Config overrides applied before training.
model.config.pretraining_tp = 1
model.config.use_cache = False

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [11]:
# Load the tokenizer that belongs to the base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Pad on the right and reuse the end-of-sequence token as the padding token
# (original note: right padding works around an overflow issue with fp16 training).
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [12]:
# LoRA adapter settings: rank, scaling and dropout come from the configuration
# cell; adapters are attached to the q_proj and v_proj modules only.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

In [13]:

# Bundle the hyperparameters from the configuration cell into TrainingArguments.
# `report_to` is not passed (the tensorboard option is left disabled).
training_arguments = TrainingArguments(
    # Where to write outputs and how long to train
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    # Batching
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # Optimizer and learning-rate schedule
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # Mixed precision
    fp16=fp16,
    bf16=bf16,
    # Checkpointing and logging cadence
    save_steps=save_steps,
    logging_steps=logging_steps,
)

# Show the index of the currently selected CUDA device.
print(torch.cuda.current_device())

0


In [14]:
# Assemble the supervised fine-tuning trainer from the model, tokenizer, LoRA
# configuration, training arguments and dataset prepared in the cells above.
# The dataset column named "text" supplies the training strings.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)

# Start training; as the last expression of the cell, the return value is displayed.
trainer.train()



Map:   0%|          | 0/1071 [00:00<?, ? examples/s]

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,1.3102
100,0.5052
150,0.3831
200,0.3334
250,0.2827
300,0.2479
350,0.2108
400,0.2028
450,0.1686
500,0.1617


TrainOutput(global_step=1340, training_loss=0.17727012883371382, metrics={'train_runtime': 1445.6412, 'train_samples_per_second': 3.704, 'train_steps_per_second': 0.927, 'total_flos': 1.1795449283341517e+17, 'train_loss': 0.17727012883371382, 'epoch': 5.0})

In [15]:
# Save trained model
# Calls save_pretrained on the trainer's model, writing to the path named by `new_model`.
# NOTE(review): presumably this stores only the LoRA adapter weights, since the trainer
# was built with a peft_config — confirm. The tokenizer is not saved in this cell.
trainer.model.save_pretrained(new_model)

In [22]:
# Inference prompt: assessor role, evaluation steps, band descriptions and a sample
# Examiner/Candidate conversation, stored verbatim as one multi-line string.
# NOTE(review): the wording (including its typos) presumably mirrors the prompts in
# the training CSV — confirm before editing, since the generation cell sends this
# string to the model unchanged.
text = """

"Role:
- You are an assessor of the Cambridge B2 First English Assessment. You are an expert in this with several years of experience.
- You will be given a conversation between an Examiner and a Candidate and your task is to give scores for two metrics for the responses given by the "Candiate" in the conversation.

Evaluation Steps:

- Read the conversation between the Examiner and the Candidate carefully.
- Assign a score for GRAMMAR_AND_VOCABULARY and  DISCOURSE_MANAGEMENT on a scale of 1 to 5, where 1 is the lowest and 5 is the highest.
- Please disregard the response provided by "Examiner" in your evaluation.
- Present the evaluation categories and scores in JSON format and name it OUTPUT and the OUTPUT will have three key value pairs. GRAMMAR_AND_VOCABULARY and DISCOURSE_MANAGEMENT.

Band Descriptions:

GRAMMAR AND VOCABULARY:
Band 5: Demonstrates proficient control of a variety of simple and some complex grammatical structures. Utilizes an extensive range of appropriate vocabulary to express and exchange opinions on diverse familiar subjects.
Band 3: Displays adept control of simple grammatical forms, with attempts at some complex structures.Employs a suitable range of vocabulary to articulate views on various familiar topics.
Band 1: Exhibits adept control of simple grammatical forms. Utilizes a suitable range of vocabulary when discussing everyday scenarios.


DISCOURSE MANAGEMENT:
Band 5: Generates extensive discourse with minimal hesitation. Contributions are pertinent, organized coherently, and incorporate a variety of cohesive devices and discourse markers.
Band 3: Generates prolonged discourse despite occasional hesitation. Contributions remain pertinent, with minimal repetition, and integrate various cohesive devices.
Band 1: Generates responses extending beyond short phrases despite occasional hesitation. Contributions are predominantly relevant, with some repetition, and utilize basic cohesive devices.


FOR ALL "GRAMMAR AND VOCABULARY" & "DISCOURSE MANAGEMENT":
Band 4: Performance characteristics overlap between Bands 3 and 5.
Band 2: Performance characteristics are shared between Bands 1 and 3.

____________
"Conversation:

[{"Examiner": "Meera, how do you prefer to relax after work?", "Candidate": "I read books."}, {"Examiner": "Do you have any interest in music?", "Candidate": "Yes I love music"}, {"Examiner": "Can you describe the Diwali festivities?", "Candidate": "We light lamps and enjoy"}, {"Examiner": "How frequently do you browse social media?", "Candidate": "Quite often"}]""

"""

In [20]:
# Text-generation pipeline over the in-memory model and tokenizer.
# Greedy decoding is requested explicitly with do_sample=False: a temperature of 0 is
# not a valid sampling temperature (it must be strictly positive) and is ignored when
# sampling is off, so passing it only produced a configuration warning.
# NOTE(review): max_length bounds prompt + generated tokens in transformers; with a
# long prompt, consider max_new_tokens if outputs come back truncated — confirm.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
    max_length=1000,
)

In [None]:
# Optional: silence transformers warnings (currently disabled)
#logging.set_verbosity(logging.CRITICAL)

# Run the text-generation pipeline with the fine-tuned model.
# The prompt is wrapped in the "<s>[INST] ... [/INST]" instruction tags before being
# passed to the pipeline; the generated_text of the first result is printed.
prompt = text
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
# Reload the base model in FP16 and merge the saved LoRA adapter into it.
# device_map="auto" mirrors the initial model-loading cell; no variable named
# `device` is defined anywhere in this notebook, so it cannot be used here.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach the adapter stored under `new_model`, then fold its weights into the base
# model so `model` is a plain (non-PEFT) model again.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload the tokenizer with the same padding setup used for training.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Empty VRAM
# Drop the notebook-level references to the large objects. Using pop() instead of
# `del` keeps the cell re-runnable when a name has already been removed.
import gc

for _stale_name in ("model", "pipe", "trainer"):
    globals().pop(_stale_name, None)

# Collect the now-unreferenced Python objects (second pass picks up anything
# released by finalizers during the first).
gc.collect()
gc.collect()

# Garbage collection alone does not return PyTorch's cached CUDA blocks to the
# driver; release them explicitly when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Return "UTF-8" regardless of the active locale settings.

    Keeps the same signature as locale.getpreferredencoding so that callers which
    pass do_setlocale (positionally or by keyword) keep working after the override.
    """
    return "UTF-8"


# Force UTF-8 as the preferred encoding for the rest of the session.
locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
# Log in to the Hugging Face Hub through the huggingface-cli command.
!huggingface-cli login

In [None]:
# Log out of the Hugging Face Hub through the huggingface-cli command.
!huggingface-cli logout

In [None]:
# Log in again through huggingface-cli (same command as the earlier login cell).
!huggingface-cli login

In [None]:
# Alternative to the CLI login: authenticate from inside the notebook using
# huggingface_hub's notebook_login helper.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
!pip install ipywidgets

In [None]:
# Retry the in-notebook login (identical to the earlier notebook_login cell), this
# time after the preceding cell has installed ipywidgets.
from huggingface_hub import notebook_login
notebook_login()