In [25]:
import json
import random
import torch
import time
import logging
import pandas as pd
from peft import LoraConfig
from trl import SFTTrainer
import transformers
import datasets
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import Dataset, DatasetDict, load_from_disk

In [26]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
data_path = f"./data/"

In [34]:

logger = logging.getLogger(__name__)

###################
# Hyper-parameters
###################
training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 5.0e-05,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 5,
    "max_steps": -1,
    "output_dir": "./phi3_results",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 1,
    "per_device_train_batch_size": 1,
    "remove_unused_columns": True,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    "logging_dir": "./logs",
    "logging_strategy":"steps",
    "logging_steps":100,
    "save_strategy":"steps",
    "save_steps":100,
    "save_total_limit":2,  # Limit the total number of checkpoints
    "evaluation_strategy":"steps",
    "eval_steps":100,
    "load_best_model_at_end":True, # Load the best model at the end of training
    
    }

peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type":"CAUSAL_LM",
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
}
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)


In [35]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [36]:
################
# Modle Loading
################
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"

model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map=None
)

model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, add_eos_token=True, trust_remote_code = True)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

print(f"Memory footprint: {model.get_memory_footprint() / 1e9} GB")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Memory footprint: 7.642159104 GB


In [42]:
def create_prompt(x):
    question = x['question']
    options = [f"{i}. {x[f'option {i}']}" for i in range(1, 6) if f'option {i}' in x]
    options_str = "\n".join(options)
        
    prompt = f"""
Category: {x['category']}
Question:
{question}
Options:
{options_str}
[INST] Answer this post Telecommunication Nultiple Choice Question and provide the correct option. [/INST]
Answer: {x['answer']} 
Explanation: {x['explanation']}
"""
    return prompt


In [47]:
def create_hf_dataset():
    with open(f"TeleQnA_training.txt", 'r') as f:
        content = f.read()
        
    dataset = json.loads(content)
    
    # process data 
    processed_data = [create_prompt(example) for example in dataset.values()]
    
    # Split the dataset into train and test sets
    random.shuffle(processed_data)
    train_size = int(0.85 * len(processed_data))  # 85% for training, 15% for testing
    train_data = processed_data[:train_size]
    test_data = processed_data[train_size:]

    # Create a Hugging Face Dataset for train and test sets
    train_dataset = Dataset.from_dict({
        "prompt": [prompt for prompt in train_data]
    })

    test_dataset = Dataset.from_dict({
        "prompt": [prompt for prompt in test_data]
    })

    # Optionally, create a DatasetDict if you have train/val/test splits
    hf_dataset = DatasetDict({
        "train": train_dataset,
        "test": test_dataset
    })

    try:
    # Save the dataset to disk
        hf_dataset.save_to_disk(f"{data_path}hf_dataset")
        print("Dataset saved successfully.")
    except Exception as e:
        print(f"Error occurred while saving the dataset: {e}")
    
    

In [48]:
create_hf_dataset()

Saving the dataset (0/1 shards):   0%|          | 0/1241 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/220 [00:00<?, ? examples/s]

Dataset saved successfully.


In [50]:
ds = load_from_disk(f"{data_path}hf_dataset/")
print(f"Prompt:{ds['train']['prompt'][:5]}")

Prompt:['\nCategory: Standards specifications\nQuestion:\nWhat is the purpose of a codeword in the context of Target UE privacy? [3GPP Release 17]\nOptions:\n1. To determine which Requestors are allowed to request location information\n2. To ensure that only the intended requestor or LCS client can use the codeword\n3. To secure the codeword from being misused\n4. All of the above\n5. None of the above\n[INST] Answer this post Telecommunication Nultiple Choice Question and provide the correct option. [/INST]\nAnswer: option 4: All of the above \nExplanation: The codeword in the Target UE privacy settings is used to determine access privileges, secure the codeword, and prevent misuse.\n', '\nCategory: Standards specifications\nQuestion:\nWhat is the purpose of the SGNB ADDITION REQUEST message? [3GPP Release 17]\nOptions:\n1. To stop the data transmission for the UE\n2. To create a UE context\n3. To release the UE context\n4. To resume the SCG of the UE\n5. To setup bearer context\n[INS

### Fine tuning 

In [51]:
print_trainable_parameters(model)

trainable params: 3821079552 || all params: 3821079552 || trainable%: 100.00


In [52]:
print(model)

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=3206

**Tokenize and process the HF data**

In [55]:
def tokenize_dataset(example, tokenizer):
    enc = tokenizer(example["prompt"], return_tensors="pt", padding="max_length", truncation=True, max_length=2048)
    enc["input_data"] = enc["input_ids"]
    return enc

In [56]:
train_data = ds["train"].map(tokenize_dataset, fn_kwargs={"tokenizer": tokenizer}, num_proc=5, remove_columns=["prompt"], batched=True)
test_data = ds["test"].map(tokenize_dataset, fn_kwargs={"tokenizer": tokenizer}, num_proc=5, remove_columns=["prompt"], batched=True)

Map (num_proc=5):   0%|          | 0/1241 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/220 [00:00<?, ? examples/s]

**Trainer**

In [60]:
len(train_data[0]['input_ids'])

2048

In [57]:
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=train_data,
    eval_dataset=test_data,
    max_seq_length=2048,
    dataset_text_field = "input_data",
    tokenizer=tokenizer
)

start_time = time.time()
trainer.train()
end_time = time.time()
print(f"Training completed in {end_time - start_time} seconds.")
print("=="*50)
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

OutOfMemoryError: CUDA out of memory. Tried to allocate 54.00 MiB. GPU 