In [1]:

!pip install -q accelerate peft bitsandbytes transformers trl nlpaug

In [2]:

import os
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    TrainerCallback
)
from peft import LoraConfig
from trl import SFTTrainer
import pandas as pd
import nlpaug.augmenter.word as naw
import gc
from torch.nn import DataParallel

2024-08-17 10:13:26.017351: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-17 10:13:26.017411: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-17 10:13:26.020484: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:

# Define the LogMetricsCallback class
class LogMetricsCallback(TrainerCallback):
    """Trainer callback that mirrors the latest log entry into a CSV file.

    The file is (re)created with a header row when the callback is built.
    Each ``on_log`` event then appends one row with the step, epoch, training
    loss, evaluation loss and learning rate; a metric that is absent from the
    latest log entry is written as ``N/A``.
    """

    def __init__(self, output_file):
        """Create (or overwrite) the CSV file and write its header.

        Args:
            output_file: Path of the CSV file. Missing parent directories are
                created; a bare file name (no directory part) is accepted.
        """
        self.output_file = output_file

        # os.makedirs("") raises FileNotFoundError, so only create the parent
        # directory when the path actually contains one.
        log_dir = os.path.dirname(output_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        # Write the header to the file
        with open(self.output_file, "w") as f:
            f.write("step,epoch,train_loss,eval_loss,learning_rate\n")

    def on_log(self, args, state, control, **kwargs):
        """Append one CSV row built from the last entry of ``state.log_history``."""
        # Nothing has been logged yet: there is no entry to read.
        if not state.log_history:
            return

        latest = state.log_history[-1]
        step = state.global_step
        epoch = state.epoch
        train_loss = latest.get("loss", "N/A")
        eval_loss = latest.get("eval_loss", "N/A")
        learning_rate = latest.get("learning_rate", "N/A")

        # Append metrics to the file
        with open(self.output_file, "a") as f:
            f.write(f"{step},{epoch},{train_loss},{eval_loss},{learning_rate}\n")





In [None]:
# --- Model / run identifiers -------------------------------------------------
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
NEW_MODEL = "llama-2-7b-user-manuals-new"
MAX_SEQ_LENGTH = 128
OUTPUT_DIR = "./results"

# --- LoRA adapter hyper-parameters --------------------------------------------
LORA_R, LORA_ALPHA, LORA_DROPOUT = 16, 64, 0.3

# --- Keyword arguments that are unpacked into TrainingArguments ---------------
TRAIN_ARGS = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    learning_rate=2e-4,
    weight_decay=0.1,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    group_by_length=True,
    # Log every 16 steps; evaluate and checkpoint every 32 steps.
    logging_steps=16,
    save_steps=32,
    eval_steps=32,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to="tensorboard",
)

# Bits and Bytes configuration
# 4-bit NF4 quantization settings passed to from_pretrained below; compute
# dtype is float16 and double quantization is switched off.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Clear CUDA cache
# Collect Python garbage first so that tensors without references are gone
# before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

# Setup model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# The EOS token doubles as the padding token; padding is added on the right.
# NOTE(review): with pad == eos, a collator that masks padding may also mask
# the real end-of-sequence token in the labels — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the base model quantized according to BNB_CONFIG.
# NOTE(review): trust_remote_code=True allows repository code to execute —
# confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=BNB_CONFIG,
    device_map="auto",
    trust_remote_code=True,
)
# The KV cache is disabled here for training; this flag stays off on the
# model object unless a later cell turns it back on.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Setup LoRA configuration
# Built from the LORA_* constants; no bias terms are trained.
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare dataset
# Each spreadsheet row becomes one "<s>[INST] instruction [/INST] response </s>"
# training string; the two source columns are dropped afterwards.
# NOTE(review): the literal "<s>" is added here and the tokenizer may prepend
# its own BOS token as well — verify the tokenized prompts carry only one.
df = pd.read_excel('/kaggle/input/dataset/Dataset_creation.xlsx')
df["formatted_instruction"] = df.apply(lambda x: f"<s>[INST] {x['Instructions']} [/INST] {x['Responses']} </s>", axis=1)
df = df.drop(columns=['Instructions', 'Responses'])
# Shuffle the dataframe
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Split the data into train and validation (80 % / 20 %)
train_df = df.sample(frac=0.8, random_state=42,)
val_df = df.drop(train_df.index)

# Convert the training data into a Dataset.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra "__index_level_0__" column; the augmented datasets built from this one
# contain only "formatted_instruction", so all three then share one schema
# when they are concatenated.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)

# Define your augmenters
contextual_augmenter = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")
# NOTE(review): despite its name this is nlpaug's SpellingAug (word-level
# misspellings), not a keyboard-typo augmenter — confirm that is intended.
keyboard_error_augmenter = naw.SpellingAug()

def _augment_instruction_span(text, augmenter):
    """Return ``text`` with the instruction between ``[INST]`` and ``[/INST]`` augmented.

    The text is returned unchanged when either tag is missing, when the tags
    are out of order, or when the instruction between them is empty. Whitespace
    that separates the instruction from the tags is kept exactly as it was, so
    the prompt template is not altered by the augmentation.

    Args:
        text: One formatted prompt string.
        augmenter: Object with an ``augment(str)`` method that returns either
            a list of strings (first item is used) or a single string.
    """
    open_tag, close_tag = "[INST]", "[/INST]"
    tag_pos = text.find(open_tag)
    inst_end = text.find(close_tag)
    # str.find reports "not found" as -1, so test the raw positions before the
    # tag length is added.
    if tag_pos == -1 or inst_end == -1:
        return text
    inst_start = tag_pos + len(open_tag)
    if inst_end < inst_start:
        return text

    raw_span = text[inst_start:inst_end]
    instruction = raw_span.strip()
    if not instruction:
        return text

    augmented = augmenter.augment(instruction)
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else instruction
    if not augmented:
        augmented = instruction

    # Restore the whitespace removed by strip() so "[INST] x [/INST]" keeps its spaces.
    leading = raw_span[: len(raw_span) - len(raw_span.lstrip())]
    trailing = raw_span[len(raw_span.rstrip()):]
    return text[:inst_start] + leading + augmented + trailing + text[inst_end:]


def contextual_augmentation(examples, augmenter=None):
    """Batched ``Dataset.map`` function: contextual word substitution on each instruction.

    Args:
        examples: Batch dict with a ``'formatted_instruction'`` list of strings.
        augmenter: Optional augmenter to use; defaults to the module-level
            ``contextual_augmenter``.

    Returns:
        Dict with the augmented ``'formatted_instruction'`` list (same length and order).
    """
    if augmenter is None:
        augmenter = contextual_augmenter
    return {
        'formatted_instruction': [
            _augment_instruction_span(text, augmenter)
            for text in examples['formatted_instruction']
        ]
    }


def keyboard_error_augmentation(examples, augmenter=None):
    """Batched ``Dataset.map`` function: spelling-error augmentation on each instruction.

    Args:
        examples: Batch dict with a ``'formatted_instruction'`` list of strings.
        augmenter: Optional augmenter to use; defaults to the module-level
            ``keyboard_error_augmenter``.

    Returns:
        Dict with the augmented ``'formatted_instruction'`` list (same length and order).
    """
    if augmenter is None:
        augmenter = keyboard_error_augmenter
    return {
        'formatted_instruction': [
            _augment_instruction_span(text, augmenter)
            for text in examples['formatted_instruction']
        ]
    }

# Perform contextual word embedding augmentation
# The map replaces every original column with the augmented
# 'formatted_instruction' returned by the function.
contextual_augmented_dataset = train_dataset.map(
    contextual_augmentation,
    batched=True,
    remove_columns=train_dataset.column_names
)

# Perform keyboard error augmentation
keyboard_error_augmented_dataset = train_dataset.map(
    keyboard_error_augmentation,
    batched=True,
    remove_columns=train_dataset.column_names
)

# Concatenate the original, contextual augmented, and keyboard error augmented datasets
# Both augmented sets are derived from train_dataset only, so the result holds
# the original rows plus two augmented copies.
combined_train_dataset = concatenate_datasets([
    train_dataset,
    contextual_augmented_dataset,
    keyboard_error_augmented_dataset
])

# Shuffle the final combined dataset
combined_train_dataset = combined_train_dataset.shuffle(seed=42)

# Convert the validation data into a Dataset (no augmentation)
val_dataset = Dataset.from_pandas(val_df)

# Setup training arguments
training_args = TrainingArguments(**TRAIN_ARGS)

# Initialize the custom callback
# Constructing the callback (re)creates the CSV file and writes its header.
log_callback = LogMetricsCallback(output_file="./logs/training_metrics.csv")

# Initialize trainer with the custom callback
# The trainer reads raw text from the 'formatted_instruction' column and
# receives the LoRA settings through peft_config.
trainer = SFTTrainer(
    model=model,
    train_dataset=combined_train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    dataset_text_field="formatted_instruction",
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
    args=training_args,
    callbacks=[log_callback],
)

# Train the model
trainer.train()

# Save the model
# Writes the trained model held by the trainer to the NEW_MODEL directory.
# NOTE(review): the tokenizer is not saved alongside it — confirm that the
# inference side loads the tokenizer from the base checkpoint.
trainer.model.save_pretrained(NEW_MODEL)

print("Training completed.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/330 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
32,2.6922,2.315817
64,2.3699,2.1753
96,1.7898,2.143715



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-hf is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Llama-2-7b-hf.


In [None]:
import gc
gc.collect()


In [None]:
# Run text generation pipeline with our next model
# The training cell sets model.config.use_cache = False, and a model object
# passed to pipeline() is used as-is, so it may still be in train mode (LoRA
# dropout active). Switch to eval mode and re-enable the KV cache before
# generating.
model.eval()
model.config.use_cache = True

prompt = "What is the info about Gateway in IP configuration with respect to laser PLC access?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
import shutil

# Copy the read-only input directory into the writable working directory.
src_path = r"/kaggle/input/llama2"
dst_path = r"/kaggle/working/llama2"

# dirs_exist_ok=True lets the cell be re-run: without it copytree raises
# FileExistsError once the destination has been created by a previous run.
shutil.copytree(src_path, dst_path, dirs_exist_ok=True)

In [None]:
import logging
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# NOTE: `import logging` above rebinds the name `logging` to the standard
# library module; an earlier cell imported `logging` from transformers.

# Load the base model in half precision and let the loader place it on the
# available devices. Without an explicit dtype/device map the 7B checkpoint is
# loaded with default settings, which needs far more memory.
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Load the adapter model on top of the base model (inference only)
adapter_model_id = "/kaggle/working/llama2"
model = PeftModel.from_pretrained(base_model, adapter_model_id, is_trainable=False)
# Make sure dropout layers are inactive while generating.
model.eval()

# Set logging level to ignore warnings (root logger: only CRITICAL is shown)
logging.getLogger().setLevel(logging.CRITICAL)

# Define the prompt
prompt = "What is the info about Gateway in IP configuration?"

# Create the text generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

# Generate text using the same [INST] template the model was trained on
result = pipe(f"<s>[INST] {prompt} [/INST]")

# Print the generated text
print(result[0]['generated_text'])

In [None]:
# '
# df = pd.read_excel('/kaggle/input/dataset/Dataset_creation.xlsx')
# df["formatted_instruction"] = df.apply(lambda x: f"<s>[INST] {x['Instructions']} [/INST] {x['Responses']} </s>", axis=1)
# #df = df.drop(columns=df.columns[:2])
# df = df.drop(columns=['Instructions', 'Responses'])
# df.to_csv("conversational_genAI.csv", index=False)
# #created_dataset = Dataset.from_pandas(df)
# #print(created_dataset)


# df_read = pd.read_csv("/kaggle/working/conversational_genAI.csv")

# # Ensure the key 'formatted_instruction' exists in the DataFrame
# if "formatted_instruction" not in df_read.columns:
#     raise ValueError("The key 'formatted_instruction' does not exist in the DataFrame")

# # Convert DataFrame to Hugging Face Dataset
# dataset = Dataset.from_pandas(df_read)

# train_size = 0.8
# train_dataset = df_read.sample(frac=train_size, random_state=42)
# val_dataset = df_read.drop(train_dataset.index)
 
# train_dataset = Dataset.from_pandas(train_dataset)
# val_dataset = Dataset.from_pandas(val_dataset)

# # Create a DatasetDict with the train split
# dataset_dict = DatasetDict({
#     "train": train_dataset,
#     "validation" : val_dataset
# })
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # model name
# model_name = "meta-llama/Llama-2-7b-hf"
# new_model = "llama-2-7b-user-manuals-new"

# max_seq_length = 128                    # Maximum sequence length to use
# packing = False                        # Pack multiple short examples in the same input sequence to increase efficiency
# device_map = {"": 0}                   # Load the entire model on the GPU 0


# # Tokenize the dataset (example tokenization step)
# def tokenize_function(examples):
#     return tokenizer(examples["formatted_instruction"], padding="max_length", truncation=True, max_length=max_seq_length)
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,device_map = 'auto')
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
# tokenized_dataset = dataset_dict.map(tokenize_function, batched=True)

# #dataset_name = "/content/conversational_genAI.csv"
# #def tokenize(example):
# #    return tokenizer(example['formatted_instruction'], truncation=True, padding='max_length',max_length=max_seq_length)


# #tokenized_dataset = created_dataset.map(tokenize, batched=True, remove_columns=created_dataset.column_names)
# #print(tokenized_dataset[0])'

In [None]:
# import pandas as pd
# from datasets import Dataset, DatasetDict
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import random
# import torch

# # Load the model and tokenizer
# model_name = "meta-llama/Llama-2-7b-hf"
# new_model = "llama-2-7b-user-manuals-new"
# max_seq_length = 128
# packing = False
# device_map = {"": 0}

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, device_map='auto')
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map)

# def paraphrase_word(sentence, word_to_paraphrase, model, tokenizer):
#     masked_sentence = sentence.replace(word_to_paraphrase, "[MASK]")
#     inputs = tokenizer(masked_sentence, return_tensors="pt").to(model.device)
    
#     with torch.no_grad():
#         outputs = model(**inputs)
    
#     mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
#     predicted_token_id = outputs.logits[0, mask_token_index].argmax(axis=-1)
#     predicted_word = tokenizer.decode(predicted_token_id)
    
#     return predicted_word.strip()

# def augment_question(question):
#     # Extract the instruction part
#     inst_start = question.find("[INST]") + 6
#     inst_end = question.find("[/INST]")
#     instruction = question[inst_start:inst_end].strip()
    
#     words = instruction.split()
#     if len(words) < 3:  # Skip very short instructions
#         return question
    
#     word_to_paraphrase = random.choice(words)
#     paraphrased_word = paraphrase_word(instruction, word_to_paraphrase, model, tokenizer)
    
#     augmented_instruction = instruction.replace(word_to_paraphrase, paraphrased_word, 1)
    
#     # Reconstruct the full question with augmented instruction
#     return question[:inst_start] + augmented_instruction + question[inst_end:]

# def tokenize_and_augment(examples):
#     augmented_instructions = [augment_question(instruction) for instruction in examples["formatted_instruction"]]
#     return tokenizer(augmented_instructions, padding="max_length", truncation=True, max_length=max_seq_length)

# # Load and preprocess the data
# df = pd.read_excel('/kaggle/input/dataset/Dataset_creation.xlsx')
# df["formatted_instruction"] = df.apply(lambda x: f"<s>[INST] {x['Instructions']} [/INST] {x['Responses']} </s>", axis=1)
# df = df.drop(columns=['Instructions', 'Responses'])
# df.to_csv("conversational_genAI.csv", index=False)

# df_read = pd.read_csv("/kaggle/working/conversational_genAI.csv")

# # Split the data
# train_size = 0.8
# train_df = df_read.sample(frac=train_size, random_state=42)
# val_df = df_read.drop(train_df.index)

# # Convert to Hugging Face Datasets
# train_dataset = Dataset.from_pandas(train_df)
# val_dataset = Dataset.from_pandas(val_df)

# # Create a DatasetDict
# dataset_dict = DatasetDict({
#     "train": train_dataset,
#     "validation": val_dataset
# })

# # Apply tokenization and augmentation
# tokenized_dataset = dataset_dict.map(tokenize_and_augment, batched=True, remove_columns=dataset_dict["train"].column_names)

# # The tokenized_dataset is now ready for use in your training pipeline

In [None]:
pip install -q nlpaug

**The code below works for data augmentation using the nlpaug library**
****

In [None]:
import nlpaug.augmenter.word as naw
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import torch
import gc
import pandas as pd
from sklearn.model_selection import KFold

# --- Model / run identifiers -------------------------------------------------
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
NEW_MODEL = "llama-2-7b-user-manuals-new"
MAX_SEQ_LENGTH = 128
OUTPUT_DIR = "./results"
N_SPLITS = 5  # number of folds used by the K-Fold split

# --- LoRA adapter hyper-parameters --------------------------------------------
LORA_R, LORA_ALPHA = 16, 64
LORA_DROPOUT = 0.1  # previously tried: 0.5

# --- Keyword arguments that are unpacked into TrainingArguments ---------------
TRAIN_ARGS = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    learning_rate=2e-4,
    weight_decay=0.01,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    group_by_length=True,
    logging_steps=5,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="tensorboard",
)

# Bits and Bytes configuration
# 4-bit NF4 quantization settings passed to from_pretrained below; compute
# dtype is float16 and double quantization is switched off.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Clear CUDA cache
# Collect Python garbage first so that tensors without references are gone
# before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

# Setup model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# The EOS token doubles as the padding token; padding is added on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the base model quantized according to BNB_CONFIG.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=BNB_CONFIG,
    device_map="auto",
    trust_remote_code=True,
)
# The KV cache is disabled for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load dataset
# Each spreadsheet row becomes one "<s>[INST] instruction [/INST] response </s>"
# string; the two source columns are dropped afterwards.
df = pd.read_excel('/kaggle/input/dataset/Dataset_creation.xlsx')
df["formatted_instruction"] = df.apply(lambda x: f"<s>[INST] {x['Instructions']} [/INST] {x['Responses']} </s>", axis=1)
df = df.drop(columns=['Instructions', 'Responses'])

# Initialize NLPAug augmenter
# Module-level augmenter used by augment_instruction.
augmenter = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")

def augment_instruction(instruction):
    """Run the module-level ``augmenter`` on one instruction string.

    Returns a single value: when ``augment`` yields a list its items are
    joined with single spaces, any other return value is passed through as is.
    """
    result = augmenter.augment(instruction)
    return " ".join(result) if isinstance(result, list) else result

def augment_question(question):
    """Augment only the instruction part of a ``[INST] ... [/INST]`` prompt.

    Args:
        question: Full formatted prompt string.

    Returns:
        The prompt with the text between the tags replaced by the result of
        ``augment_instruction``. Whitespace between the tags and the
        instruction is kept as it was. A prompt without a well-formed,
        non-empty ``[INST] ... [/INST]`` span is returned unchanged.
    """
    open_tag, close_tag = "[INST]", "[/INST]"
    tag_pos = question.find(open_tag)
    inst_end = question.find(close_tag)
    # str.find reports "not found" as -1, so test the raw positions before the
    # tag length is added.
    if tag_pos == -1 or inst_end == -1:
        return question
    inst_start = tag_pos + len(open_tag)
    if inst_end < inst_start:
        return question

    raw_span = question[inst_start:inst_end]
    instruction = raw_span.strip()
    if not instruction:
        return question

    augmented_instruction = augment_instruction(instruction)

    # Restore the whitespace removed by strip() so "[INST] x [/INST]" keeps its spaces.
    leading = raw_span[: len(raw_span) - len(raw_span.lstrip())]
    trailing = raw_span[len(raw_span.rstrip()):]
    return question[:inst_start] + leading + augmented_instruction + trailing + question[inst_end:]

def tokenize_and_augment(examples):
    """Augment every prompt of the batch with ``augment_question`` and tokenize the results.

    Reads the ``"formatted_instruction"`` list of the batch and returns the
    tokenizer output, padded/truncated to ``MAX_SEQ_LENGTH`` tokens.
    """
    rewritten = list(map(augment_question, examples["formatted_instruction"]))
    return tokenizer(
        rewritten,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

# Setup K-Fold Cross-Validation
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for fold, (train_index, val_index) in enumerate(kf.split(df)):
    print(f"Starting fold {fold + 1}/{N_SPLITS}")

    if fold > 0:
        # The trainer attaches the LoRA layers to the model object it is given,
        # so after the first fold `model` already carries weights trained on
        # rows that are held out in later folds. Start every later fold from a
        # freshly loaded base model so the validation scores stay independent.
        # Drop the old references first so their GPU memory can be reclaimed.
        trainer = None
        model = None
        gc.collect()
        torch.cuda.empty_cache()
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=BNB_CONFIG,
            device_map="auto",
            trust_remote_code=True,
        )
        model.config.use_cache = False
        model.config.pretraining_tp = 1

    # Split the data into train and validation
    train_df = df.iloc[train_index]
    val_df = df.iloc[val_index]

    # Augment training data
    train_dataset = Dataset.from_pandas(train_df)
    tokenized_train_dataset = train_dataset.map(
        tokenize_and_augment,
        batched=True,
        remove_columns=train_dataset.column_names
    )

    # No augmentation for validation data
    val_dataset = Dataset.from_pandas(val_df)
    tokenized_val_dataset = val_dataset.map(
        lambda examples: tokenizer(examples["formatted_instruction"], padding="max_length", truncation=True, max_length=MAX_SEQ_LENGTH),
        batched=True,
        remove_columns=val_dataset.column_names
    )

    # Create DatasetDict for this fold
    dataset_dict = DatasetDict({
        "train": tokenized_train_dataset,
        "validation": tokenized_val_dataset
    })

    # Setup LoRA configuration
    peft_config = LoraConfig(
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        r=LORA_R,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Setup training arguments.
    # Every fold writes to its own sub-directory so checkpoints from different
    # folds cannot overwrite each other or be picked up by load_best_model_at_end.
    fold_train_args = {**TRAIN_ARGS, "output_dir": f"{OUTPUT_DIR}/fold_{fold + 1}"}
    training_args = TrainingArguments(**fold_train_args)

    # Initialize trainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset_dict["train"],
        eval_dataset=dataset_dict["validation"],
        peft_config=peft_config,
        dataset_text_field="formatted_instruction",
        max_seq_length=MAX_SEQ_LENGTH,
        tokenizer=tokenizer,
        args=training_args,
    )

    # Train the model
    trainer.train()

    # Optionally, save model after each fold
    fold_model_dir = f"{NEW_MODEL}_fold_{fold + 1}"
    trainer.model.save_pretrained(fold_model_dir)

    print(f"Finished fold {fold + 1}/{N_SPLITS}")

# Optionally, aggregate results across folds


**The code below is the same script as above, with the addition of printing the first 5 examples before and after augmentation.**

In [None]:
import nlpaug.augmenter.word as naw
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import torch
import gc
import pandas as pd

# --- Model / run identifiers -------------------------------------------------
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
NEW_MODEL = "llama-2-7b-user-manuals-new"
MAX_SEQ_LENGTH = 128
OUTPUT_DIR = "./results"

# --- LoRA adapter hyper-parameters --------------------------------------------
LORA_R, LORA_ALPHA, LORA_DROPOUT = 16, 64, 0.1

# --- Keyword arguments that are unpacked into TrainingArguments ---------------
TRAIN_ARGS = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    learning_rate=2e-4,
    weight_decay=0.01,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    group_by_length=True,
    logging_steps=5,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="tensorboard",
)

# Bits and Bytes configuration
# 4-bit NF4 quantization settings passed to from_pretrained below; compute
# dtype is float16 and double quantization is switched off.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Clear CUDA cache
# Collect Python garbage first so that tensors without references are gone
# before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

# Setup model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# The EOS token doubles as the padding token; padding is added on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the base model quantized according to BNB_CONFIG.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=BNB_CONFIG,
    device_map="auto",
    trust_remote_code=True,
)
# The KV cache is disabled for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Prepare dataset
# Each spreadsheet row becomes one "<s>[INST] instruction [/INST] response </s>"
# string; the two source columns are dropped afterwards.
df = pd.read_excel('/kaggle/input/dataset/Dataset_creation.xlsx')
df["formatted_instruction"] = df.apply(lambda x: f"<s>[INST] {x['Instructions']} [/INST] {x['Responses']} </s>", axis=1)
df = df.drop(columns=['Instructions', 'Responses'])

# Seeded 80 % sample for training; the remaining rows form the validation set.
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df)
})

# Initialize NLPAug augmenter
# Module-level augmenter used by augment_instruction.
augmenter = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")

def augment_instruction(instruction):
    """Run the module-level ``augmenter`` on one instruction string.

    Returns a single value: when ``augment`` yields a list its items are
    joined with single spaces, any other return value is passed through as is.
    """
    result = augmenter.augment(instruction)
    return " ".join(result) if isinstance(result, list) else result

def augment_question(question):
    """Augment only the instruction part of a ``[INST] ... [/INST]`` prompt.

    Args:
        question: Full formatted prompt string.

    Returns:
        The prompt with the text between the tags replaced by the result of
        ``augment_instruction``. Whitespace between the tags and the
        instruction is kept as it was. A prompt without a well-formed,
        non-empty ``[INST] ... [/INST]`` span is returned unchanged.
    """
    open_tag, close_tag = "[INST]", "[/INST]"
    tag_pos = question.find(open_tag)
    inst_end = question.find(close_tag)
    # str.find reports "not found" as -1, so test the raw positions before the
    # tag length is added.
    if tag_pos == -1 or inst_end == -1:
        return question
    inst_start = tag_pos + len(open_tag)
    if inst_end < inst_start:
        return question

    # Extract the instruction part
    raw_span = question[inst_start:inst_end]
    instruction = raw_span.strip()
    if not instruction:
        return question

    augmented_instruction = augment_instruction(instruction)

    # Reconstruct the full question with augmented instruction, restoring the
    # whitespace removed by strip() so "[INST] x [/INST]" keeps its spaces.
    leading = raw_span[: len(raw_span) - len(raw_span.lstrip())]
    trailing = raw_span[len(raw_span.rstrip()):]
    return question[:inst_start] + leading + augmented_instruction + trailing + question[inst_end:]

def tokenize_and_augment(examples):
    """Augment every prompt of the batch, print a before/after preview, and tokenize.

    Reads the ``"formatted_instruction"`` list of the batch, prints its first
    five entries before and after ``augment_question`` is applied, and returns
    the tokenizer output padded/truncated to ``MAX_SEQ_LENGTH`` tokens.
    """
    originals = examples["formatted_instruction"]

    # Preview of the untouched prompts (first five only)
    print("Original instructions:")
    print(originals[:5])

    rewritten = list(map(augment_question, originals))

    # Preview of the same prompts after augmentation
    print("Augmented instructions:")
    print(rewritten[:5])

    return tokenizer(
        rewritten,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

def _tokenize_only(examples):
    """Tokenize the prompts of a batch without augmenting them."""
    return tokenizer(
        examples["formatted_instruction"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

# Tokenize the dataset; augment the training split only.
# Mapping tokenize_and_augment over the whole DatasetDict would also rewrite
# the validation prompts, so the evaluation loss would no longer be measured
# on the real, unmodified data.
tokenized_dataset = DatasetDict({
    "train": dataset_dict["train"].map(
        tokenize_and_augment,
        batched=True,
        remove_columns=dataset_dict["train"].column_names
    ),
    "validation": dataset_dict["validation"].map(
        _tokenize_only,
        batched=True,
        remove_columns=dataset_dict["validation"].column_names
    ),
})

# Setup LoRA configuration
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
)

# Setup training arguments
training_args = TrainingArguments(**TRAIN_ARGS)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    peft_config=peft_config,
    dataset_text_field="formatted_instruction",
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
    args=training_args,
)

# Train the model
trainer.train()

# Save the model
trainer.model.save_pretrained(NEW_MODEL)


The code below adds a callback that logs the training and evaluation loss to a log file.

In [None]:
import nlpaug.augmenter.word as naw
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, TrainerCallback
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import torch
import gc
import pandas as pd
import os

# Define the LogMetricsCallback class
class LogMetricsCallback(TrainerCallback):
    """Trainer callback that mirrors the latest log entry into a CSV file.

    The file is (re)created with a header row when the callback is built.
    Each ``on_log`` event then appends one row with the step, epoch, training
    loss, evaluation loss and learning rate; a metric that is absent from the
    latest log entry is written as ``N/A``.
    """

    def __init__(self, output_file):
        """Create (or overwrite) the CSV file and write its header.

        Args:
            output_file: Path of the CSV file. Missing parent directories are
                created; a bare file name (no directory part) is accepted.
        """
        self.output_file = output_file

        # os.makedirs("") raises FileNotFoundError, so only create the parent
        # directory when the path actually contains one.
        log_dir = os.path.dirname(output_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        # Write the header to the file
        with open(self.output_file, "w") as f:
            f.write("step,epoch,train_loss,eval_loss,learning_rate\n")

    def on_log(self, args, state, control, **kwargs):
        """Append one CSV row built from the last entry of ``state.log_history``."""
        # Nothing has been logged yet: there is no entry to read.
        if not state.log_history:
            return

        latest = state.log_history[-1]
        step = state.global_step
        epoch = state.epoch
        train_loss = latest.get("loss", "N/A")
        eval_loss = latest.get("eval_loss", "N/A")
        learning_rate = latest.get("learning_rate", "N/A")

        # Append metrics to the file
        with open(self.output_file, "a") as f:
            f.write(f"{step},{epoch},{train_loss},{eval_loss},{learning_rate}\n")


# Constants and configurations
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
NEW_MODEL = "llama-2-7b-user-manuals-new"
MAX_SEQ_LENGTH = 128
OUTPUT_DIR = "./results"

# LoRA parameters
LORA_R = 16
LORA_ALPHA = 64
LORA_DROPOUT = 0.1

# Training arguments (unpacked into TrainingArguments)
TRAIN_ARGS = {
    "output_dir": OUTPUT_DIR,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 1,
    "gradient_checkpointing": True,
    "max_grad_norm": 0.3,
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "optim": "paged_adamw_32bit",
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "group_by_length": True,
    "logging_steps": 5,
    # With step-based strategies the cadence must be set explicitly: when
    # save_steps is left at the library default, a short run (one epoch on a
    # small dataset) finishes before the first checkpoint is written, and
    # load_best_model_at_end then has no checkpoint to restore.
    # save_steps has to be a multiple of eval_steps for best-model tracking.
    "eval_steps": 5,
    "save_steps": 5,
    # Keep disk usage bounded despite the frequent checkpoints.
    "save_total_limit": 2,
    "evaluation_strategy": "steps",
    "save_strategy": "steps",
    "load_best_model_at_end": True,
    "report_to": "tensorboard",
}

# Bits and Bytes configuration
# 4-bit NF4 quantization settings passed to from_pretrained below; compute
# dtype is float16 and double quantization is switched off.
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Clear CUDA cache
# Collect Python garbage first so that tensors without references are gone
# before the CUDA cache is emptied.
gc.collect()
torch.cuda.empty_cache()

# Setup model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# The EOS token doubles as the padding token; padding is added on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the base model quantized according to BNB_CONFIG.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=BNB_CONFIG,
    device_map="auto",
    trust_remote_code=True,
)
# The KV cache is disabled for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Prepare dataset
# Each spreadsheet row becomes one "<s>[INST] instruction [/INST] response </s>"
# string; the two source columns are dropped afterwards.
df = pd.read_excel('/kaggle/input/dataset/Dataset_creation.xlsx')
df["formatted_instruction"] = df.apply(lambda x: f"<s>[INST] {x['Instructions']} [/INST] {x['Responses']} </s>", axis=1)
df = df.drop(columns=['Instructions', 'Responses'])

# Seeded 80 % sample for training; the remaining rows form the validation set.
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index)

dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df)
})

# Initialize NLPAug augmenter
# Module-level augmenter used by augment_instruction.
augmenter = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")

def augment_instruction(instruction):
    """Run the module-level ``augmenter`` on one instruction string.

    Returns a single value: when ``augment`` yields a list its items are
    joined with single spaces, any other return value is passed through as is.
    """
    result = augmenter.augment(instruction)
    return " ".join(result) if isinstance(result, list) else result

def augment_question(question):
    """Augment only the instruction part of a ``[INST] ... [/INST]`` prompt.

    Args:
        question: Full formatted prompt string.

    Returns:
        The prompt with the text between the tags replaced by the result of
        ``augment_instruction``. Whitespace between the tags and the
        instruction is kept as it was. A prompt without a well-formed,
        non-empty ``[INST] ... [/INST]`` span is returned unchanged.
    """
    open_tag, close_tag = "[INST]", "[/INST]"
    tag_pos = question.find(open_tag)
    inst_end = question.find(close_tag)
    # str.find reports "not found" as -1, so test the raw positions before the
    # tag length is added.
    if tag_pos == -1 or inst_end == -1:
        return question
    inst_start = tag_pos + len(open_tag)
    if inst_end < inst_start:
        return question

    # Extract the instruction part
    raw_span = question[inst_start:inst_end]
    instruction = raw_span.strip()
    if not instruction:
        return question

    augmented_instruction = augment_instruction(instruction)

    # Reconstruct the full question with augmented instruction, restoring the
    # whitespace removed by strip() so "[INST] x [/INST]" keeps its spaces.
    leading = raw_span[: len(raw_span) - len(raw_span.lstrip())]
    trailing = raw_span[len(raw_span.rstrip()):]
    return question[:inst_start] + leading + augmented_instruction + trailing + question[inst_end:]

def tokenize_and_augment(examples):
    """Augment every prompt of the batch, print a before/after preview, and tokenize.

    Reads the ``"formatted_instruction"`` list of the batch, prints its first
    five entries before and after ``augment_question`` is applied, and returns
    the tokenizer output padded/truncated to ``MAX_SEQ_LENGTH`` tokens.
    """
    originals = examples["formatted_instruction"]

    # Preview of the untouched prompts (first five only)
    print("Original instructions:")
    print(originals[:5])

    rewritten = list(map(augment_question, originals))

    # Preview of the same prompts after augmentation
    print("Augmented instructions:")
    print(rewritten[:5])

    return tokenizer(
        rewritten,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

def _tokenize_only(examples):
    """Tokenize the prompts of a batch without augmenting them."""
    return tokenizer(
        examples["formatted_instruction"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

# Tokenize the dataset; augment the training split only.
# Mapping tokenize_and_augment over the whole DatasetDict would also rewrite
# the validation prompts, so the evaluation loss would no longer be measured
# on the real, unmodified data.
tokenized_dataset = DatasetDict({
    "train": dataset_dict["train"].map(
        tokenize_and_augment,
        batched=True,
        remove_columns=dataset_dict["train"].column_names
    ),
    "validation": dataset_dict["validation"].map(
        _tokenize_only,
        batched=True,
        remove_columns=dataset_dict["validation"].column_names
    ),
})

# Setup LoRA configuration
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
)

# Setup training arguments
training_args = TrainingArguments(**TRAIN_ARGS)

# Initialize the custom callback (creates the CSV file and writes its header)
log_callback = LogMetricsCallback(output_file="./logs/training_metrics.csv")

# Initialize trainer with the custom callback
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    peft_config=peft_config,
    dataset_text_field="formatted_instruction",
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
    args=training_args,
    callbacks=[log_callback],  # Add the custom callback here
)

# Train the model
trainer.train()

# Save the model
trainer.model.save_pretrained(NEW_MODEL)


In [None]:
pip install -q nlpaug

In [None]:
import nlpaug.augmenter.word as naw
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, TrainerCallback
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import torch
import gc
import pandas as pd
import os

# Define the LogMetricsCallback class
class LogMetricsCallback(TrainerCallback):
    """Trainer callback that appends each logged metrics entry to a CSV file.

    The file is (re)created with a header row when the callback is constructed,
    and one row is appended on every `on_log` event.

    Args:
        output_file: Path of the CSV file to write. Parent directories are
            created when the path contains any.
    """

    def __init__(self, output_file):
        self.output_file = output_file

        # os.path.dirname() returns "" for a bare file name and os.makedirs("")
        # raises FileNotFoundError, so only create the parent when there is one.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Start a fresh file containing only the header row.
        with open(self.output_file, "w") as f:
            f.write("step,epoch,train_loss,eval_loss,learning_rate\n")

    def on_log(self, args, state, control, **kwargs):
        """Append the most recent log entry as one CSV row.

        Metrics that are absent from the latest entry are written as "N/A".
        """
        step = state.global_step
        epoch = state.epoch

        # Fall back to an empty entry so an empty history cannot raise IndexError.
        latest = state.log_history[-1] if state.log_history else {}
        train_loss = latest.get("loss", "N/A")
        eval_loss = latest.get("eval_loss", "N/A")
        learning_rate = latest.get("learning_rate", "N/A")

        # Append metrics to the file
        with open(self.output_file, "a") as f:
            f.write(f"{step},{epoch},{train_loss},{eval_loss},{learning_rate}\n")


# Model identifiers and sequence/output settings
MODEL_NAME = "meta-llama/Llama-2-7b-hf"      # checkpoint name given to from_pretrained
NEW_MODEL = "llama-2-7b-user-manuals-new"    # target passed to save_pretrained after training
MAX_SEQ_LENGTH = 128                          # length limit used for tokenization and the trainer
OUTPUT_DIR = "./results"                      # becomes TrainingArguments' output_dir

# LoRA hyper-parameters (consumed by LoraConfig)
LORA_R, LORA_ALPHA, LORA_DROPOUT = 16, 64, 0.1

# Keyword arguments later expanded into TrainingArguments(**TRAIN_ARGS)
TRAIN_ARGS = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    learning_rate=2e-4,
    weight_decay=0.01,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    group_by_length=True,
    logging_steps=5,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    report_to="tensorboard",
)

# 4-bit NF4 quantization with float16 compute and double quantization disabled
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Collect unreferenced Python objects and clear the CUDA cache before loading
gc.collect()
torch.cuda.empty_cache()

# Tokenizer: reuse the EOS token as the padding token and pad on the right
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantized base model; device placement is delegated to device_map="auto"
model_load_options = dict(
    quantization_config=BNB_CONFIG,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)

# Config overrides applied before training
model_config = model.config
model_config.use_cache = False
model_config.pretraining_tp = 1

def _format_prompt(row):
    """Wrap one row's instruction/response pair in the [INST] prompt template."""
    return f"<s>[INST] {row['Instructions']} [/INST] {row['Responses']} </s>"


# Load the spreadsheet and keep only the formatted prompt column
df = pd.read_excel('/kaggle/input/dataset/Dataset_creation.xlsx')
df["formatted_instruction"] = df.apply(_format_prompt, axis=1)
df = df.drop(columns=['Instructions', 'Responses'])

# Seeded 80/20 split: sample the training rows, the remainder is validation
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(index=train_df.index)

dataset_dict = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (("train", train_df), ("validation", val_df))
})

# nlpaug augmenters
# NOTE(review): `augmenter` is not referenced by the functions below in this
# cell — confirm whether it is still needed.
augmenter = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")
# NOTE(review): SpellingAug is assumed here to emulate keyboard errors — TODO confirm.
keyboard_error_augmenter = naw.SpellingAug()

def augment_instruction(instruction):
    """Return an augmented version of `instruction`.

    Requests three variants from `keyboard_error_augmenter`. A list result is
    collapsed into one space-separated string; any other result is returned
    unchanged.
    """
    # NOTE(review): joining every variant makes the text longer than the input —
    # confirm this is intended rather than selecting a single variant.
    variants = keyboard_error_augmenter.augment(instruction, n=3)
    if not isinstance(variants, list):
        return variants
    return " ".join(variants)

def augment_question(question):
    """Augment only the instruction between "[INST]" and "[/INST]".

    The text outside the markers, and the whitespace that separates the
    instruction from the markers, is preserved so the prompt template stays
    intact after augmentation.

    Args:
        question: Formatted prompt string containing an "[INST] ... [/INST]" span.

    Returns:
        The prompt with its instruction replaced by `augment_instruction`'s
        output. The input is returned unchanged when either marker is missing
        or the instruction is blank.
    """
    open_tag, close_tag = "[INST]", "[/INST]"

    # str.find() returns -1 when a marker is absent; slicing with that value
    # would silently produce a corrupted prompt, so bail out instead.
    open_pos = question.find(open_tag)
    if open_pos == -1:
        return question
    inst_start = open_pos + len(open_tag)
    inst_end = question.find(close_tag, inst_start)
    if inst_end == -1:
        return question

    raw_span = question[inst_start:inst_end]
    instruction = raw_span.strip()
    if not instruction:
        return question

    # Keep the original padding around the instruction (e.g. "[INST] x [/INST]")
    # instead of gluing the augmented text directly onto the markers.
    leading = raw_span[: len(raw_span) - len(raw_span.lstrip())]
    trailing = raw_span[len(raw_span.rstrip()):]

    augmented_instruction = augment_instruction(instruction)

    return (
        question[:inst_start]
        + leading
        + augmented_instruction
        + trailing
        + question[inst_end:]
    )

def tokenize_and_augment(examples):
    """Augment a batch of formatted prompts, then tokenize the augmented text.

    Args:
        examples: Batch mapping holding the prompt strings under the
            "formatted_instruction" key.

    Returns:
        The result of calling `tokenizer` on the augmented prompts with
        max-length padding and truncation at MAX_SEQ_LENGTH.
    """
    originals = examples["formatted_instruction"]

    # Show a small sample of the batch before augmentation.
    print("Original instructions:")
    print(originals[:5])

    rewritten = list(map(augment_question, originals))

    # Show the same sample after augmentation for comparison.
    print("Augmented instructions:")
    print(rewritten[:5])

    return tokenizer(
        rewritten,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

# Run augmentation + tokenization over every split, replacing all original
# columns with the tokenizer's output.
source_columns = dataset_dict["train"].column_names
tokenized_dataset = dataset_dict.map(
    tokenize_and_augment,
    batched=True,
    remove_columns=source_columns,
)

# LoRA adapter settings, taken from the LORA_* constants.
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

# Trainer hyper-parameters are kept in the TRAIN_ARGS mapping.
training_args = TrainingArguments(**TRAIN_ARGS)

# Callback that records the logged metrics to a CSV file.
log_callback = LogMetricsCallback(output_file="./logs/training_metrics.csv")

# NOTE(review): "formatted_instruction" is one of the columns removed by the
# map() call above — confirm the installed TRL version accepts an already
# tokenized dataset alongside `dataset_text_field`.
trainer_options = dict(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    peft_config=peft_config,
    dataset_text_field="formatted_instruction",
    max_seq_length=MAX_SEQ_LENGTH,
    tokenizer=tokenizer,
    args=training_args,
    callbacks=[log_callback],
)
trainer = SFTTrainer(**trainer_options)

# Fine-tune, then write the resulting model out under NEW_MODEL.
trainer.train()
trainer.model.save_pretrained(NEW_MODEL)


In [None]:
import pandas as pd

# Load the logged training metrics
df = pd.read_csv('/kaggle/input/log-file/training_metrics.csv')

# One plain Python list per CSV column
steps, epochs, train_loss, eval_loss, learning_rate = (
    df[column].tolist()
    for column in ('step', 'epoch', 'train_loss', 'eval_loss', 'learning_rate')
)

# Echo every list exactly as it was loaded
print("Original arrays:")
for _caption, _values in (
    ("Steps:", steps),
    ("Epochs:", epochs),
    ("Train Loss:", train_loss),
    ("Eval Loss:", eval_loss),
    ("Learning Rate:", learning_rate),
):
    print(_caption, _values)


In [None]:
# Keep only the entries that actually hold a value (NaN placeholders are dropped)
train_loss_filtered = [value for value in train_loss if pd.notna(value)]
eval_loss_filtered = [value for value in eval_loss if pd.notna(value)]
learning_rate_filtered = [value for value in learning_rate if pd.notna(value)]

# De-duplicated, ascending step/epoch values with the largest entry removed
# (slicing an empty list is a no-op, matching the guarded pop it replaces)
unique_steps = sorted(set(steps))[:-1]
unique_epochs = sorted(set(epochs))[:-1]

# Report the trimmed axes followed by the filtered metric series
print("Updated Unique Steps:", unique_steps)
print("Updated Unique Epochs:", unique_epochs)

print("Train Loss:", train_loss_filtered)
print("Eval Loss:", eval_loss_filtered)
print("Learning Rate:", learning_rate_filtered)

In [None]:
import matplotlib.pyplot as plt

# Pair every metric with the step of the row it was logged on. Selecting rows
# from `df` keeps x and y the same length by construction; plotting separately
# filtered lists against an independently trimmed step list makes matplotlib
# raise ValueError whenever the lengths disagree.
train_rows = df.dropna(subset=["train_loss"])
eval_rows = df.dropna(subset=["eval_loss"])
lr_rows = df.dropna(subset=["train_loss", "learning_rate"])

fig, (loss_ax, lr_ax) = plt.subplots(1, 2, figsize=(12, 6))

# First plot: losses (y) against global step (x), as the title states
loss_ax.plot(train_rows["step"], train_rows["train_loss"], marker='o', label='Train Loss')
loss_ax.plot(eval_rows["step"], eval_rows["eval_loss"], marker='x', label='Eval Loss')
loss_ax.set_xlabel('Global Steps')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Train Loss and Eval Loss vs Steps')
loss_ax.legend()
loss_ax.grid(True)

# Second plot: learning rate (y) against train loss (x), using only rows that
# carry both values
lr_ax.plot(
    lr_rows["train_loss"],
    lr_rows["learning_rate"],
    marker='o',
    color='r',
    label='Train Loss vs Learning Rate',
)
lr_ax.set_xlabel('Train Loss')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_title('Learning Rate vs Train Loss')
lr_ax.legend()
lr_ax.grid(True)

fig.tight_layout()
plt.show()


In [None]:
#----- QLoRA parameters (passed to LoraConfig) -----#
lora_r = 16           # LoRA rank (the `r` argument)
lora_alpha = 64       # LoRA alpha, used for scaling
lora_dropout = 0.1    # dropout probability for the LoRA layers

#----- bitsandbytes parameters (passed to BitsAndBytesConfig) -----#
load_4bit = True                         # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"              # quantization type (fp4 or nf4)
bnb_4bit_compute_dtype = torch.float16   # compute dtype for the 4-bit base model
use_nested_quant = False                 # value for bnb_4bit_use_double_quant

#----- Training argument parameters -----#
output_dir = "./results"             # output directory
num_train_epochs = 1                 # number of training epochs
fp16 = bf16 = False                  # fp16/bf16 training flags (set bf16 to True with an A100)

# Batching
per_device_train_batch_size = 4      # batch size per GPU for training
per_device_eval_batch_size = 4       # batch size per GPU for evaluation
gradient_accumulation_steps = 1      # update steps to accumulate gradients for
gradient_checkpointing = True
group_by_length = True               # batch sequences of similar length together

# Optimisation
optim = "paged_adamw_32bit"          # optimizer
learning_rate = 2e-4
weight_decay = 0.01                  # applied to all layers except bias/LayerNorm weights
max_grad_norm = 0.3                  # maximum gradient norm (gradient clipping)
lr_scheduler_type = "cosine"         # learning-rate scheduler
warmup_ratio = 0.03                  # fraction of steps used for linear warmup (0 -> learning rate)
max_steps = -1                       # number of training steps (overrides num_train_epochs)

# Checkpointing and logging
save_steps = 0                       # save a checkpoint every X update steps
logging_steps = 5                    # log every X update steps




In [None]:
# 4-bit quantization settings, built from the bitsandbytes parameters cell
bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load base model
# NOTE(review): `model_name` is not assigned in this cell or the parameter cell
# above it — presumably defined earlier in the notebook; verify on a fresh kernel.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto'
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters.
# `per_device_eval_batch_size` and `gradient_checkpointing` are declared in the
# parameter cell; they are forwarded here so the declared values actually take
# effect, matching the TRAIN_ARGS-based configuration used elsewhere in the
# notebook.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    # NOTE(review): both strategies below are "epoch", so `eval_steps` and
    # `save_steps` look unused — confirm whether step-based evaluation was intended.
    eval_steps=5,
    load_best_model_at_end=True,
    save_strategy="epoch",
    evaluation_strategy="epoch",
)

# df_read = pd.read_csv("/content/conversational_genAI.csv")
# Ensure the key exists in the DataFrame
#if "formatted_instruction" not in df_read.columns:
#    raise ValueError("The key 'formatted_instruction' does not exist in the DataFrame")
# Assuming you want to create a DatasetDict with the key "formatted_instruction"
#dataset_dict = DatasetDict({"formatted_instruction": dataset})

# save_metrics_callback = SaveMetricsCallback(save_path="training_metrics.csv")

import csv
import gc
import os

# The logging callback has to exist before the trainer is built: it is handed
# to SFTTrainer via `callbacks`, so creating it afterwards either raises
# NameError on a fresh kernel or leaves the new instance unattached.
# NOTE(review): `CSVLoggerCallback` is not defined in this cell — presumably
# defined earlier in the notebook; verify.
csv_logger = CSVLoggerCallback(file_name='training_progress.csv')

# Set supervised fine-tuning parameters
# NOTE(review): `max_seq_length`, `packing` and `new_model` are likewise not
# assigned in this cell — confirm they are defined by an earlier cell.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    peft_config=peft_config,
    dataset_text_field="formatted_instruction",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
    callbacks=[csv_logger],
)

# Collect garbage and clear the CUDA cache right before training starts
gc.collect()
torch.cuda.empty_cache()

trainer.train()

# A DataParallel wrapper keeps the underlying model on `.module`
if isinstance(trainer.model, DataParallel):
    trainer.model.module.save_pretrained(new_model)
else:
    trainer.model.save_pretrained(new_model)
