In [None]:
# Import Libraries
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import LoraConfig, peft_model, prepare_model_for_kbit_training, get_peft_model, PeftModel
import transformers
from trl import SFTTrainer
from datasets import Dataset
import bitsandbytes as bnb


In [None]:
def json_to_df(data):
    '''
    Build a two-column DataFrame from the parsed JSON records.

    Parameters
    ----------
    data : dict
        Mapping whose values are dicts exposing an 'original' and a
        'disfluent' entry. The keys themselves are not kept.

    Returns
    -------
    pandas.DataFrame
        One row per entry of `data`, in iteration order, with the columns
        'original' and 'disfluent' (in that order).
    '''
    # Read both fields of every record in a single pass over the mapping
    pairs = [(entry['original'], entry['disfluent']) for entry in data.values()]

    originals = [fluent for fluent, _ in pairs]
    disfluents = [noisy for _, noisy in pairs]

    # Columns are attached to an empty frame one after the other, so
    # 'original' ends up before 'disfluent'
    return pd.DataFrame().assign(original=originals, disfluent=disfluents)

def preprocessing(sample):
    '''
    Render one training example as chat-formatted text.

    The disfluent question becomes the user turn and the fluent original
    becomes the assistant turn, both preceded by the notebook-level
    `system_prompt`. The notebook-level `tokenizer` applies its chat template
    to the three messages.

    Returns a dict with a single "content" key holding the templated text.
    '''
    # Messages in the order system -> user -> assistant
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sample['disfluent']},
        {"role": "assistant", "content": sample['original']},
    ]

    # tokenize=False asks for the templated text rather than token ids
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return {"content": rendered}

def get_completion(query, model, tokenizer, max_new_tokens=256, device=None):
    '''
    Inferencing function: ask the model to rewrite one disfluent question.

    Parameters
    ----------
    query : str
        The disfluent question, sent as the user turn.
    model
        Model exposing `generate`; its `device` attribute is used when
        `device` is not given.
    tokenizer
        Tokenizer exposing `apply_chat_template`, `decode` and `eos_token_id`.
    max_new_tokens : int, optional
        Cap on the number of generated tokens (256 by default).
    device : optional
        Device the encoded inputs are moved to. When None, the model's own
        device is used, falling back to "cuda:0" if the model exposes none.

    Returns
    -------
    str
        The first sequence returned by `model.generate`, decoded with special
        tokens skipped.
        NOTE(review): for a causal LM this presumably still contains the
        prompt text ahead of the answer - confirm before parsing it.
    '''
    # Follow the model's placement instead of assuming the first GPU
    if device is None:
        device = getattr(model, "device", None)
        if device is None:
            device = "cuda:0"

    # System instruction (notebook-level `system_prompt`) followed by the query
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    # add_generation_prompt=True ends the prompt with the assistant header so
    # that generation starts directly at the assistant's reply
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Encode the prompt into tensors
    # NOTE(review): the chat template may already emit the BOS token, in which
    # case add_special_tokens=True could duplicate it - confirm against how the
    # training text is tokenized before changing this.
    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

    # Move the encoded inputs to the target device
    model_inputs = encodeds.to(device)

    # Sample a response; EOS doubles as the padding id during generation
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the first (only) generated sequence and return the text
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return decoded

def find_all_linear_names(model, linear_cls=None):
    '''
    Get the Lora modules: leaf names of the linear layers LoRA should target.

    Parameters
    ----------
    model
        Object exposing `named_modules()` yielding `(name, module)` pairs.
    linear_cls : type or tuple of types, optional
        Layer class(es) to look for. Defaults to `bnb.nn.Linear4bit`, the
        class this notebook originally hard-coded.

    Returns
    -------
    list of str
        Sorted, de-duplicated last components of the dotted names of all
        matching modules, with 'lm_head' excluded.
    '''
    # Resolve the default lazily so bitsandbytes is only touched when needed
    if linear_cls is None:
        linear_cls = bnb.nn.Linear4bit

    # Keep only the last component of each dotted module name
    lora_module_names = {
        name.rsplit('.', 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # Remove 'lm_head' if present (needed for 16 bit); done once, after the scan
    lora_module_names.discard('lm_head')

    # Sorted so the result does not depend on set iteration order
    return sorted(lora_module_names)


In [None]:
# Log in to the Hugging Face Hub so the gated model can be downloaded.
# NOTE(review): this import lives outside the notebook's import cell at the
# top; consider moving it there so all dependencies are visible in one place.
from huggingface_hub import notebook_login
# NOTE(review): presumably prompts for an access token inside the notebook - confirm.
notebook_login()

In [None]:
# Define the system prompt used as the system message of every conversation.
# Earlier, longer variant kept for reference only (not used):
# system_prompt = "Read the following disfluent question and rewrite it in a fluent and clear manner, ensuring that it is understandable and correctly conveys the intended meaning. The output should be another question with the same meaning as the input."
system_prompt = "Rewrite the following disfluent question into a fluent and understandable question without answering it."

In [None]:
# Read the train and dev JSON files.
# json.load parses straight from the file handle, and the explicit UTF-8
# encoding keeps the result independent of the platform's default encoding.
with open("train.json", "r", encoding="utf-8") as f:
    train_json = json.load(f)

with open("dev.json", "r", encoding="utf-8") as f:
    dev_json = json.load(f)

In [None]:
# Turn each parsed JSON payload into a DataFrame of original/disfluent pairs
train, dev = (json_to_df(payload) for payload in (train_json, dev_json))

In [None]:
# Wrap both DataFrames as `Dataset` objects (train first, then dev)
dataset_train, dataset_dev = map(Dataset.from_pandas, (train, dev))
# Show the training dataset as the cell output
dataset_train

In [None]:
# Quantization settings for QLoRA; this object is passed as
# `quantization_config` when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,  # request 4-bit loading of the weights
    bnb_4bit_use_double_quant=True,  # turn on the double-quantization option
    bnb_4bit_quant_type="nf4",  # select the "nf4" 4-bit quantization type
    bnb_4bit_compute_dtype=torch.bfloat16  # compute dtype set to bfloat16
)

In [None]:
# Define the model ID for the Meta-Llama model (a gated checkpoint, hence the
# Hub login earlier in the notebook)
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the pre-trained causal language model with quantization configuration and device mapping
# NOTE(review): device_map={"":0} presumably places the whole model on device
# index 0 - confirm for multi-GPU setups.
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

# Load the tokenizer for the specified model, adding an end-of-sequence token
# `tokenizer` is a notebook-level global: `preprocessing` reads it directly.
# NOTE(review): whether add_eos_token takes effect for this tokenizer class is
# not visible here - verify on a tokenized sample.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

In [None]:
# Sanity check: show the first training row next to its chat-formatted version.
sample = train.iloc[0]
print (f"System Prompt: {system_prompt}")
print(f"Disfluent Question: {sample.disfluent}")
print(f"Required Response: {sample.original}")
print("Formatted Sample:")
# Last expression of the cell, so the formatted dict is shown as cell output
preprocessing(sample)

In [None]:
# Run `preprocessing` over the training and development datasets, one example
# at a time, dropping each dataset's original columns after processing so only
# the columns returned by `preprocessing` are kept.
train_data, dev_data = (
    split.map(
        preprocessing,
        batched=False,
        remove_columns=split.column_names,
    )
    for split in (dataset_train, dataset_dev)
)


In [None]:
# Split the formatted training data into training and validation sets,
# holding out 20% for validation. The fixed seed makes the split reproducible
# across kernel restarts.
# NOTE: `train_data` is rebound below, so re-running only this cell would split
# the already-reduced training set again; re-create `train_data` first.
dataset = train_data.train_test_split(test_size=0.2, seed=42)

# Assign the training split to train_data
train_data = dataset["train"]

# Assign the validation split to val_data
val_data = dataset["test"]

In [None]:
# Enable gradient checkpointing to save memory during training
base_model.gradient_checkpointing_enable()

# Prepare the model for 4-bit training (quantized training)
# From here on the notebook works with `model`, not `base_model`.
# NOTE(review): the helper may enable gradient checkpointing on its own, which
# would make the call above redundant - confirm against the installed peft.
model = prepare_model_for_kbit_training(base_model)

In [None]:
# Get modules for Lora: names collected by `find_all_linear_names` from the
# prepared model; they are used as `target_modules` in the LoRA config.
modules = find_all_linear_names(model)
# Print them so the targeted layers can be checked by eye
print(modules)

In [None]:
# Configure the LoRA (Low-Rank Adaptation) settings
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank adaptation
    lora_alpha=32,  # Scaling factor for the low-rank adaptation
    target_modules=modules,  # Modules to apply LoRA to
    lora_dropout=0.06,  # Dropout rate for LoRA
    bias="none",  # Bias configuration
    task_type="CAUSAL_LM"  # Task type (Causal Language Modeling)
)

# Apply the LoRA configuration to the model
# NOTE: `model` is rebound to the wrapped model, so re-running this cell would
# wrap an already-wrapped model; re-run the model preparation cell first.
model = get_peft_model(model, lora_config)

In [None]:
# Get the number of trainable and total parameters in the model
# (the call returns a pair, unpacked here as trainable first, total second)
trainable, total = model.get_nb_trainable_parameters()

# Print the number of trainable and total parameters, and the percentage of trainable parameters
# (percentage is shown with four decimal places)
print(f"trainable: {trainable} | total {total} | Percentage: {trainable/total*100:.4f}%")

In [None]:
# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Clear the CUDA cache to free up memory
torch.cuda.empty_cache()

# Define the training arguments for the model
training_arguments = transformers.TrainingArguments(
    output_dir="llama3",  # Directory to save the model outputs
    per_device_train_batch_size=4,  # Batch size for training
    per_device_eval_batch_size=4,  # Batch size for evaluation (replaces the deprecated per_gpu_eval_batch_size)
    gradient_accumulation_steps=2,  # Number of steps to accumulate gradients
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    save_strategy="epoch",  # Save the model at the end of each epoch
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    learning_rate=2e-4,  # Learning rate for training
    max_grad_norm=0.3,  # Maximum gradient norm for clipping
    # NOTE(review): with lr_scheduler_type='constant' the warmup ratio is
    # presumably ignored; use 'constant_with_warmup' if warmup is intended.
    warmup_ratio=0.3,  # Warmup ratio for learning rate scheduler
    lr_scheduler_type='constant',  # Type of learning rate scheduler
    bf16=True,  # Use bfloat16 precision
    tf32=True,  # Use TensorFloat-32 precision
    num_train_epochs=5,  # Number of training epochs
    save_total_limit=1,  # Limit the total number of saved checkpoints
    optim="paged_adamw_8bit",  # Optimizer to use
)

# Initialize the trainer with the specified arguments and datasets
# NOTE(review): `model` was already wrapped with this LoRA config earlier in the
# notebook, so also passing `peft_config` looks redundant - confirm how the
# installed trl version treats an already-wrapped model.
trainer = SFTTrainer(
    model=model,  # Model to train
    train_dataset=train_data,  # Training dataset
    eval_dataset=val_data,  # Evaluation dataset
    dataset_text_field="content",  # Field in the dataset containing text
    peft_config=lora_config,  # LoRA configuration
    args=training_arguments,  # Training arguments
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),  # Data collator for language modeling
)


In [None]:
# Start the training process
# Turn off the `use_cache` flag on the model config before training.
# NOTE(review): presumably because caching conflicts with gradient
# checkpointing - confirm.
model.config.use_cache = False
trainer.train()

In [None]:
# Output directory name for the adaptor (relative path)
new_model = "model" 
# Save the fine-tuned adaptor
trainer.model.save_pretrained(new_model)


In [None]:
# Merge the model's weights and unload unnecessary components
# NOTE: `model` is rebound to the merged model, so this cell cannot simply be
# re-run on its own.
# NOTE(review): the base model was loaded with a 4-bit quantization config;
# confirm that merging into quantized weights gives the intended precision.
model = model.merge_and_unload()

# Save the merged model to the specified directory with safe serialization
model.save_pretrained("merged_model_llama", safe_serialization=True)

# Save the tokenizer to the same directory
tokenizer.save_pretrained("merged_model_llama")

# Set the padding token to be the same as the end-of-sequence token
# NOTE(review): this and the padding-side change below run after the tokenizer
# was saved, so they presumably only affect the in-memory tokenizer - confirm
# whether they were meant to be part of the saved files.
tokenizer.pad_token = tokenizer.eos_token

# Set the padding side to "right"
tokenizer.padding_side = "right"