In [1]:
import sys
import os

import argparse
import time
import json
from datetime import date

import torch
#import lightning as L
#from lightning.pytorch.callbacks import ModelCheckpoint,EarlyStopping

#Transformers
import transformers
import tqdm
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM , AutoTokenizer
from transformers import pipeline, set_seed
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import AutoConfig
from transformers import BitsAndBytesConfig
#from lightning.pytorch.loggers import TensorBoardLogger

#Dataset
from datasets import load_dataset

#PEFT
from peft import LoraConfig
from peft import PeftConfig
from peft import PeftModel
from peft import get_peft_model
from peft import prepare_model_for_kbit_training


import warnings
# Silence every Python warning for the rest of the session. This keeps cell
# output short, but it also hides deprecation notices from the libraries above.
warnings.filterwarnings("ignore")

# Request PyTorch's 'medium' float32 matmul precision setting
# (presumably trades some precision for speed — confirm against the PyTorch docs).
torch.set_float32_matmul_precision('medium')
# Ask PyTorch to release cached CUDA memory before the model is loaded further down.
torch.cuda.empty_cache()







2024-03-21 17:51:42.723235: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Report how many of the model's parameters will actually be updated during training
def print_trainable_parameters(model):
    """
    Print and return a one-line summary of the model's trainable parameters.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        str: The exact line that was printed, so callers can also log it or
        write it to a file.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    summary = f"Trainable params: {trainable_params} || All params: {all_param} || Trainable %: {trainable_pct}"
    print(summary)
    return summary

def tokenize_input(df,tokenizer,tokenizer_chapter_max_length,tokenizer_summary_max_length):
    """
    Add tokenized prompt and summary ids to a batch.

    Each entry of ``df["chapter"]`` is wrapped in a "Summarize the following"
    prompt and tokenized into ``df["input_ids"]``; ``df["summary_text"]`` is
    tokenized into ``df["labels"]``. Both calls pad to their maximum length and
    truncate. The label ids are stored exactly as the tokenizer returns them.

    Args:
        df: Mapping with "chapter" and "summary_text" entries; it is modified
            in place and returned.
        tokenizer: Callable tokenizer whose result exposes ``input_ids``.
        tokenizer_chapter_max_length: ``max_length`` for the prompts.
        tokenizer_summary_max_length: ``max_length`` for the summaries.

    Returns:
        The same ``df`` object with "input_ids" and "labels" added.
    """
    # Options shared by both tokenizer calls.
    shared_options = {"padding": "max_length", "truncation": True, "return_tensors": "pt"}

    prompts = []
    for chapter_text in df["chapter"]:
        prompts.append("Summarize the following : \n" + chapter_text + "\n\nSummary:")

    encoded_prompts = tokenizer(prompts, max_length=tokenizer_chapter_max_length, **shared_options)
    df["input_ids"] = encoded_prompts.input_ids

    encoded_summaries = tokenizer(df["summary_text"], max_length=tokenizer_summary_max_length, **shared_options)
    df["labels"] = encoded_summaries.input_ids

    return df

In [3]:
# Filesystem locations: the Hugging Face download cache and the root under
# which every run gets its own output directory.
cache_dir = "/work/LitArt/nair/cache/"
log_root = "/work/LitArt/nair/outdir/"

# Token budgets used when tokenizing chapters and summaries.
tokenizer_chapter_max_length = 1024
tokenizer_summary_max_length = 256

# Hub checkpoint to fine-tune and the tokenizer that goes with it.
# NOTE: `model` is a checkpoint *name* here; a later cell rebinds it to the
# loaded model object.
model = "meta-llama/Llama-2-7b-hf"
tokenizer_name = "meta-llama/Llama-2-7b-hf"

today = date.today()

# Per-run output directory: <root><model id with '/' -> '-'>-<date>-<HH:MM:SS>.
# It is built from `log_root` rather than from `log_path` itself, so re-running
# this cell does not keep appending another suffix to the previous value.
log_path = log_root + model.replace("/","-")+"-" +str(today)+"-"+time.strftime("%H:%M:%S", time.localtime())
#logger = TensorBoardLogger(log_path, name="my_model")


In [4]:
import contextlib
import io

from transformers import AutoTokenizer

# 4-bit quantization settings used to load the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,  # nested quantization for extra memory savings
    bnb_4bit_quant_type="nf4",  # 4-bit NormalFloat, designed for normally distributed weights
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the actual computation
)

# NOTE: `model` holds the checkpoint name (a str) when this cell starts and is
# rebound to the loaded model object here, so the configuration cell must be
# re-run before this cell can be executed a second time.
model = AutoModelForCausalLM.from_pretrained(
    model,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
    cache_dir=cache_dir,
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, cache_dir=cache_dir)

# Reuse the end-of-sequence token for padding. A brand-new '<PAD>' token is
# deliberately NOT added: it would get an id beyond the model's embedding table
# (the embeddings are never resized), which breaks as soon as padded batches
# reach the model.
tokenizer.pad_token = tokenizer.eos_token

# Gradient checkpointing keeps only a subset of the forward activations and
# recomputes the rest during the backward pass, lowering memory use.
model.gradient_checkpointing_enable()

# Apply PEFT's preprocessing for training a k-bit quantized model.
model = prepare_model_for_kbit_training(model)

# To adapt only the attention projections instead, use: ["q_proj", "v_proj"]
# Here every linear layer (and the LM head) receives a LoRA adapter.
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=target_modules,
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model.add_adapter(lora_config)

# Compute the trainable-parameter report once. Its printed output is captured
# so the same text can be shown here and written to disk, regardless of what
# the helper returns.
report_buffer = io.StringIO()
with contextlib.redirect_stdout(report_buffer):
    print_trainable_parameters(model)
trainable_param_report = report_buffer.getvalue()
print(trainable_param_report, end="")

os.makedirs(log_path, exist_ok=True)
file_path = os.path.join(log_path, 'number_of_trainable_para.txt')

with open(file_path, 'w') as file:
    file.write(trainable_param_report)
    
        
    






Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Trainable params: 40554496 || All params: 3540967424 || Trainable %: 1.1452942414869276
Trainable params: 40554496 || All params: 3540967424 || Trainable %: 1.1452942414869276


In [5]:
# CSV file backing each split of the chunked summarization dataset.
split_csv_files = {
    'train': "/work/LitArt/data/chunked_dataset/train_dataset_with_summaries.csv",
    'test': "/work/LitArt/data/chunked_dataset/test_dataset_with_summaries.csv",
    'val': "/work/LitArt/data/chunked_dataset/validation_dataset_with_summaries.csv",
}

# Load all three splits in one call; the keys above become the split names.
data = load_dataset('csv', data_files=split_csv_files)

In [6]:
# Display the loaded splits with their column names and row counts.
data


DatasetDict({
    train: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 10668
    })
    test: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 1614
    })
    val: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 1548
    })
})

In [7]:
# Fixed seed so the shuffled order, and therefore the tokenized dataset, is
# identical on every Restart & Run All.
shuffle_seed = 42

tokenized_dataset = data["train"].shuffle(seed=shuffle_seed).map(
    tokenize_input,
    batched=True,
    fn_kwargs={
        "tokenizer": tokenizer,
        "tokenizer_chapter_max_length": tokenizer_chapter_max_length,
        "tokenizer_summary_max_length": tokenizer_summary_max_length,
    },
)
# Keep only the columns produced by tokenize_input; drop the raw text and index columns.
tokenized_dataset = tokenized_dataset.remove_columns(['chapter', 'human_summary', '__index_level_0__', 'summary_text'])

Map:   0%|          | 0/10668 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments

#Training Parameters
batch_size = 1
# Must be a plain number: a trailing comma here would turn it into the tuple
# (1,), which is not a valid `num_train_epochs` value.
epochs = 1
output_dir = "llama-7b-qlora-Capstone-project"  # not passed below; outputs go to log_path
per_device_train_batch_size = batch_size
gradient_accumulation_steps = 4
optim = 'adamw_hf' #"paged_adamw_32bit" #"paged_adamw_8bit"
save_steps = 10  # only used if the save_steps argument below is re-enabled
save_total_limit=3
logging_steps = 30
learning_rate = 2e-4
max_grad_norm = 0.3
# Per the TrainingArguments docs, a positive max_steps overrides num_train_epochs.
max_steps = 2000
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

# NOTE(review): the base model is loaded with a bfloat16 compute dtype while
# fp16 mixed precision is requested here — confirm this combination is intended.
training_arguments = TrainingArguments(
    output_dir=log_path,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    num_train_epochs=epochs,
    #save_steps=save_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    save_strategy='epoch',
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True,
    #push_to_hub=True,
)


In [9]:
from trl import SFTTrainer

def formatting_func(example):
    """
    Render one dataset row as a single USER/ASSISTANT training string.

    Args:
        example: Mapping providing 'chapter' and 'summary_text'.

    Returns:
        str: The chapter wrapped in the summarization instruction, followed by
        the assistant marker and the reference summary.
    """
    template = "### USER: Summarize the following text : {}\n### ASSISTANT: {}"
    return template.format(example['chapter'], example['summary_text'])



In [10]:
# Supervised fine-tuning trainer. It receives the raw text rows and turns each
# one into a training string through formatting_func.
trainer = SFTTrainer(
    # what to train, and with which hyperparameters
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    # data: raw rows plus the function that renders each row as text
    train_dataset=data["train"],
    formatting_func=formatting_func,
    # sequence construction (presumably packs several examples into each
    # max_seq_length-token sequence — confirm against the TRL docs)
    packing=True,
    max_seq_length=1024,
)


Generating train split: 0 examples [00:00, ? examples/s]

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
def save_hyperparameters(log_path, quantization_config, lora_config , training_arguments):
    """
    Write the run's configuration objects to ``<log_path>/hyperparameters.txt``.

    Each object is written via ``str()`` under its own ``# <name>`` heading and
    followed by a blank line, so the three dumps stay readable instead of
    running into one another.

    Args:
        log_path: Directory for the file; created if it does not exist.
        quantization_config: Quantization settings object.
        lora_config: LoRA settings object.
        training_arguments: Training arguments object.

    Returns:
        None. The file is overwritten if it already exists.
    """
    os.makedirs(log_path, exist_ok=True)
    file_path = os.path.join(log_path, 'hyperparameters.txt')

    sections = (
        ("quantization_config", quantization_config),
        ("lora_config", lora_config),
        ("training_arguments", training_arguments),
    )

    # Explicit encoding so the output does not depend on the machine's locale.
    with open(file_path, 'w', encoding="utf-8") as file:
        for title, config in sections:
            file.write(f"# {title}\n{config}\n\n")
        
    
# Record this run's quantization, LoRA and training settings in the run directory.
save_hyperparameters(log_path, bnb_config , lora_config , training_arguments)



In [None]:
# Launch fine-tuning. Progress snapshot noted by hand during an earlier run:
#   [ 861/1000 1:30:16 < 14:36, 0.16 it/s, Epoch 5.76/7] at 348pm
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
30,2.9075
60,2.7158


In [None]:
# Persist the fine-tuned model into this run's output directory.
model.save_pretrained(log_path)

In [None]:
# Save through the trainer as well, into the same run directory.
trainer.save_model(log_path)

In [None]:
# Directory to reload the fine-tuned model from: the run directory written above.
model_dir = log_path

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
# To evaluate an earlier run instead, point model_dir at its directory, e.g.:
#model_dir = '/work/LitArt/nair/outdir/meta-llama-Llama-2-7b-hf-2024-03-21-14:17:13'

tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Quantize the base weights the same way as during fine-tuning (NF4 with double
# quantization). The adapter was trained on top of those weights, so loading
# with the default 4-bit format would pair it with a differently quantized base.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Rebinds `model` to the reloaded, inference-only copy.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=quantization_config,
    #adapter_kwargs={"revision": "09487e6ffdcc75838b10b6138b6149c36183164e"}
)


In [None]:


def generate_response(chapter : str) -> str:
    """
    Generate a summary of ``chapter`` with the fine-tuned model.

    The prompt uses the same "### USER: ... / ### ASSISTANT:" layout as the
    training examples, so the model sees at inference time the format it was
    fine-tuned on.

    Args:
        chapter: Text to summarize.

    Returns:
        str: The full decoded sequence (prompt followed by the generated
        continuation) with special tokens left in place.
    """
    prompt = f"### USER: Summarize the following text : {chapter}\n### ASSISTANT:"
    # Put the inputs on the model's own device instead of assuming GPU index 0.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        input_ids=inputs.input_ids,
        # Pass the mask explicitly so generation does not have to infer it.
        attention_mask=inputs.attention_mask,
        max_new_tokens=500,
        do_sample=False,  # greedy decoding
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=False)


# Hand-written sample passage used as a quick smoke test for generate_response.
chapter = '''In the dim light of the old library, Anna's fingers traced the edges of a leather-bound book, its title embossed in gold but faded with time. She could hear the distant echo of the storm outside, a reminder of the world she had momentarily escaped. Around her, shelves towered like ancient guardians, filled with stories waiting to be told.

As she flipped through the pages, a loose sheet of paper slipped out and floated to the ground. It was a map, old and hand-drawn, marking a location in the heart of the city that Anna couldn't recall ever hearing about. Her curiosity piqued, she decided then that the storm would not deter her adventure. Tucking the map into her coat, she stepped out into the rain, the library door closing with a soft thud behind her.

Navigating through the slick streets, Anna's mind raced with possibilities of what she might find. The map led her to an alley she had passed a thousand times but never noticed. Hidden away was a door, as if waiting for her all these years. She pushed it open, the creak of the hinges echoing into the unseen depths beyond.

Inside, the air was thick with the scent of old books and whispered secrets. A single lamp illuminated a room that seemed out of place and time, filled with artifacts and tomes that whispered of magic and mystery. At the center, a figure turned from a cluttered table, their eyes meeting Anna's with a mix of surprise and expectation.

"You've found your way," the figure began, their voice a blend of warmth and intrigue. "But remember, what you seek also seeks you. The journey ahead is yours alone to embrace."

Before Anna could reply, the world around her began to blur, the edges of reality seeming to fray. The room, the figure, and the artifacts faded into a swirl of colors and whispers.

And then, she was standing back in the alley, the door now just a wall, the map in her hand turned to dust. The rain had stopped, and the city seemed unaware of the journey she had just embarked upon. Anna looked around, the weight of the adventure settling in her heart, knowing her story was far from over.
'''
# Print the model's output for the sample passage.
print(generate_response(chapter))




In [None]:
import pandas as pd
# Tabulate the trainer's logged history (one row per entry in log_history).
d= pd.DataFrame(trainer.state.log_history)

In [None]:
# Destination for the training log table, inside this run's output directory.
file_path = os.path.join(log_path, 'log.csv')  


In [None]:
# Write the training log table to the CSV path chosen in the previous cell.
d.to_csv(file_path)

In [None]:

# Generate first: opening the file with 'w' truncates it immediately, so a
# failed generation would otherwise leave an empty summary.txt behind.
generated_summary = generate_response(chapter)

file_path = os.path.join(log_path, 'summary.txt')

# Explicit encoding: generated text may contain non-ASCII characters and the
# default encoding depends on the machine's locale.
with open(file_path, 'w', encoding="utf-8") as file:
    file.write(str(generated_summary))
    
        

In [None]:
# Show one validation chapter. Indexing the row first reads a single example
# instead of pulling the whole 'chapter' column just to pick one item.
data["val"][100]['chapter']



In [None]:
# Summarize the same validation chapter; row-first indexing avoids loading the
# entire 'chapter' column.
print(generate_response(data["val"][100]['chapter']))

