In [2]:
import sys
import os

import argparse
import time
import json
from datetime import date

import torch
#import lightning as L
#from lightning.pytorch.callbacks import ModelCheckpoint,EarlyStopping

#Transformers
import transformers
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM , AutoTokenizer
from transformers import pipeline, set_seed
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import AutoConfig
from transformers import BitsAndBytesConfig
#from lightning.pytorch.loggers import TensorBoardLogger

#Dataset
from datasets import load_dataset

#PEFT
from peft import LoraConfig
from peft import PeftConfig
from peft import PeftModel
from peft import get_peft_model
from peft import prepare_model_for_kbit_training


import warnings
warnings.filterwarnings("ignore")

torch.set_float32_matmul_precision('medium')
torch.cuda.empty_cache()







2024-03-21 11:36:03.505718: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Define a function to print the number of trainable parameters in the given model
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params} || All params: {all_param} || Trainable %: {100 * trainable_params / all_param}")

def tokenize_input(df,tokenizer,tokenizer_chapter_max_length,tokenizer_summary_max_length):

    prompt_start = "Summarize the following : \n"
    prompt_end = "\n\nSummary:"

    prompt = [prompt_start + dialogue + prompt_end for dialogue in df["chapter"]]

    df["input_ids"] = tokenizer(prompt, max_length=tokenizer_chapter_max_length , padding="max_length" , truncation=True , return_tensors="pt").input_ids
    df["labels"] = tokenizer(df["summary_text"],max_length=tokenizer_summary_max_length , padding="max_length" , truncation=True , return_tensors="pt").input_ids

    return df

In [4]:
cache_dir = "/work/LitArt/nair/cache/" 
log_path = "/work/LitArt/nair/outdir/"

tokenizer_chapter_max_length = 1024
tokenizer_summary_max_length = 256
model = "meta-llama/Llama-2-7b-hf"
tokenizer_name = "meta-llama/Llama-2-7b-hf"





today = date.today()

#Training Parameters
batch_size = 2
epochs = 1
log_path = log_path+model.replace("/","-")+"-" +str(today)+"-"+time.strftime("%H:%M:%S", time.localtime())
#logger = TensorBoardLogger(log_path, name="my_model")


In [5]:
from transformers import AutoTokenizer
cache_dir = "/work/LitArt/cache" 

#Bits and Bytes config
bnb_config = BitsAndBytesConfig(
load_in_4bit=True, #4bit quantizaition - load_in_4bit is used to load models in 4-bit quantization 
bnb_4bit_use_double_quant=True, #nested quantization technique for even greater memory efficiency without sacrificing performance. This technique has proven beneficial, especially when fine-tuning large models
bnb_4bit_quant_type="nf4", #quantization type used is 4 bit Normal Float Quantization- The NF4 data type is designed for weights initialized using a normal distribution
bnb_4bit_compute_dtype=torch.bfloat16, #modify the data type used during computation. This can result in speed improvements. 
)
model = AutoModelForCausalLM.from_pretrained(model,
                                                    device_map="auto",
                                                    trust_remote_code=True,
                                                    quantization_config=bnb_config,
                                                    cache_dir=cache_dir)


tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,cache_dir=cache_dir)


# Set the padding token of the tokenizer to its end-of-sentence token
tokenizer.pad_token = tokenizer.eos_token

tokenizer.add_special_tokens({'pad_token': '<PAD>'})

# Enable gradient checkpointing for the model. Gradient checkpointing is a technique used to reduce the memory consumption during the backward pas. Instead of storing all intermediate activations in the forward pass (which is what's typically done to compute gradients in the backward pass), gradient checkpointing stores only a subset of them
base_model.gradient_checkpointing_enable() 

# Prepare the model for k-bit training . Applies some preprocessing to the model to prepare it for training.
base_model = prepare_model_for_kbit_training(base_model)


lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", 
                    "o_proj", 
                    "k_proj", 
                    "v_proj", 
                     "gate_proj", "up_proj", "down_proj"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model.add_adapter(lora_config)

#base_model = get_peft_model(base_model, config)

# Print the number of trainable parameters in the model
print_trainable_parameters(base_model)





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

NameError: name 'base_model' is not defined

In [None]:
data = load_dataset('csv', 
                    data_files={
                        'train': "/work/LitArt/data/chunked_dataset/train_dataset_with_summaries.csv",
                        'test': "/work/LitArt/data/chunked_dataset/test_dataset_with_summaries.csv",
                        'val':"/work/LitArt/data/chunked_dataset/validation_dataset_with_summaries.csv"})

In [None]:
data


In [None]:
tokenized_dataset = data["train"].shuffle().map(tokenize_input, batched=True, fn_kwargs={"tokenizer": tokenizer, "tokenizer_chapter_max_length": tokenizer_chapter_max_length,"tokenizer_summary_max_length":tokenizer_summary_max_length})
tokenized_dataset = tokenized_dataset.remove_columns(['chapter', 'human_summary', '__index_level_0__', 'summary_text'])

In [None]:
from transformers import TrainingArguments


output_dir = f"llama-7b-qlora-Capstone-project"
per_device_train_batch_size = batch_size
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit" #"paged_adamw_8bit"
save_steps = 10
save_total_limit=3
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps =20 #1000
warmup_ratio = 0.03
lr_scheduler_type = "constant" #"cosine"

training_arguments = TrainingArguments(
    output_dir=log_path,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    #save_steps=save_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    #save_strategy='epoch',
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True,
    #push_to_hub=True,
)


In [None]:
from trl import SFTTrainer

def formatting_func(example):
    text = f"### USER: {example['chapter']}\n### ASSISTANT: {example['summary_text']}"
    return text



In [None]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=data["train"],
    packing=True,
    #dataset_text_field="id",
    tokenizer=tokenizer,
    max_seq_length=1024,
    formatting_func=formatting_func,
)


In [None]:
training_arguments

In [22]:
trainer.train()

Step,Training Loss
10,2.6508
20,2.6237


TrainOutput(global_step=20, training_loss=2.637235641479492, metrics={'train_runtime': 509.5598, 'train_samples_per_second': 0.314, 'train_steps_per_second': 0.039, 'total_flos': 6514932543651840.0, 'train_loss': 2.637235641479492, 'epoch': 0.09})

In [23]:
model.save_pretrained(log_path)

In [24]:
trainer.save_model(log_path)

In [None]:
load_base_model

'/work/LitArt/verma/tiiuae-falcon-7b-2024-03-18-20:27:30'

In [25]:
model_dir = log_path

In [26]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)


tokenizer = AutoTokenizer.from_pretrained(model_dir)

quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=quantization_config,
    #adapter_kwargs={"revision": "09487e6ffdcc75838b10b6138b6149c36183164e"}
)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [35]:


def generate_response(chapter : str) -> str:
    prompt =  f"""### USER:' {chapter}' ### Assistant:  """.strip()
    inputs = tokenizer(prompt, return_tensors="pt").to(0)
    outputs = model.generate(inputs.input_ids, max_new_tokens=250, do_sample=False)
    return(tokenizer.decode(outputs[0], skip_special_tokens=False))


'''
	encoding = tokenizer(prompt, return_tensors = "pt").to(DEVICE)
	#with torch.inference_mode():
    with torch.no_grad():
		outputs = model.generate(
			input_ids=encoding.input_ids,
			attention_mask=encoding.attention_mask,
			generation_config=generation_config,
		)

	response = tokenizer.decode(outputs[0], skip_special_tokens=True)
	#assistant_start =  "<assistant>:"
	#response_start = response.find(assistant_start)
	#return response[response_start + len(assistant_start) : ].strip()

    return response.strip()

'''


chapter = "text to be summarised"
print (generate_response(chapter))




<s> ### USER:' text to be summarised' ### Assistant: the text is a summary of the events that occurred in the city of bethlehem on the night of december 24th 1999 the summary begins with the arrival of a group of strangers in the city led by a man named john who claims to be the messiah the strangers are welcomed by the townspeople but are soon met with resistance from the town s leaders the strangers are forced to leave the city but not before they have left a trail of destruction in their wake the summary ends with the strangers continuing their journey towards their destination the summary is a powerful and thought-provoking piece of literature that leaves the reader with many unanswered questions and a sense of unease</s>


In [None]:
# Load the configuration for the trained model
config = PeftConfig.from_pretrained(model_dir)

In [None]:
# Load the trained model using the loaded configuration and other parameters
model = AutoModelForCausalLM.from_pretrained(
	config.base_model_name_or_path,
	return_dict=True,
	quantization_config=bnb_config,
	device_map="auto",
	trust_remote_code=True,
)

In [None]:
# Load the tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)



In [None]:
# Set the padding token of the tokenizer to its end-of-sentence token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
#Inference
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id





In [None]:
def generate_response(chapter : str) -> str:
    
    
	prompt =  f"""
    "Summarize the following : \n" {chapter}
    \n\nSummary: 
    """.strip()
	encoding = tokenizer(prompt, return_tensors = "pt").to(DEVICE)
	#with torch.inference_mode():
    with torch.no_grad():
		outputs = model.generate(
			input_ids=encoding.input_ids,
			attention_mask=encoding.attention_mask,
			generation_config=generation_config,
		)

	response = tokenizer.decode(outputs[0], skip_special_tokens=True)
	#assistant_start =  "<assistant>:"
	#response_start = response.find(assistant_start)
	#return response[response_start + len(assistant_start) : ].strip()

    return response.strip()




In [None]:
#prompt

chapter = "text to be summarised"
print (generate_response(chapter))


