In [1]:
import sys
import os

import argparse
import time
import json
from datetime import date

import torch

#Transformers
import transformers
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM , AutoTokenizer
from transformers import pipeline, set_seed
from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import AutoConfig
from transformers import BitsAndBytesConfig

#Dataset
from datasets import load_dataset

#PEFT
from peft import LoraConfig
from peft import PeftConfig
from peft import PeftModel
from peft import get_peft_model
from peft import prepare_model_for_kbit_training


import warnings
warnings.filterwarnings("ignore")

torch.set_float32_matmul_precision('medium')
torch.cuda.empty_cache()

2024-03-21 11:12:07.712551: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Define a function to print the number of trainable parameters in the given model
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params} || All params: {all_param} || Trainable %: {100 * trainable_params / all_param}")

def tokenize_input(df,tokenizer,tokenizer_chapter_max_length,tokenizer_summary_max_length):

    prompt_start = "Summarize the following : \n"
    prompt_end = "\n\nSummary:"

    prompt = [prompt_start + dialogue + prompt_end for dialogue in df["chapter"]]

    df["input_ids"] = tokenizer(prompt, max_length=tokenizer_chapter_max_length , padding="max_length" , truncation=True , return_tensors="pt").input_ids
    df["labels"] = tokenizer(df["summary_text"],max_length=tokenizer_summary_max_length , padding="max_length" , truncation=True , return_tensors="pt").input_ids

    return df

In [3]:
cache_dir = "/work/LitArt/nair/cache/" 
log_path = "/work/LitArt/nair/outdir/"

tokenizer_chapter_max_length = 1024
tokenizer_summary_max_length = 256
base_model_name = "tiiuae/falcon-7b"
tokenizer_name = "tiiuae/falcon-7b"

today = date.today()

#Training Parameters
batch_size = 2
epochs = 1
log_path = log_path+base_model_name.replace("/","-")+"-" +str(today)+"-"+time.strftime("%H:%M:%S", time.localtime())
#logger = TensorBoardLogger(log_path, name="my_model")


In [4]:

# Load the 7b llama model
model_id = "tiiuae/falcon-7b"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load model
base_model = AutoModelForCausalLM.from_pretrained(model_id, 
                                                  quantization_config=quantization_config, 
                                                  cache_dir=cache_dir,
                                                  device_map="auto",


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,cache_dir=cache_dir)
# Set the padding token of the tokenizer to its end-of-sentence token
tokenizer.pad_token = tokenizer.eos_token

# Enable gradient checkpointing for the model. Gradient checkpointing is a technique used to reduce the memory consumption during the backward pas. Instead of storing all intermediate activations in the forward pass (which is what's typically done to compute gradients in the backward pass), gradient checkpointing stores only a subset of them
base_model.gradient_checkpointing_enable() 

# Prepare the model for k-bit training . Applies some preprocessing to the model to prepare it for training.
base_model = prepare_model_for_kbit_training(base_model)

lora_config = LoraConfig(
    r=18, #The rank of decomposition r is << min(d,k). The default of r is 8.
    lora_alpha=32,#∆W is scaled by α/r where α is a constant. When optimizing with Adam, tuning α is similar as tuning the learning rate.
    target_modules=["query_key_value"], #Modules to Apply LoRA to target_modules. You can select specific modules to fine-tune.
    lora_dropout=0.05,#Dropout Probability for LoRA Layers #to reduce overfitting
    bias="none", #Bias Type for Lora. Bias can be ‘none’, ‘all’ or ‘lora_only’. If ‘all’ or ‘lora_only’, the corresponding biases will be updated during training. 
    task_type= "CAUSAL_LM", #Task Type
    )

base_model = get_peft_model(base_model, lora_config)

# Print the number of trainable parameters in the model
print_trainable_parameters(base_model)

Trainable params: 5308416 || All params: 3614053248 || Trainable %: 0.1468826172646364


In [6]:
data = load_dataset('csv', 
                    data_files={
                        'train': "/work/LitArt/data/chunked_dataset/train_dataset_with_summaries.csv",
                        'test': "/work/LitArt/data/chunked_dataset/test_dataset_with_summaries.csv",
                        'val':"/work/LitArt/data/chunked_dataset/validation_dataset_with_summaries.csv"})

In [7]:
data

DatasetDict({
    train: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 10668
    })
    test: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 1614
    })
    val: Dataset({
        features: ['chapter', 'human_summary', '__index_level_0__', 'summary_text'],
        num_rows: 1548
    })
})

In [8]:
tokenized_dataset = data["train"].shuffle().map(tokenize_input, batched=True, fn_kwargs={"tokenizer": tokenizer, "tokenizer_chapter_max_length": tokenizer_chapter_max_length,"tokenizer_summary_max_length":tokenizer_summary_max_length})
tokenized_dataset = tokenized_dataset.remove_columns(['chapter', 'human_summary', '__index_level_0__', 'summary_text'])

Map:   0%|          | 0/10668 [00:00<?, ? examples/s]

In [9]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size = batch_size ,     # Specifies the batch size for training on each device (GPU).
    #auto_find_batch_size=True,      # Uncommenting this would let the library automatically find an optimal batch size.
    gradient_accumulation_steps=2,   # Number of forward and backward passes to accumulate gradients before performing an optimizer step.
    # This effectively multiplies the batch size by this number without increasing memory usage.
    num_train_epochs=epochs,              # Specifies the total number of training epochs.
    learning_rate=2e-4,              # Specifies the learning rate for the optimizer.
    fp16=True,     # Enables mixed precision training (fp16) which can speed up training and reduce memory usage.
    save_total_limit=3,              # Limits the total number of model checkpoints saved. Only the last 3 checkpoints are saved.
    logging_steps=10,                 # Specifies how often to log training updates. 
    output_dir=log_path ,          # Directory where the model checkpoints and training outputs will be saved.
    max_steps = 200 ,                 # Limits the total number of training steps. Training will stop after 80 steps regardless of epochs.
    save_strategy='epoch',    # Uncommenting this would change the strategy for saving model checkpoints. 'epoch' means save after each epoch.
    optim="paged_adamw_8bit",     # Specifies the optimizer to use. it's set to a specific variant of AdamW.
    lr_scheduler_type = 'cosine',     # Specifies the learning rate scheduler type. 'cosine' means it uses cosine annealing.
    warmup_ratio = 0.05,           # Specifies the ratio of total steps for the learning rate warmup phase.
)

In [10]:
trainer = transformers.Trainer(
    model=base_model,
    train_dataset=tokenized_dataset,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
base_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,3.291
20,3.2485
30,3.0477
40,3.2325
50,3.0333
60,3.1085
70,3.1417
80,3.1282
90,3.0355
100,3.2073


TrainOutput(global_step=200, training_loss=3.1530834293365477, metrics={'train_runtime': 372.1503, 'train_samples_per_second': 2.15, 'train_steps_per_second': 0.537, 'total_flos': 3.25954440265728e+16, 'train_loss': 3.1530834293365477, 'epoch': 0.07})

In [11]:
base_model.save_pretrained(log_path)

In [12]:
trainer.save_model(log_path)

In [13]:
model_dir = log_path

In [14]:
# Load the configuration for the trained model
config = PeftConfig.from_pretrained(model_dir)

In [15]:
# Load the trained model using the loaded configuration and other parameters
model = AutoModelForCausalLM.from_pretrained(
	config.base_model_name_or_path,
	return_dict=True,
	quantization_config=quantization_config,
	device_map="auto",
	trust_remote_code=True,
)

configuration_py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.




modeling_py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- modeling_py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [16]:
# Load the tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)



tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [17]:
# Set the padding token of the tokenizer to its end-of-sentence token
tokenizer.pad_token = tokenizer.eos_token

In [18]:
#Inference
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id





In [20]:
def generate_response(chapter : str) -> str:
    
    
	prompt =  f"""
    "Summarize the following : \n" {chapter}
    \n\nSummary: 
    """.strip()
	encoding = tokenizer(prompt, return_tensors = "pt").to(DEVICE)
	#with torch.inference_mode():
    with torch.no_grad():
		outputs = model.generate(
			input_ids=encoding.input_ids,
			attention_mask=encoding.attention_mask,
			generation_config=generation_config,
		)

	response = tokenizer.decode(outputs[0], skip_special_tokens=True)
	#assistant_start =  "<assistant>:"
	#response_start = response.find(assistant_start)
	#return response[response_start + len(assistant_start) : ].strip()

    return response.strip()




IndentationError: unindent does not match any outer indentation level (<tokenize>, line 10)

In [None]:
#prompt

chapter = "text to be summarised"
print (generate_response(chapter))


