In [1]:
pip install -U pandas transformers datasets peft gdown bitsandbytes

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import gdown

gdown.download(f"https://drive.google.com/uc?id=1Djk8nKbuJOXYXUiZc2c0eQtWRdn3MrJ2", "Train_ledgar.tsv", quiet=False)

gdown.download(f"https://drive.google.com/uc?id=1jCeARWkBbkW1iaTD4VKZYirI1ZzURXC4", "Test_ledgar.tsv", quiet=False)


Downloading...
From (original): https://drive.google.com/uc?id=1Djk8nKbuJOXYXUiZc2c0eQtWRdn3MrJ2
From (redirected): https://drive.google.com/uc?id=1Djk8nKbuJOXYXUiZc2c0eQtWRdn3MrJ2&confirm=t&uuid=6a8f5207-0f8e-495b-bfdc-9aec8dc9fbc5
To: /workspace/Train_ledgar.tsv
100%|██████████| 269M/269M [00:03<00:00, 77.8MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1jCeARWkBbkW1iaTD4VKZYirI1ZzURXC4
To: /workspace/Test_ledgar.tsv
100%|██████████| 42.1M/42.1M [00:00<00:00, 54.4MB/s]


'Test_ledgar.tsv'

In [3]:
import argparse

import pandas as pd
import torch
import transformers
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


In [4]:
CONFIG = {
    'falcon': ["query_key_value"],
    'llama': ["q_proj", "k_proj", "v_proj", "o_proj"],
    'mistral': ["q_proj", "k_proj", "v_proj", "o_proj"],
    'openai': ["q_proj", "k_proj", "v_proj", "o_proj"]
}

In [5]:
args = argparse.Namespace(
    model_name = 'meta-llama/Llama-2-7b-chat-hf',
    model_type = 'llama',
    batch_size = 2,
    eval_steps = 100,
    logging_steps = 100,
    model_output_path="output_dir",
)

In [6]:
from huggingface_hub import login

# Replace 'your_token_here' with your actual Hugging Face token
login(token="your_token_here")

In [7]:
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
data_path = ''

In [8]:
def tokenize_inputs(text_input):
    tok_full_prompt = tokenizer(text_input, padding=True, truncation=False)
    return tok_full_prompt


def tokenize_prompt(text_input):
    tok_full_prompt = tokenizer(text_input['instructions'], padding=True, truncation=False)
    return tok_full_prompt


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [9]:

dataset = pd.read_csv('Train_ledgar.tsv', sep='\t')

dataset = dataset[:100]
data = Dataset.from_pandas(dataset[['instructions']])
splits = data.train_test_split(test_size=0.2, shuffle=True, seed=42)
data = splits['train'].shuffle()
validation_data = splits["test"].shuffle()

data = data.map(tokenize_prompt, batch_size=args.batch_size)
validation_data = validation_data.map(tokenize_prompt, batch_size=args.batch_size)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    args.model_name,
    # load_in_8bit=True,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto",
    # max_memory={0: "20GIB", 1: "20GIB"},
    # offload_folder="offload", offload_state_dict=True,
    trust_remote_code=True
)

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=CONFIG[args.model_type],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


In [11]:
training_args = transformers.TrainingArguments(
    gradient_accumulation_steps=1,
    # auto_find_batch_size=True,
    per_device_train_batch_size=args.batch_size,
    warmup_steps=1000,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=False,
    eval_strategy="steps",
    eval_steps=args.eval_steps,
    save_steps=args.eval_steps,
    load_best_model_at_end=True,
    save_total_limit=4,
    logging_steps=args.logging_steps,
    output_dir=args.model_output_path if args.model_output_path else "output_dir",
    save_strategy='steps',
    optim="adamw_torch",
    warmup_ratio=0.05,
    report_to="none"
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    eval_dataset=validation_data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
model.save_pretrained(f'finetuned_{str(args.model_name).replace("/", "_")}{args.model_output_path}')
with open(f'finetuned_{str(args.model_name).replace("/", "_")}{args.model_output_path}/stat.txt', 'w') as f:
    f.write(f'model used : {args.model_name}')

Step,Training Loss,Validation Loss
100,2.1136,1.394455


========================================Inferencing============================================

In [12]:
import argparse
import gc

import pandas as pd
import torch
from peft import PeftModel, PeftConfig
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM


In [13]:
args = argparse.Namespace(
    model_name = 'finetuned_meta-llama_Llama-2-7b-chat-hfoutput_dir',
    batch_size = 2,
)

config = PeftConfig.from_pretrained(args.model_name)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [14]:
model_inf = PeftModel.from_pretrained(model, args.model_name)

# set teh generation configuration params
gen_config = model_inf.generation_config
gen_config.max_new_tokens = 300
gen_config.temperature = 0.2
gen_config.top_p = 0.7
gen_config.num_return_sequences = 1
gen_config.pad_token_id = tokenizer.eos_token_id
gen_config.eos_token_id = tokenizer.eos_token_id



In [15]:
dataset = pd.read_csv('Test_ledgar.tsv', sep='\t')

In [16]:
out_list = []

num = 0
data_list = dataset['instructions'].to_list()

total_no = len(dataset)
with torch.inference_mode():
    with tqdm(total=total_no) as pbar:
        pbar.set_description('inference progress')
        for i in range(0, total_no, args.batch_size):
            prev_num = num
            num = num + args.batch_size
            data_batch = data_list[prev_num:num]
            print(f'processing : {num}/{total_no}')
            # encode the prompt
            encoding = tokenizer(data_batch, padding=True, truncation=False, return_tensors="pt").to(model.device)
            # do the inference
            outputs = model.generate(input_ids=encoding.input_ids, attention_mask=encoding.attention_mask,
                                     generation_config=gen_config)
            detach = outputs.detach().cpu().numpy()
            outputs = detach.tolist()
            out_list.extend([tokenizer.decode(out, skip_special_tokens=True) for out in outputs])
            pbar.update(args.batch_size)

predictions = pd.DataFrame({'predictions': out_list})
flat_model_name = str(args.model_name).replace('/', '')
predictions.to_csv('predictions.tsv', sep='\t', index=False)

inference progress:   0%|          | 0/60000 [00:00<?, ?it/s]

processing : 2/60000


inference progress:   0%|          | 0/60000 [01:22<?, ?it/s]


KeyboardInterrupt: 