In [None]:
!pip install transformers
!pip install peft
!pip install trl
!pip install torch
!pip install bitsandbytes
!pip install accelerate
#!pip install wandb
!pip install datasets

In [None]:
import os
import torch
#import wandb
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    TrainingArguments, 
    logging
)
from peft import LoraConfig, get_peft_model
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
from trl import SFTTrainer, setup_chat_format
import bitsandbytes as bnb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf_token")
login(token=hf_token)

base_model = "Qwen/Qwen2.5-3B-Instruct"
new_model = "qwen2.5-3b-it-tuned"

if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
    !pip install -qqq flash-attn  
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

In [None]:
import os
# Set WANDB_DISABLED before the trainer is built; wandb itself is neither
# installed nor imported in this notebook.
os.environ.update(WANDB_DISABLED="true")

In [None]:
# Load the base checkpoint in 4-bit. `torch_dtype` is passed explicitly so the
# weights that are NOT quantized use the same dtype as `bnb_4bit_compute_dtype`
# (and a half-precision dtype is what the flash-attention backend expects),
# instead of whatever default the loader would pick.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Tokenizer of the same checkpoint; the EOS token doubles as padding token and
# sequences are padded on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Names of the linear projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "o_proj", "k_proj", "v_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Rank-8 LoRA for causal language modelling; every other option keeps its default.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=lora_target_modules,
)

# Reset the chat template to prevent duplication error
tokenizer.chat_template = None

# Order matters: install the chat format on the base model/tokenizer first,
# then wrap the result with the LoRA adapters.
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [None]:
import re

# Read the Q&A training CSV and shuffle it with a fixed seed for reproducibility.
dataset = load_dataset(
    "csv",
    data_files="/kaggle/input/100q-training/combined_100_qna.csv",
).shuffle(seed=42)

def clean_text(text):
    """Collapse every run of whitespace to one space and trim both ends.

    Args:
        text: Value to clean. ``None`` or a float NaN (e.g. a missing CSV cell)
            yields an empty string; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        The cleaned string.
    """
    # `text != text` is only true for NaN, which is how pandas marks missing cells.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r'\s+', ' ', str(text)).strip()

def custom_chat_template(batch):
    """Render one dataset row as a chat-formatted training prompt.

    The function is applied through ``dataset.map`` without ``batched=True``,
    so despite its name ``batch`` is a single row (a dict of column values).

    Args:
        batch: Row providing the 'question', 'answer', 'context' and
            'reasoning' fields.

    Returns:
        The same row with an added 'prompt' field holding the text produced by
        ``tokenizer.apply_chat_template``.

    Raises:
        Any exception raised by ``tokenizer.apply_chat_template``. It is left to
        propagate on purpose: swallowing it would return a row without 'prompt',
        which is the field the trainer reads.
    """
    # Clean the text fields of the row
    question = clean_text(batch['question'])
    answer = clean_text(batch['answer'])
    context = clean_text(batch['context'])
    reasoning = clean_text(batch['reasoning'])

    # Conversation layout: the question as the user turn, the context as a
    # system turn, and the answer followed by its reasoning as the assistant turn.
    # The two parts are joined by one space (the previous f-string embedded the
    # literal characters " + ' ' + " into every training target).
    row_json = [
        {"role": "user", "content": question},
        {"role": "system", "content": context},
        {"role": "assistant", "content": f"{answer} {reasoning}"}
    ]

    # Render the conversation to plain text with the tokenizer's chat template
    batch['prompt'] = tokenizer.apply_chat_template(row_json, tokenize=False)

    # Return the row with the new 'prompt' field
    return batch

# Progress banner printed before the mapping starts.
for banner_line in ('~~~~~>', 'Applying custom chat template', '~~~~~>'):
    print(banner_line)

# map() is called without batched=True, so the function runs once per row
# and adds the 'prompt' column to every split.
dataset = dataset.map(custom_chat_template)

# Completion banner.
for banner_line in ('~~~~~>', 'Success', '~~~~~>'):
    print(banner_line)


In [None]:
# Display the training split to check the result of the mapping step
# (the row function adds a 'prompt' column).
dataset['train']


In [None]:
# Mixed precision follows the dtype picked when the GPU capability was probed:
# bf16 when bfloat16 was selected, fp16 otherwise. Hard-coding fp16 would
# disagree with the bfloat16 compute dtype used by the 4-bit layers.
use_bf16 = torch_dtype == torch.bfloat16

# `model` is already a PEFT model (wrapped by get_peft_model), so no
# `peft_config` is handed to the trainer - the adapters must not be applied twice.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset["train"],
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        warmup_steps = 2,
        max_steps = 15,              # short run: 15 optimizer steps in total
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,           # log the loss at every step
        do_eval=False,
        output_dir = new_model,
        optim = "paged_adamw_8bit"
    ),
    dataset_text_field = 'prompt',   # column produced by custom_chat_template
    tokenizer = tokenizer
)

# Turn off the key/value cache on the model config for training.
model.config.use_cache = False

In [None]:
# Run fine-tuning with the trainer configured in the previous cell; the call's
# return value is displayed as the cell output.
trainer.train()

In [None]:
# Clear all files and directories in /kaggle/working
!rm -rf /kaggle/working/*

trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

trainer.model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_model')

In [None]:
from peft import PeftModel

# Load the tokenizer that was saved next to the adapter. It carries the chat
# template installed by setup_chat_format for training, so inference prompts are
# rendered exactly like the training prompts. (Re-loading the base tokenizer
# afterwards would silently switch back to the base model's own template.)
tokenizer = AutoTokenizer.from_pretrained(new_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Fresh copy of the base model, with its embedding matrix resized to the
# tokenizer length as was done before training.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto"
)
model.resize_token_embeddings(len(tokenizer))

# Attach the trained adapter and fold its weights into the base model.
# NOTE(review): the merge targets a 4-bit quantized base, which presumably adds
# rounding error - consider loading the base unquantized for the merge if
# memory allows.
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

In [None]:
# Smoke test: one question about one invoice, laid out like the training rows
# (question as the user turn, invoice JSON as a system turn).
message = [
    {"role": "user", "content": "What is the GSTIN of the company billing to Manjrekar & Sons?"},
    {"role": "system", "content": '{"Key_value_data": [{"Invoice No.": "RTR354", "Invoice Date": "2-Dec-20", "Invoice From-Company Name": "National Enterprises", "Invoice From-Company GSTIN": "29AACCT3705E000", "Invoice From-Company Email": "", "Invoice From-Company Phone No.": "", "Invoice From-Company Address": "HSR Layout\nBangalore", "Invoice To-Company Name": "Manjrekar & Sons", "Invoice To-Company GSTIN": "27BJPJK0301P1ZT", "Invoice To-Company Email": "", "Invoice To-Company Phone No.": "", "Invoice To-Company Address": " ", "Gross Amount": "500.00", "Tax Amount": "60.00", "Discount": "", "Total Invoice Amount": "560.00"}], "Table_data_1": [{"S No.": "1", "Description": "12MM**", "HSN/SAC": "1004", "Quantity": "10", "Rate": "50.00", "Unit": "No", "Amount": "500.00"}], "Table_data_2": [{"HSN/SAC": "1004", "Value of Supply": "", "Taxable Value": "500.00", "Central Tax Amount": "", "State Tax Amount": "", "Total Tax Amount": "60.00"}]}'}
]

prompt = tokenizer.apply_chat_template(message, tokenize = False, add_generation_prompt = True)
# Place the inputs on the device the model lives on instead of assuming 'cuda'.
inputs = tokenizer(prompt, return_tensors = 'pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens = 100)

# The generated sequence starts with the prompt tokens; decode only what follows.
# Splitting the decoded text on the word 'assistant' would cut off any answer
# that contains that word and would leave special tokens in the output.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True).strip()
print(response)

In [None]:
import pandas as pd
df = pd.read_csv('/kaggle/input/inference-test-set/inference-test-set.csv')
questions_df = df['question']  # the test set must provide a 'question' column

# Invoice JSON that every question in the test set is answered against.
context = '{"Key_value_data": [{"Invoice No.": "4545 / 2017-18", "Invoice Date": "15/09/2017", "Invoice From-Company Name": "BLUE SKY INDIA LIMITED", "Invoice From-Company GSTIN": "078DFGHJ412421SF", "Invoice From-Company Email": "test@gmail.com", "Invoice From-Company Phone No.": "+91645641236485", "Invoice From-Company Address": "Plot No. 44, Sector-20, Dwarka- 110075", "Invoice To-Company Name": "TechGuruPlus", "Invoice To-Company GSTIN": "656564566345454", "Invoice To-Company Email": "", "Invoice To-Company Phone No.": "01356656565", "Invoice To-Company Address": "C-172, Okhla Industrial Area, \nNew Delhi-110020", "Gross Amount": "10960.00", "Tax Amount": "1952.00", "Discount": "", "Total Invoice Amount": "12932.00"}], "Table_data_1": [{"S No.": "45", "Description": "ITEM NAME 1", "HSN/SAC": "4556", "Quantity": "20.00", "Rate": "", "Unit": "", "Amount": "900.00"}, {"S No.": "23", "Description": "ITEM NAME 2", "HSN/SAC": "8978", "Quantity": "40.00", "Rate": "", "Unit": "", "Amount": "920.00"}, {"S No.": "56", "Description": "ITEM NAME 3", "HSN/SAC": "5645", "Quantity": "50.00", "Rate": "", "Unit": "", "Amount": "2800.00"}, {"S No.": "89", "Description": "ITEM NAME 4", "HSN/SAC": "2312", "Quantity": "60.00", "Rate": "", "Unit": "", "Amount": "5340.00"}]}'

responses = []

# Generate one response per question
for question in questions_df:

    # Same conversation layout as in training: question first, then the context
    message = [
        {"role": "user", "content": question},
        {"role": "system", "content": context}
    ]

    # Format the prompt using the chat template
    prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

    # Tokenize on the model's device and run generation
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the tokens generated after the prompt. Splitting the decoded
    # text on the word 'assistant' would cut off any answer containing that word
    # and would keep special tokens in the response.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    responses.append(response)

# One row per question, paired with the model's answer
output_df = pd.DataFrame({
    'question': questions_df,
    'Model Response': responses
})
# Save the question/response table as an Excel workbook
output_df.to_excel('qwen_2.5_3b_it_responses.xlsx', index=False)

print("Responses sheet has been generated")