## Dependencies

In [None]:
%%capture
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

## Hugging Face authorization

In [None]:
import os

from huggingface_hub import login

# Take the access token from the HF_TOKEN environment variable instead of embedding it in
# the notebook, where it would be saved and shared along with the code. An unset or empty
# variable becomes None, in which case `login` asks for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Path of the JSON training data, and the name under which the fine-tuned model is written
# locally and pushed to the Hub.
dataset_name, new_model = "data.json", "llama-3-8b-chat-vitv1"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Only the tokenizer is needed at this point. The weights are loaded once, further down, into
# `model` with an explicit dtype and device map; loading an additional default-precision copy
# here only occupied memory, because nothing in the notebook ever read it.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

In [None]:
# Model-loading settings: eager attention implementation and float16 weights.
attn_implementation = "eager"
torch_dtype = torch.float16

In [None]:
# Load the base model that will be fine-tuned. The dtype and attention backend come from the
# `torch_dtype` / `attn_implementation` settings defined earlier, so each value is specified
# in exactly one place. `device_map="auto"` lets the loader decide where the weights live.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    torch_dtype=torch_dtype,
    device_map="auto",
    attn_implementation=attn_implementation,
)

In [None]:
# Let TRL's helper prepare the model/tokenizer pair for chat-formatted text; the returned
# objects replace the originals. Later cells depend on `tokenizer.apply_chat_template`.
# NOTE(review): presumably this installs a chat template plus matching special tokens and
# resizes the embeddings accordingly — confirm against the installed TRL version.
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
# LoRA setup: the adapter is attached to the listed projection modules of the model.
lora_target_modules = ['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model so that it carries the adapters described by `peft_config`.
model = get_peft_model(model, peft_config)

In [None]:
# Load every split of the JSON file configured in `dataset_name`, so the data path is
# defined in one place rather than repeated as a literal.
dataset = load_dataset("json", data_files=dataset_name, split="all")


def format_chat_template(row):
    """Add a ``text`` field holding the row rendered through the tokenizer's chat template.

    The row must provide ``question`` and ``answer`` entries; they become the user turn and
    the assistant turn respectively. The row is updated in place and returned.
    """
    conversation = [
        {"role": "user", "content": row["question"]},
        {"role": "assistant", "content": row["answer"]},
    ]
    row["text"] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return row


dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

dataset  # display the dataset summary, which now includes the "text" column

In [None]:
# Hold out 10% of the rows for evaluation. The fixed seed makes the split identical on every
# run, so evaluation numbers from different runs are comparable.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Start a Weights & Biases run in the "VIT-BOT" project; the training arguments below are
# configured with report_to="wandb".
wandb.init(project="VIT-BOT")

In [None]:
# Hyper-parameters for the supervised fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,  # 1 sample x 2 accumulation steps per optimizer update
    optim="paged_adamw_32bit",
    num_train_epochs=7,
    # `eval_strategy` is the current name of this option; the older `evaluation_strategy`
    # spelling is deprecated in recent transformers releases.
    eval_strategy="steps",
    eval_steps=0.2,  # a float below 1 is read as a fraction of the total training steps
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=1e-4,
    # Mixed-precision training is switched off.
    # NOTE(review): the model itself is loaded as float16 — confirm this combination is intended.
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb",
)

In [None]:
# Prefer the GPU, but fall back to the CPU so the cell does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The model was loaded with `device_map="auto"`, which already decided where its weights
# live. Only move it by hand when no device map is attached, so that an existing placement
# (possibly spread over several devices) is not overridden.
if getattr(model, "hf_device_map", None) is None:
    model = model.to(device)

In [None]:
# `model` already carries the LoRA adapters (it was wrapped with `get_peft_model` earlier), so
# no `peft_config` is passed here: handing the trainer both an adapter-wrapped model and an
# adapter config asks it to configure adapters that are already in place.
# NOTE(review): `max_seq_length`, `dataset_text_field`, `tokenizer` and `packing` are accepted
# as trainer keyword arguments only by some TRL releases — confirm against the installed one.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

In [None]:
# Launch fine-tuning with the model, datasets and arguments configured on `trainer`.
trainer.train()

## Inference

In [None]:
# Single-turn smoke test of the fine-tuned model.
messages = [
    {
        "role": "user",
        "content": "Tell me about VIT's leadership team?"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

# Switch to inference mode so that dropout (including the LoRA dropout) is inactive.
model.eval()

# Put the inputs wherever the model lives instead of assuming a CUDA device.
inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True).to(model.device)

# `max_new_tokens` bounds the answer itself; `max_length` would also count the prompt tokens
# and leave little or no room for the answer when the prompt is long.
outputs = model.generate(**inputs, max_new_tokens=100,
                         num_return_sequences=1)

# The returned sequence starts with the prompt tokens. Decode only what comes after them
# rather than splitting the text on the word "assistant", which fails when that word is
# missing from the decoded text and truncates answers that contain it.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

print(text)

In [None]:
# Write the trained model to the local `new_model` directory, then upload it to the Hub
# under the same name.
finetuned_model = trainer.model
finetuned_model.save_pretrained(new_model)
finetuned_model.push_to_hub(new_model, use_temp_dir=False)

## Load and infer

In [None]:
!pip install transformers

In [None]:
# Fetch the published checkpoint and its tokenizer from the Hugging Face Hub.
from transformers import AutoModelForCausalLM, AutoTokenizer

hub_repo_id = "casarulez/merged-vit-bot"

tokenizer = AutoTokenizer.from_pretrained(hub_repo_id)
model = AutoModelForCausalLM.from_pretrained(hub_repo_id)

In [None]:
import torch

# Use the GPU when one is visible and fall back to the CPU otherwise. The result is assigned
# rather than left as a bare expression, so the cell does not print the full module tree.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
# Single-turn query against the model loaded from the Hub.
messages = [
    {
        "role": "user",
        "content": "Who is Viswanathan?"
    }
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

# Put the inputs wherever the model lives instead of assuming a CUDA device.
inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True).to(model.device)

# `max_new_tokens` bounds the answer itself; `max_length` would also count the prompt tokens
# and leave little or no room for the answer when the prompt is long.
outputs = model.generate(**inputs, max_new_tokens=100,
                         num_return_sequences=1)

# The returned sequence starts with the prompt tokens. Decode only what comes after them
# rather than splitting the text on the word "assistant", which fails when that word is
# missing from the decoded text and truncates answers that contain it.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

print(text)

## Processed outputs

In [None]:
import re


def remove_unwanted_content(response, keywords=("question", "what")):
    """Trim a generated reply at the second occurrence of a stop keyword.

    For every keyword, the position of its second (non-overlapping) occurrence in
    ``response`` is located. The text is cut at the earliest of those positions and
    surrounding whitespace is stripped. If no keyword occurs at least twice, the stripped
    response is returned in full.

    Matching ignores the case of ASCII letters and is carried out on the original string.
    Positions are therefore always valid slice indices for ``response``, even when the text
    contains characters whose lowercase form has a different length (such as U+0130).

    Args:
        response: Text to clean up.
        keywords: Substrings whose second occurrence marks the cut-off point. Empty
            entries are ignored.

    Returns:
        The trimmed, whitespace-stripped text.
    """
    cut_points = []
    for keyword in keywords:
        if not keyword:
            continue
        pattern = re.compile(re.escape(keyword), re.IGNORECASE | re.ASCII)
        starts = [match.start() for match in pattern.finditer(response)]
        if len(starts) >= 2:
            cut_points.append(starts[1])

    if not cut_points:
        # No keyword shows up twice: nothing to cut.
        return response.strip()

    # Keep everything before the earliest second occurrence.
    return response[:min(cut_points)].strip()

In [None]:
import torch
import re

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Conversation sent to the model: a system instruction followed by the user's query.
messages = [
    {
        "role": "system",
        "content": (
            "You are VIT-BOT, a virtual assistant designed to answer student questions "
            "regarding university policies, academic procedures, and campus facilities at VIT. "
            "Answer each prompt concisely and directly, focusing only on university-relevant information. "
            "Provide only the answer to the current query without adding any follow-up or additional information."
        )
    },
    {
        "role": "user",
        "content": "What are honours and minors at VIT?"
    }
]

# Normalised user query, used only for the canned-answer lookup below.
user_input = messages[1]["content"].strip().lower()

# Fixed answers that are returned without calling the model.
predefined_responses = {
    "who are you?": "I am VIT-BOT, your personal chat assistant to provide you information about the university.",
    "what can you do?": "I can help you with any queries regarding VIT"
}

if user_input in predefined_responses:
    response = predefined_responses[user_input]
else:
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The tokenizer returns the attention mask together with the input ids, and that mask is
    # used unchanged. Rebuilding it as `input_ids != pad_token_id` whenever the pad and EOS
    # ids coincide would also hide genuine EOS tokens inside the prompt from the model.
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=150,
        num_return_sequences=1,
        # `temperature` and `top_p` only take effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

    # The whole returned sequence is decoded, as before.
    # NOTE(review): the sequence presumably still begins with the prompt, and
    # `remove_unwanted_content` cuts at the *second* keyword occurrence, which looks designed
    # around the first occurrence coming from the prompt — confirm before changing either side.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

cleaned_response = remove_unwanted_content(response)

print(cleaned_response)
