In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel


# Identifiers for the base checkpoint and for the fine-tuned LoRA adapter
# checkpoint that is applied on top of it.
base_model_id = "NousResearch/Llama-2-7b-hf"
path_llm_model = "amazon_lora_llm/llama2-7b-AmazonVPC-finetune/checkpoint-500"

# Quantization settings for the base weights: 4-bit NF4 with double
# quantization enabled, using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer of the base checkpoint; add_bos_token=True asks it to prepend the
# beginning-of-sequence token to every encoded prompt.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)

# Quantized base model; device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Wrap the base model with the adapter weights stored at path_llm_model.
ft_model = PeftModel.from_pretrained(base_model, path_llm_model)

def clean_output(output):
    """Return the text that follows the first ``### Answer:`` marker.

    When the marker is present, everything after its first occurrence is
    returned with leading and trailing whitespace stripped. When it is
    absent, *output* is returned exactly as given (no stripping).
    """
    _, marker, answer = output.partition("### Answer:")
    if not marker:
        # No marker found: hand the text back untouched.
        return output
    return answer.strip()

# Function to generate LLM answer
def generate_llm_answer(question, *, max_new_tokens=300):
    """Generate the fine-tuned model's answer for a single prompt.

    The prompt is tokenized, passed to ``ft_model.generate`` with gradient
    tracking disabled, decoded with special tokens removed, and finally
    post-processed by ``clean_output``.

    Args:
        question: Prompt text handed to the tokenizer.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 300, the value that used to be hard-coded.

    Returns:
        The result of ``clean_output`` applied to the decoded sequence.
    """
    print('question processing')
    # Follow the model's own device rather than assuming "cuda": the model is
    # loaded with device_map="auto", so its placement is not fixed here.
    model_input = tokenizer(question, return_tensors="pt").to(ft_model.device)
    ft_model.eval()
    with torch.no_grad():
        generated = ft_model.generate(**model_input, max_new_tokens=max_new_tokens)
    # generate() returns a batch; only one prompt was supplied, so take row 0.
    raw_output = tokenizer.decode(generated[0], skip_special_tokens=True)
    return clean_output(raw_output)

In [None]:
# Load the evaluation set; the "question" column holds the prompts.
df = pd.read_csv("test.csv")

# Run each question through the model, one row at a time, and store the
# returned text next to the question it belongs to.
df["llm_answer"] = df["question"].map(generate_llm_answer)

# Write questions and answers out together, omitting the index column.
df.to_csv("llm_output_test.csv", index=False)


In [None]:
# Lift the column-width limit so long text cells are shown in full.
pd.options.display.max_colwidth = None
df