In [None]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U bitsandbytes
!pip install -q -U accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch
import json

In [None]:
# Local checkpoint directory handed to the text-generation pipeline.
model = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Build the pipeline through the module attribute rather than the bare imported
# `pipeline` name. This cell rebinds `pipeline` to the pipeline *object* (later
# cells rely on that name), so calling the bare name again on a re-run would
# invoke the object instead of the factory function.
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,  # half precision weights
    device_map="auto",
)

In [None]:
from time import time

def query_model_batch(task_instruction, user_messages, temperature=0.7, max_length=1024):
    """Generate one sampled completion per user message with the global ``pipeline``.

    Every message is prefixed with ``task_instruction``, suffixed with
    ``" Answer:"``, wrapped as a single user turn and rendered to text with the
    tokenizer's chat template before being sent to the pipeline.

    Args:
        task_instruction: Text prepended verbatim to every message.
        user_messages: Iterable of message strings.
        temperature: Sampling temperature forwarded to the pipeline.
        max_length: Cap on newly generated tokens (forwarded as
            ``max_new_tokens``).

    Returns:
        A list with the generated text (prompt excluded) of the first returned
        sequence for each message, in input order. An empty input yields an
        empty list without calling the model.
    """
    user_messages = list(user_messages)
    if not user_messages:
        # Nothing to generate; avoid invoking the pipeline on an empty batch.
        return []

    tokenizer = pipeline.tokenizer

    batch_prompts = []
    for um in user_messages:
        messages = [
            {"role": "user", "content": task_instruction + um + " Answer:"}
        ]
        batch_prompts.append(
            tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        )

    # Stop on EOS, plus the "<|eot_id|>" end-of-turn marker only when the
    # tokenizer really defines it. An out-of-vocabulary lookup typically comes
    # back as the unk id (or None); passing that along would make an unrelated
    # token terminate generation.
    terminators = [tokenizer.eos_token_id]
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if (
        eot_id is not None
        and eot_id != tokenizer.unk_token_id
        and eot_id not in terminators
    ):
        terminators.append(eot_id)

    # Generate sequences in batch
    sequences = pipeline(
        batch_prompts,
        do_sample=True,
        top_p=0.9,
        temperature=temperature,
        eos_token_id=terminators,
        max_new_tokens=max_length,
        return_full_text=False,
        pad_token_id=pipeline.model.config.eos_token_id,
    )

    # Keep the first returned sequence for each prompt.
    return [seq[0]['generated_text'] for seq in sequences]

In [None]:
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# load the data
def read_json(file_path: str) -> dict:
    """Read a JSON file and return its parsed content.

    The file is decoded as UTF-8 (the standard JSON encoding) instead of the
    platform default, so non-ASCII text loads the same on every system.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)

# custom torch dataset
class RAGDataset(Dataset):
    """Dataset pairing each prompt with its reference answer by position."""

    def __init__(self, prompts, answers):
        """Store the two parallel sequences.

        Args:
            prompts: Sequence of prompts.
            answers: Sequence of answers; ``answers[i]`` belongs to ``prompts[i]``.

        Raises:
            ValueError: If the two sequences differ in length. Without this
                check the mismatch would only show up as an ``IndexError``
                part-way through iteration.
        """
        if len(prompts) != len(answers):
            raise ValueError(
                f"prompts and answers must have the same length "
                f"(got {len(prompts)} and {len(answers)})"
            )
        self.prompts = prompts
        self.answers = answers

    def __len__(self):
        """Return the number of (prompt, answer) pairs."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return the ``(prompt, answer)`` pair stored at position ``idx``."""
        return self.prompts[idx], self.answers[idx]
    
# Instruction text placed in front of every prompt sent to the model.
# Written as explicit literals so the leading newline, the trailing space on
# the second sentence line and the final newline are all visible.
task_instruction = (
    "\n"
    "You have to answer complex questions based on the provided contexts.\n"
    "Respond with as few tokens as possible. You don't need to explain your answer. \n"
    "Don't add any extra information.\n"
)

def get_llm_response(data_path_llm_prompts, save_to_file=None, batch_size=8):
    """Run the model over every prompt in a JSON file and collect the responses.

    The JSON file is expected to hold a mapping whose values each carry a
    ``"prompt"`` and an ``"answer"`` entry. Prompts are sent to
    ``query_model_batch`` in batches together with the module-level
    ``task_instruction``.

    Args:
        data_path_llm_prompts: Path of the prompts JSON file.
        save_to_file: Optional base name; when given, the results are written
            to ``/kaggle/working/<save_to_file>.json``.
        batch_size: Number of prompts handed to the model per call
            (default 8, the previously hard-coded value).

    Returns:
        dict mapping each prompt to ``[response, answer]``. Identical prompts
        share one key, so a later occurrence overwrites an earlier one.
    """
    llm_prompts_dict = read_json(data_path_llm_prompts)

    # extract info.
    entries = list(llm_prompts_dict.values())
    prompts = [entry["prompt"] for entry in entries]
    answers = [entry["answer"] for entry in entries]

    # create dataset and dataloader (order preserved: shuffle is off)
    dataset = RAGDataset(prompts, answers)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # get response from LLM in batches
    llm_responses = {}
    for batch_prompts, batch_answers in tqdm(loader):
        batch_responses = query_model_batch(
            task_instruction, batch_prompts, temperature=0.1, max_length=300
        )

        for i, response in enumerate(batch_responses):
            llm_responses[batch_prompts[i]] = [response, batch_answers[i]]

    # save response to json
    if save_to_file:
        with open(f"/kaggle/working/{save_to_file}.json", 'w') as f:
            json.dump(llm_responses, f, indent=4)

    # Hand the results back so they are not lost when nothing is saved.
    return llm_responses

In [None]:
# Run generation over the prompts file and save the results as JSON.
# NOTE(review): both argument values look like placeholders — replace them with
# the real prompts JSON path and the desired output base name before running.
get_llm_response(data_path_llm_prompts="path-to-prompts", save_to_file="save-name")