## Use powerful LLM to generate synthetic answer for a given question - context pair.

This will create a dataset of question - context - answer triplets.

In [1]:
# provide project root path
# ProjectRoot = "/content/drive/MyDrive/UMich Capstone/NoteBooks/"
ProjectRoot = "/home/sangram/Tutorbot_capstone/git_hub/Tutorbot/"
DatasetRoot = ProjectRoot + "Dataset/"

In [2]:
try:
    import transformers
except ImportError:
    !pip install transformers

try:
    import accelerate
except ImportError:
    !pip install accelerate


In [3]:
from transformers import __version__
print(__version__)

4.43.4


In [4]:
import pandas as pd
import numpy as np
import json
import os
import re
from tqdm import tqdm

# LLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import accelerate

In [5]:
# load context and question test set which was created by doc2query
train_df = pd.read_csv(DatasetRoot + 'q_a_trainset.csv')
test_df = pd.read_csv(DatasetRoot + 'q_a_testset.csv')


In [6]:
# loading full article from json file
with open(DatasetRoot + 'raw_knowledge.json', 'r') as f:
    raw_text_json = json.load(f)


In [7]:
raw_df = pd.DataFrame(list(raw_text_json.items()), columns=['raw_para_id', 'raw_text'])
raw_df['raw_para_id'] = raw_df['raw_para_id'].astype('int64')

In [8]:
# create dataframe of raw, summarized paragraphs and question
train_df = train_df.merge(raw_df, left_on='raw_para_id', right_on='raw_para_id', how='left')
test_df = test_df.merge(raw_df, left_on='raw_para_id', right_on='raw_para_id', how='left')

In [9]:
if torch.cuda.is_available():
    torch.set_default_device("cuda")
    print("CUDA is available!!")
else:
    raise RuntimeError("CUDA is not available!! LLM cannot run, rerun with GPU")

CUDA is available!!


In [10]:
# Provide Huggingface Login token below to leverage powerful LLMs
os.environ['HUGGINGFACE_TOKEN'] = 'Add your token'
if os.environ['HUGGINGFACE_TOKEN'] == 'Add your token':
    raise ValueError("Token not provided")
    
token = os.environ.get('HUGGINGFACE_TOKEN')

In [11]:
!huggingface-cli login --token $token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/sangram/.cache/huggingface/token
Login successful


### Generate ground truths

Note: Huggingface models are cached under /home/sangram/.cache/huggingface/hub/models--*

In [12]:
!nvidia-smi


Wed Aug  7 18:52:02 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-PCIE-16GB           On  |   00000000:3B:00.0 Off |                    0 |
| N/A   33C    P0             24W /  250W |       3MiB /  16384MiB |      0%   E. Process |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [13]:
# https://huggingface.co/stabilityai/stable-cascade/discussions/11
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [14]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu.


In [15]:
def generate_prompt(context, question):
    prompt_template = """
You are an expert in understanding and interpreting provided text contexts. Given a context and a question, your task is to generate an accurate and informative answer based on the provided context. Here is the structure:

1. **Context:** The detailed text or passage that contains the information needed to answer the question.
2. **Question:** A specific question that needs to be answered based on the context.

Please make sure your response is clear, concise, and directly addresses the question. If the context does not contain sufficient information to answer the question, say I don't know.

**Context:**
{context}

**Question:**
{question}

The response is a valid JSON with fields `explanation` and `response`.
"""
    return prompt_template.format(context=context, question=question)

In [16]:
def AskLLM(context, question):
    prompt = generate_prompt(context, question)

    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        return_attention_mask=True
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    response = outputs[0][input_ids.shape[-1]:]
    answer = tokenizer.decode(response, skip_special_tokens=True)

    # Extract the answer
    json_match = re.search(r'{.*}', answer, re.DOTALL)

    if json_match:
        # extract and parse JSON
        json_string = json_match.group(0)
        response_dict = json.loads(json_string)
        final_answer = response_dict['response']
    else:
        final_answer = "I don't know."

    return final_answer

In [17]:
tqdm.pandas()
# generate ground truths for test set
test_df['Final_answer'] = test_df.progress_apply(lambda row: AskLLM(row.raw_text, row.question), axis=1)

  0%|          | 0/24 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
 33%|███▎      | 8/24 [07:14<13:34, 50.90s/it]

Answer not found for question: How would I describe Data Science in the following sentence:
Based on the provided context, I would describe Data Science as "an applied field growing out of traditional statistics".


100%|██████████| 24/24 [28:50<00:00, 72.11s/it]


In [18]:
# drop raw text since it's present in other dataset file
test_df = test_df.drop(columns='raw_text')

# save to csv files
test_df.to_csv(DatasetRoot + 'q_a_testset.csv', index=False)

In [None]:
tqdm.pandas()
# generate ground truths for train set
train_df['Final_answer'] = train_df.progress_apply(lambda row: AskLLM(row.raw_text, row.question), axis=1)

 46%|████▌     | 44/96 [50:16<1:07:43, 78.15s/it]

In [None]:
# drop raw text since it's present in other dataset file
train_df = train_df.drop(columns='raw_text')

# save to csv files
train_df.to_csv(DatasetRoot + 'q_a_trainset.csv', index=False)