## Use powerful LLM to generate synthetic answer for a given question - context pair.

This will create a dataset of question - context - answer triplets.

In [2]:
# provide project root path
ProjectRoot = "/content/drive/MyDrive/UMich Capstone/NoteBooks/"
DatasetRoot = ProjectRoot + "Dataset/"

In [3]:
try:
    import transformers
except ImportError:
    !pip install transformers

try:
    import accelerate
except ImportError:
    !pip install accelerate


In [4]:
from transformers import __version__
print(__version__)

4.42.4


In [5]:
import pandas as pd
import numpy as np
import json
import os
import re
from tqdm import tqdm

# LLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import accelerate

In [6]:
# load context and question test set which was created by doc2query
train_df = pd.read_csv(DatasetRoot + 'q_a_trainset.csv')
test_df = pd.read_csv(DatasetRoot + 'q_a_testset.csv')


In [7]:
# loading full article from json file
with open(DatasetRoot + 'raw_knowledge.json', 'r') as f:
    raw_text_json = json.load(f)


In [8]:
raw_df = pd.DataFrame(list(raw_text_json.items()), columns=['raw_para_id', 'raw_text'])
raw_df['raw_para_id'] = raw_df['raw_para_id'].astype('int64')

In [9]:
# create dataframe of raw, summarized paragraphs and question
train_df = train_df.merge(raw_df, left_on='raw_para_id', right_on='raw_para_id', how='left')
test_df = test_df.merge(raw_df, left_on='raw_para_id', right_on='raw_para_id', how='left')

In [10]:
if torch.cuda.is_available():
    torch.set_default_device("cuda")
    print("CUDA is available!!")
else:
    raise RuntimeError("CUDA is not available!! LLM cannot run, rerun with GPU")

CUDA is available!!


In [11]:
# Provide Huggingface Login token below to leverage powerful LLMs
os.environ['HUGGINGFACE_TOKEN'] = 'Add your token'
if os.environ['HUGGINGFACE_TOKEN'] == 'Add your token':
    raise ValueError("Token not provided")

token = os.environ.get('HUGGINGFACE_TOKEN')

In [12]:
!huggingface-cli login --token $token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Generate ground truths

Note: Huggingface models are cached under `/home/<username>/.cache/huggingface/hub/models--*` incase need to be deleted to free up disk space

In [13]:
!nvidia-smi


Sun Aug 11 20:08:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [14]:
# https://huggingface.co/stabilityai/stable-cascade/discussions/11
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [15]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]



In [16]:
# NOTE: Following prompts are created with the aid of ChatGPT

def generate_prompt(context, question):
    prompt_template = """
You are an expert in understanding and interpreting provided text contexts. Given a context and a question, your task is to generate an accurate and informative answer based on the provided context. Here is the structure:

1. **Context:** The detailed text or passage that contains the information needed to answer the question.
2. **Question:** A specific question that needs to be answered based on the context.

Please make sure your response is clear, concise, and directly addresses the question. If the context does not contain sufficient information to answer the question, say I don't know.

**Context:**
{context}

**Question:**
{question}

The response is a valid JSON with fields `explanation` and `response`.
"""
    return prompt_template.format(context=context, question=question)

In [17]:
def AskLLM(context, question):
    prompt = generate_prompt(context, question)

    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        return_attention_mask=True
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    response = outputs[0][input_ids.shape[-1]:]
    answer = tokenizer.decode(response, skip_special_tokens=True)

    # Extract the answer
    json_match = re.search(r'{.*}', answer, re.DOTALL)

    if json_match:
        # extract and parse JSON
        json_string = json_match.group(0)
        response_dict = json.loads(json_string)
        final_answer = response_dict['response']
    else:
        final_answer = "I don't know."

    return final_answer

In [18]:
tqdm.pandas()
# generate ground truths for test set
test_df['Final_answer'] = test_df.progress_apply(lambda row: AskLLM(row.raw_text, row.question), axis=1)

  0%|          | 0/24 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 24/24 [36:34<00:00, 91.46s/it]


In [19]:
# drop raw text since it's present in other dataset file
test_df = test_df.drop(columns='raw_text')

# save to csv files
test_df.to_csv(DatasetRoot + 'q_a_testset.csv', index=False)

In [None]:
tqdm.pandas()
# generate ground truths for train set
train_df['Final_answer'] = train_df.progress_apply(lambda row: AskLLM(row.raw_text, row.question), axis=1)

 28%|██▊       | 27/96 [41:04<1:29:37, 77.93s/it]

In [None]:
# drop raw text since it's present in other dataset file
train_df = train_df.drop(columns='raw_text')

# save to csv files
train_df.to_csv(DatasetRoot + 'q_a_trainset.csv', index=False)