In [1]:
from llama_index.core import SimpleDirectoryReader, ServiceContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import pipeline
from langchain_text_splitters import RecursiveCharacterTextSplitter
import torch
from langchain_community.document_loaders import DirectoryLoader
from huggingface_hub import login
import json
from tqdm import tqdm
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Authenticate with the Hugging Face Hub without storing the secret in the notebook:
# the token is read from the HF_TOKEN environment variable, and the user is
# prompted for it (input hidden) when the variable is not set.
# NOTE(review): a token was previously hard-coded in this cell; treat it as
# leaked and revoke it in the Hugging Face account settings.
import os
from getpass import getpass

login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [15]:
# Load the files matching '*.txt' from the raw-data folder, showing a progress
# bar and reading with multiple threads.
file_path = 'Annotation/Annotated_rawdata/new_data'
loader_options = dict(glob='*.txt', show_progress=True, use_multithreading=True)
loader = DirectoryLoader(file_path, **loader_options)
documents = loader.load()

100%|██████████| 6/6 [00:00<00:00,  7.23it/s]


In [16]:
# Split every loaded document separately (chunk_size=1024, chunk_overlap=100)
# and gather the resulting chunks into one flat list.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
chunks = []
for doc in documents:
    chunks.extend(text_splitter.split_documents([doc]))


In [17]:
# Keep only the text (page_content) of each chunk.
chunks_content = []
for chunk in chunks:
    chunks_content.append(chunk.page_content)

In [18]:
# Total number of chunks. Only a random sample of them is used as context
# passages (see the sampling cell that follows).
len(chunks_content)

405

In [19]:
# Draw the context passages that will be sent to the model.
# The folders listed below get a sample of 100 chunks; every other folder gets 50.
LARGE_SAMPLE_FOLDERS = ['generalinfo_cmu', 'generalinfo_pittsburgh', 'eventspittsburgh']
folder_name = file_path.split('/')[-1]
sample_size = 100 if folder_name in LARGE_SAMPLE_FOLDERS else 50
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the number of available chunks.
context_sample = random.sample(chunks_content, min(sample_size, len(chunks_content)))

In [20]:
# Confirm how many context passages were sampled.
print(len(context_sample))

50


In [21]:
def prompt_generation(examples, context, num_questions) -> str:
    """Build the few-shot prompt asking the model for question-answer pairs.

    Args:
        examples: Iterable of mappings, each providing a 'question' and an
            'answer' entry; they are rendered as the few-shot examples.
        context: Passage the model should base its questions on.
        num_questions: Number of pairs requested in the prompt text.

    Returns:
        The full prompt: instructions, examples, context, and the expected
        output format, in that order.
    """
    instructions = (
        "You are an AI assistant trained for data annotation.\n"
        "Your task is to generate **question-answer pairs** based on the given factual context. You will be given a context passage, and you will select a fact from the context, then ask a question from it, and then provide the answer to the asked question based on the selected fact. "
        "Ensure the questions are well-formed, unambiguous, and directly answerable using the provided context. Avoid speculative, open-ended questions, or generate any new context. "
        "These are some examples with questions and answers as well as the fact that help answer that question: \n"
    )
    # One "Question/Answer" stanza per example, each followed by a blank line.
    shots = "".join(
        f"Question: {shot['question']}\nAnswer: {shot['answer']}\n\n"
        for shot in examples
    )
    task = (
        f"Here is the context for the question and answer generation task: {context}\n\n"
        f"Extracted {num_questions} question-answer pairs based on the given context using the following format: \n"
        "Question: \n"
        "Answer: \n\n"
        "Only return the question and answer you generated. Do not include any additional information."
    )
    return instructions + shots + task

In [22]:
def result_formated(result) -> str:
    """Drop the echoed prompt from a generation and tag the generated questions.

    The prompt ends with the sentence 'Do not include any additional
    information.' and the generated text starts with the prompt, so everything
    after the last occurrence of that sentence is the model's own output.
    The sentence is matched without requiring trailing newlines: if the
    completion does not begin with a blank line, the prompt is still removed
    instead of being returned (and tagged) together with the answer.

    Args:
        result: Full generated text (prompt followed by the completion).

    Returns:
        The completion with leading whitespace removed and every occurrence
        of 'Question' replaced by '<Generated>Question'. When the sentence is
        absent, the whole input is processed.
    """
    prompt_end = 'Do not include any additional information.'
    # rsplit(..., 1)[-1] keeps the text after the LAST occurrence of the sentence.
    answers = result.rsplit(prompt_end, 1)[-1].lstrip()
    answers = answers.replace('Question', '<Generated>Question')
    return answers

In [23]:
# Release cached, unused GPU memory held by PyTorch before the model is loaded
# in the next cell.
torch.cuda.empty_cache()

In [24]:
# Generate one question-answer pair per sampled context and save them all to a
# single text file.
# Files are opened with an explicit UTF-8 encoding: the contexts contain
# non-ASCII characters, and a platform-default encoding could make the final
# write fail after the whole inference loop has already run.
with open('Annotation/example.json', 'r', encoding='utf-8') as f:
    examples = json.load(f)
pipe = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.2",
    max_new_tokens=512,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)
generated_blocks = []
for context in tqdm(context_sample, desc="Inference process"):
    prompt = prompt_generation(examples, context, 1)
    results = pipe(prompt)
    answers = result_formated(results[0]['generated_text'])
    # Each generation is followed by a blank line in the output file.
    generated_blocks.append(answers + '\n\n')
augmented_data = ''.join(generated_blocks)
# NOTE(review): there is no '/' between "Annotated_data" and the folder name, so
# the file is written as e.g. "Annotation/Annotated_datanew_data.txt" — confirm
# whether "Annotation/Annotated_data/<folder>.txt" was intended.
with open("Annotation/Annotated_data"+file_path.split('/')[-1]+'.txt', 'w', encoding='utf-8') as f:
    f.write(augmented_data)

100%|██████████| 6/6 [17:20<00:00, 173.45s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [01:50<00:00, 36.67s/it]
Device set to use cuda
Inference process:   0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:   2%|▏         | 1/50 [00:02<01:43,  2.12s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:   4%|▍         | 2/50 [00:04<01:36,  2.02s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:   6%|▌         | 3/50 [00:05<01:19,  1.69s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:   8%|▊         | 4/50 [00:06<01:12,  1.57s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:  10%|█         | 5/50 [00:08<01:05,  1.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:  12%|█▏        | 6/50 [00:09<01:07,  1.54s/it]Se

In [50]:
# Re-run the generation loop with the already-loaded `pipe` and overwrite the
# output file.
# Files are opened with an explicit UTF-8 encoding: the contexts contain
# non-ASCII characters, and a platform-default encoding could make the final
# write fail after the whole inference loop has already run.
with open('Annotation/example.json', 'r', encoding='utf-8') as f:
    examples = json.load(f)
generated_blocks = []
for context in tqdm(context_sample, desc="Inference process"):
    prompt = prompt_generation(examples, context, 1)
    results = pipe(prompt)
    answers = result_formated(results[0]['generated_text'])
    # Each generation is followed by a blank line in the output file.
    generated_blocks.append(answers + '\n\n')
augmented_data = ''.join(generated_blocks)
# NOTE(review): there is no '/' between "Annotated_data" and the folder name, so
# the file is written as e.g. "Annotation/Annotated_datanew_data.txt" — confirm
# whether "Annotation/Annotated_data/<folder>.txt" was intended.
with open("Annotation/Annotated_data"+file_path.split('/')[-1]+'.txt', 'w', encoding='utf-8') as f:
    f.write(augmented_data)

Inference process:   0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:   2%|▏         | 1/50 [00:01<01:02,  1.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:   4%|▍         | 2/50 [00:03<01:28,  1.84s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:   6%|▌         | 3/50 [00:05<01:39,  2.13s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:   8%|▊         | 4/50 [00:08<01:44,  2.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:  10%|█         | 5/50 [00:10<01:31,  2.04s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:  12%|█▏        | 6/50 [00:11<01:27,  1.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Inference process:  14%|█▍        | 7/50 [00:13<01:23,  1.94s/it]Setting `pad

In [None]:
# Text-generation pipeline used by the inference cells: Mistral-7B-Instruct
# v0.2 in bfloat16 on the GPU, limited to 512 new tokens per call.
pipe = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.2",
    max_new_tokens=512,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

Downloading shards: 100%|██████████| 3/3 [01:07<00:00, 22.43s/it]
100%|██████████| 3/3 [02:28<00:00, 49.62s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.09s/it]
Device set to use cuda:0


In [None]:
import json

# Few-shot examples as three parallel lists: entry i of each list belongs together.
# NOTE(review): "funded" in the last question/answer looks like a typo for
# "founded" (the matching fact says "founding") — confirm before regenerating
# the file, since these strings end up verbatim in the prompt.
fact = [
    "The first known European to enter the region was the French explorer Robert de La Salle from Quebec during his 1669 expedition down the Ohio River.",
    "In 1859, the Clinton and Soho iron furnaces introduced coke-fire smelting to the region.",
    "The NHL's Pittsburgh Penguins have played in Pittsburgh since the team's founding in 1967.",
]
question = [
    "Who was the first known person to enter the Pittsburgh?",
    "When did coke-fire smelting introduced in Pittsburgh?",
    "When did Penguins funded?",
]
answer = [
    "The first person to enter the Pittsburgh was the French explorer Robert de La Salle",
    "Coke-fire smelting was introduced in Pittsburgh in 1859 by the Clinton and Soho iron furnaces.",
    "Penguins were funded in 1967.",
]
# Bundle the parallel lists into one record per example and store them as JSON.
data = [
    {"fact": fact_text, "question": question_text, "answer": answer_text}
    for fact_text, question_text, answer_text in zip(fact, question, answer)
]
with open('Annotation/example.json', 'w') as f:
    json.dump(data, f)

In [19]:
def prompt_generation(examples, context, num_questions):
    """Build the few-shot prompt asking the model for fact-question-answer triples.

    Args:
        examples: Iterable of mappings, each providing 'fact', 'question' and
            'answer' entries; they are rendered as the few-shot examples.
        context: Passage the model should base its output on.
        num_questions: Number of triples requested in the prompt text.

    Returns:
        The full prompt string: instructions, examples, context, and the
        expected output format, in that order.
    """
    parts = [
        "You are an AI assistant trained for data annotation.\n",
        "Your task is to generate **fact-question-answer pairs** based on the given factual context. You will be given a context passage, and you will select a fact from the context, then ask a question from it, and then provide the answer to the asked question based on the selected fact. ",
        "Ensure the questions are well-formed, unambiguous, and directly answerable using the provided context. Avoid speculative, open-ended questions, or generate any new context. ",
        "These are some examples with questions and answers as well as the fact that help answer that question: \n",
    ]
    # One "Fact/Question/Answer" stanza per example, each followed by a blank line.
    for shot in examples:
        parts.append(
            f"Fact: {shot['fact']}\nQuestion: {shot['question']}\nAnswer: {shot['answer']}\n\n"
        )
    parts.append(f"Here is the context for the fact, question, and answer generation task: {context}\n\n")
    parts.append(f"Extracted {num_questions} fact-question-answer pairs based on the given context using the following format: \n")
    parts.append("Fact: \nQuestion: \nAnswer: \n\n")
    parts.append("Only return the fact, question, and answer you generated. Do not include any additional information.")
    return "".join(parts)

In [23]:
def prompt_generation(examples, context, num_questions):
    """Build the few-shot prompt asking the model for question-answer pairs.

    This variant renders only the 'question' and 'answer' entries of each
    example; a 'fact' entry, if present, is not included in the prompt.

    Args:
        examples: Iterable of mappings providing 'question' and 'answer'.
        context: Passage the model should base its questions on.
        num_questions: Number of pairs requested in the prompt text.

    Returns:
        The full prompt string: instructions, examples, context, and the
        expected output format, in that order.
    """
    header = (
        "You are an AI assistant trained for data annotation.\n"
        "Your task is to generate **question-answer pairs** based on the given factual context. You will be given a context passage, and you will select a fact from the context, then ask a question from it, and then provide the answer to the asked question based on the selected fact. "
        "Ensure the questions are well-formed, unambiguous, and directly answerable using the provided context. Avoid speculative, open-ended questions, or generate any new context. "
        "These are some examples with questions and answers as well as the fact that help answer that question: \n"
    )
    # One "Question/Answer" stanza per example, each followed by a blank line.
    stanzas = [
        f"Question: {shot['question']}\nAnswer: {shot['answer']}\n\n"
        for shot in examples
    ]
    footer = (
        f"Here is the context for the question and answer generation task: {context}\n\n"
        f"Extracted {num_questions} question-answer pairs based on the given context using the following format: \n"
        "Question: \n"
        "Answer: \n\n"
        "Only return the question and answer you generated. Do not include any additional information."
    )
    return header + "".join(stanzas) + footer

In [28]:
# Build and display a sample prompt: the stored few-shot examples plus the
# first chunk as context, asking for two pairs.
with open('Annotation/example.json', 'r') as f:
    examples = json.loads(f.read())
prompt = prompt_generation(examples, chunks_content[0], 2)
print(prompt)

You are an AI assistant trained for data annotation.
Your task is to generate **question-answer pairs** based on the given factual context. You will be given a context passage, and you will select a fact from the context, then ask a question from it, and then provide the answer to the asked question based on the selected fact. Ensure the questions are well-formed, unambiguous, and directly answerable using the provided context. Avoid speculative, open-ended questions, or generate any new context. These are some examples with questions and answers as well as the fact that help answer that question: 
Question: Who was the first known person to enter the Pittsburgh?
Answer: The first person to enter the Pittsburgh was the French explorer Robert de La Salle

Question: When did coke-fire smelting introduced in Pittsburgh?
Answer: Coke-fire smelting was introduced in Pittsburgh in 1859 by the Clinton and Soho iron furnaces.

Question: When did Penguins funded?
Answer: Penguins were funded in

In [29]:
# Run the text-generation pipeline on the sample prompt.
results = pipe(prompt)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [30]:
# Inspect the raw text of the first generation; as the output shows, it starts
# with the prompt itself.
results[0]['generated_text']

'You are an AI assistant trained for data annotation.\nYour task is to generate **question-answer pairs** based on the given factual context. You will be given a context passage, and you will select a fact from the context, then ask a question from it, and then provide the answer to the asked question based on the selected fact. Ensure the questions are well-formed, unambiguous, and directly answerable using the provided context. Avoid speculative, open-ended questions, or generate any new context. These are some examples with questions and answers as well as the fact that help answer that question: \nQuestion: Who was the first known person to enter the Pittsburgh?\nAnswer: The first person to enter the Pittsburgh was the French explorer Robert de La Salle\n\nQuestion: When did coke-fire smelting introduced in Pittsburgh?\nAnswer: Coke-fire smelting was introduced in Pittsburgh in 1859 by the Clinton and Soho iron furnaces.\n\nQuestion: When did Penguins funded?\nAnswer: Penguins were

In [27]:
# Show the first chunk, which serves as the context of the sample prompt.
chunks_content[0]

'Contents\n\nHistory of Pittsburgh\n\nThe history of Pittsburgh began with centuries of Native American civilization in the modern Pittsburgh region, known as Jaödeogë’ in the Seneca language.[1] Eventually, European explorers encountered the strategic confluence where the Allegheny and Monongahela Rivers meet to form the Ohio, which leads to the Mississippi River. The area became a battleground when France and Great Britain fought for control in the 1750s. When the British were victorious, the French ceded control of territories east of the Mississippi.'

In [4]:
# Read the text file line by line and show its first line.
file_path = "raw_data/citypittsburgh/citypittsburgh_conbined.txt"
with open(file_path, "r") as f:
    data = list(f)
print(data[0])

## ADA-Disability-Rights



In [None]:
# Collect the .txt files under raw_data/description_pages, descending into
# subdirectories, and load them as documents.
description_reader = SimpleDirectoryReader(
    input_dir="raw_data/description_pages",
    recursive=True,  # also search inside subdirectories
    required_exts=[".txt"],  # only .txt files are read
)
documents = description_reader.load_data()

3

In [None]:
# Embedding model and LLM for the LlamaIndex experiment.
embed_model = HuggingFaceEmbedding(model_name='all-mpnet-base-v2')
# Uses the same Mistral checkpoint as the text-generation pipeline elsewhere in
# this notebook; tokenizer_name is set explicitly so that tokenizer and model
# come from the same checkpoint.
llm = HuggingFaceLLM(
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.2",
)

In [None]:
# Load the document(s)
# NOTE(review): "your_folder_path" is a placeholder — point it at a real data
# folder before running this cell.
documents = SimpleDirectoryReader("your_folder_path").load_data()

# Use an open-source embedding model
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load an open-source LLM for Q&A generation (Mistral-7B or smaller).
# The checkpoint id includes its version suffix and matches the one used by the
# text-generation pipeline in this notebook.
llm = HuggingFaceLLM(
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16}  # Efficient inference
)

# Create service context
# NOTE(review): ServiceContext is deprecated in recent llama_index releases in
# favour of Settings — confirm against the installed version.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Build the index
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()


In [None]:
# Path to a raw-data folder for events.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
folder_path = '11711-anlp-spring2025-hw2/raw_data/events_pittsburgh_cmu'


In [14]:
# Flatten the roster JSON into plain text: each top-level key becomes a heading
# line, followed by one "Player information" line per entry and a blank gap.
with open('Annotation/Annotated_rawdata/new_data/penguins_roster_with_injuries.txt', 'r') as f:
    content = f.read()
data = json.loads(content)
roster_lines = []
for position in data:
    roster_lines.append(f"{position}\n")
    for player in data[position]:
        # str(player)[1:-1] drops the first and last character of the entry's repr.
        roster_lines.append(f"Player information: {str(player)[1:-1]}\n")
    roster_lines.append('\n\n')
penguins_content = ''.join(roster_lines)
with open('penguins_roster_with_injuries.txt', 'w') as f:
    f.write(penguins_content)