In [2]:
import json
# Collect the "text" field of every JSON record in the evaluation-block file,
# one record per line, in file order.
block_list = []
with open("eval_blocks.jsonl", 'r') as f:
    for line in f:
        record = json.loads(line)
        block_list.append(record["text"])



In [75]:
import pyperclip

# Question-generation instruction that is appended directly after the block text
# (no separator is inserted between the two).
# NOTE(review): the format example uses typographic quotes (“ ”), which are not
# valid JSON string delimiters — confirm that model output produced from this
# prompt still parses with json.loads in the extraction step.
instruction = "Please generate four questions based on the provided text. The questions should represent different types, specifically factoid, list, polar (yes/no), and explanatory. Craft each question in a way that it corresponds to a single context within the text, reducing potential for multiple interpretations - use determiners where necessary for clarity. The answer should be locatable within the text. Provide the responses to these questions in an accurate and concise manner.  Format the output like this: {“question”: “The text of the question goes here”, “type”: “The type of the question goes here”, “answer”:”the text of the answer goes here”}"

# Build the prompt for the evaluation block at index 29.
prompt = block_list[29] + instruction

# Put the finished prompt on the system clipboard.
pyperclip.copy(prompt)


In [76]:
from bs4 import BeautifulSoup
import re
# Load the saved HTML page and build a parse tree from it.
with open('test_extractor.html', 'r') as f:
    html_content = f.read()
soup = BeautifulSoup(html_content, 'html.parser')

# Shortest "{...}" spans on a single line; nested braces are not handled.
json_pattern = re.compile(r'\{.*?\}')

# Every <p> element is a candidate container for generated JSON objects.
p_tags = soup.find_all('p')

# 'w' mode: any previous questions.jsonl content is discarded.
with open('questions.jsonl', 'w') as outfile:
    for tag in p_tags:
        text = tag.get_text(strip=True)
        for potential_json in json_pattern.findall(text):
            try:
                data = json.loads(potential_json)
            except json.JSONDecodeError:
                # Best effort: brace spans that are not valid JSON are skipped.
                continue
            # One parsed object per output line.
            outfile.write(json.dumps(data) + '\n')



In [25]:
# Copy question_block.jsonl to questions.jsonl, dropping lines that are empty
# or contain only whitespace. Kept lines are written unchanged.
with open("question_block.jsonl", 'r') as f, open("questions.jsonl", 'w') as fout:
    fout.writelines(line for line in f if line.strip())

In [77]:
import json

# Tag every question record with a sequential question_id and the id of the
# block it belongs to, then write the result to question_block.jsonl.
#
# Both ids are derived from the number of non-empty records seen so far.
# Numbering from the raw line position would let a skipped blank line consume
# an id, and the ids would no longer match the contiguous 1..N numbering that
# the answers are written with.
QUESTIONS_PER_BLOCK = 4  # four questions are generated for each text block

question_id = 0

with open('questions.jsonl', 'r') as f, open('question_block.jsonl', 'w') as fout:
    for line in f:
        # Remove newline characters and surrounding spaces
        line = line.strip()

        # Blank lines carry no record and must not advance the numbering
        if not line:
            continue

        # Parse first so a malformed line does not consume an id
        obj = json.loads(line)
        question_id += 1

        # Records 1-4 belong to block 1, records 5-8 to block 2, and so on
        obj['block_id'] = (question_id - 1) // QUESTIONS_PER_BLOCK + 1
        obj["question_id"] = question_id

        # Write the updated JSON object to the output file
        fout.write(json.dumps(obj) + '\n')



In [1]:
import torch
# NOTE(review): these two assignments set attributes on the torch.backends.mps
# module; confirm against the PyTorch documentation that they are real settings
# and have an effect.
torch.backends.mps.enabled = True
torch.backends.mps.max_concurrency = 1

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [3]:
from CAQA import *
import json
from dotenv import load_dotenv

# Read the question text of every record in the question/block file.
queries = []
with open("question_block.jsonl", 'r') as f:
    for line in f:
        queries.append(json.loads(line)["question"])

# Load environment variables from a .env file before building the system.
load_dotenv()

# Candidate LLM repository ids; only the entry at index 1 is used below.
llm = [
    "google/flan-t5-xxl",
    "mosaicml/mpt-7b-instruct",
    "tiiuae/falcon-7b-instruct",
    "tiiuae/falcon-7b",
    "bigscience/bloom-560m",
    "bigscience/bloomz",
    "lmsys/vicuna-7b-v1.3",
]

embedding_model = "hkunlp/instructor-xl"

# Start from the default builder ...
caqa_builder = CAQABuilder()

# ... and customize it through the chained setters.
customized_builder = (
    caqa_builder
    .set_llm(llm[1])
    .set_embedding_model(embedding_model)
    .set_llm_params(temperature = 0.3, max_new_tokens = 250)
    .set_chain_type("stuff")
)

# Build the system and report which models it was configured with.
myCAQA = customized_builder.build()
print("*******Embedding model used:  " + myCAQA.embedding_model + "*******")
print("*******Large Language Model used:  " + myCAQA.llm_repo_id + "*******")

# Ask every question in order; keep only the answer text, and print the number
# of source documents returned alongside it.
answers = []
for query in queries:
    response = myCAQA.generate_response(query)
    answer, source_docs = response
    answers.append(answer)
    print("Question: " + query)
    print(len(source_docs))
    print("Answer: " + answer + '\n')
    print("*****************")


load INSTRUCTOR_Transformer


Using embedded DuckDB with persistence: data will be stored in: /Users/wangzhuohan/Desktop/year-2/CAQA


max_seq_length  512


Downloading (…)meta_init_context.py:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- meta_init_context.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)in/param_init_fns.py:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- param_init_fns.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)/custom_embedding.py:   0%|          | 0.00/305 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- custom_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- meta_init_context.py
- param_init_fns.py
- attention.py
- custom_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [6]:
# Sanity check: number of answers collected by the generation loop.
print(len(answers))

120

In [16]:
# The output file is named after the last path component of the LLM and
# embedding model ids: answers_<llm>_<embedding>.jsonl
llm_name = myCAQA.llm_repo_id.split('/')[-1]
embedding_name = myCAQA.embedding_model.split('/')[-1]
file_name = "answers_" + llm_name + "_" + embedding_name + ".jsonl"

# One JSON record per answer; question_id is the 1-based position in `answers`.
with open(file_name, 'w') as f:
    for i, answer in enumerate(answers, start=1):
        record = dict(
            question_id=i,
            answer=answer,
            LLM=myCAQA.llm_repo_id,
            embedding_model=myCAQA.embedding_model,
        )
        f.write(json.dumps(record) + "\n")



answers_falcon-7b-instruct_instructor-xl.jsonl


In [None]:
# Unfinished cell: walks the answers, questions and evaluation-block files.
# Nothing is joined or stored yet; the loops only parse each line and read
# the keys they will need.

# Opens the answers file without reading it (raises if the file is missing).
# NOTE(review): the answer-writing cell names its file
# "answers_<llm>_<embedding>.jsonl" — confirm "answers.jsonl" is the intended input.
with open("answers.jsonl", 'r') as asw:
    pass

# Each question record must provide its text and the id of the block it came from.
with open("questions.jsonl", 'r') as qst:
    for line in qst:
        qs_dict = json.loads(line)
        q = qs_dict["question"]
        # Was `qs_dict[]`, a syntax error that kept the whole cell from running.
        block_id = qs_dict["block_id"]

# NOTE(review): other cells read "eval_blocks.jsonl" and only use its "text"
# key — confirm this file name and that its records carry "block_id".
with open("eval_blocks.json", 'r') as blk:
    for line in blk:
        json.loads(line)["block_id"]

In [5]:
import tiktoken
def num_tokens_from_messages(message, model="gpt-3.5-turbo-0613"):
    """Return the token count for a single message string.

    The count is the number of tokens the encoding produces for ``message``
    plus a fixed overhead of 2 for the message framing and 2 for the reply
    priming.

    Args:
        message: Text to tokenize.
        model: Model name used to look up the encoding. When the lookup raises
            ``KeyError`` the "cl100k_base" encoding is used instead.

    Returns:
        ``len(encoding.encode(message)) + 4``.

    Raises:
        NotImplementedError: If ``model`` is not "gpt-3.5-turbo-0613".
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    # Only this model's overhead is implemented.
    # note: future models may deviate from this
    if model != "gpt-3.5-turbo-0613":
        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.
  See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")

    message_overhead = 2  # every message follows <im_start>{role/name}\n{content}<im_end>\n
    reply_overhead = 2  # every reply is primed with <im_start>assistant
    return message_overhead + len(encoding.encode(message)) + reply_overhead

In [6]:
import json
# Token count of the "text" field of every evaluation block, in file order.
with open("eval_blocks.jsonl", 'r') as f:
    token_counts = [
        num_tokens_from_messages(json.loads(line)["text"]) for line in f
    ]

# Integer (floor) average over all blocks.
total = sum(token_counts)
line_num = len(token_counts)
print(total//line_num)

780


In [9]:
# Average number of tokens per group of four question records.
# Each raw line (the serialized JSON record, as read from the file) is what
# gets tokenized.
with open('questions.jsonl', 'r') as f:
    counter = 0      # records seen in the current group
    total_block = 0  # completed groups of four
    total = 0        # tokens over all counted records
    for line in f:
        # Lines read from a file keep their trailing newline, so `if line:`
        # is always true; strip before testing so blank lines are skipped
        # instead of being counted as questions.
        if not line.strip():
            continue

        counter += 1
        total += num_tokens_from_messages(line)

        # A group is complete after four records
        if counter == 4:
            counter = 0
            total_block += 1

# Records in a trailing incomplete group add to `total` but not to
# `total_block`; with fewer than four records this raises ZeroDivisionError.
print(total//total_block)


124


In [7]:
# Token count of the question-generation instruction on its own (no block text).
instruction_only = "Please generate four questions based on the provided text. The questions should represent different types, specifically factoid, list, polar (yes/no), and explanatory. Make sure the question is designed in a way that only one context applies to the answer, minimizing multiple interpretations(maybe using determiner). The answer should be locatable within the text. Format the output like this: {“question”: “The text of the question goes here”, “type”: “The type of the question goes here”}"
print(num_tokens_from_messages(instruction_only))

104


In [12]:
def estimate_cost(K=30, N=4000):
    """Estimate the total cost of K requests as an input term plus an output term.

    Args:
        K: Number of requests.
        N: Size of one text block; divided by 5 before the instruction overhead
            is added (presumably a characters-to-tokens conversion — TODO confirm).

    Returns:
        ``K*(N/5+104)*0.03/1000 + K*125*0.06/1000`` as a float.
    """
    # 104 matches the instruction token count printed earlier in the notebook;
    # 0.03 is presumably the price per 1000 input tokens — TODO confirm.
    input_cost = K * (N / 5 + 104) * 0.03 / 1000
    # 125 is presumably the rounded per-block output size (the notebook prints
    # 124); 0.06 is presumably the price per 1000 output tokens — TODO confirm.
    output_cost = K * 125 * 0.06 / 1000
    return input_cost + output_cost


estimate_cost()

1.0386