diff --git a/pyproject.toml b/pyproject.toml
index 7ecf4160..a6ef2896 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,15 +21,10 @@ pillow = "^10.3.0"
 langchain-mongodb = "^0.1.5"
 langchain-chroma = "^0.1.1"
 langchainhub = "^0.1.17"
-torch = "2.0.0"
-sentence-transformers = "^3.0.0"
 pypdf = "^4.2.0"
-lark = "^1.1.9"
 ipykernel = "^6.29.5"
 langchain-openai = "^0.1.8"
-deepeval = "^0.21.77"
 openai-ratelimiter = "^0.5"
-ray = "^2.34.0"
 langchain-anthropic = "^0.1.22"
 db-sqlite3 = "^0.0.1"
 peewee = "^3.17.6"
diff --git a/tools/data_generation/data_gen_prompt.py b/tools/data_generation/data_gen_prompt.py
deleted file mode 100644
index 08c992a6..00000000
--- a/tools/data_generation/data_gen_prompt.py
+++ /dev/null
@@ -1,55 +0,0 @@
-default_prompt = """
-You are an expert at understanding financial documents for India and generating datasets.
-The types of texts include PDFs from RBI, and other financial documents, and banking related books
-Your task involves creating question and answer pairs that stand alone without reference to any specific documents.
-These questions and answers will be used independently in future applications such as LLM evaluation and fine-tuning,
-where no background document will be available.
-
-You must follow these rules:
-
-1. Direct Derivation: Answers must be directly derived from the provided content, but can involve implied information by calculating
-2. Self-contained Questions: Ensure that questions are fully answered from the information given and do not imply that there is a larger document.
-3. Clarity and Precision: Questions should be clear, precise, and not ambiguous.
-4. Prohibited References: Explicitly avoid phrases like "according to the document", "in the text", "as mentioned in the document", or any implication of external texts. Do not construct questions that require knowledge of the document's structure or location of information within it.
-5. Context Inclusion: Include the specific information from the content that supports the answer. The context should enable the answer to stand independently of any external text.
-6. Sufficiency of Information: If the content lacks enough information to form a complete question-answer pair, do not force one.
-7. Original Responses: Answers should be paraphrased in your own words; direct copying from the content is not allowed.
-8. Include questions about financial calculations and mathematics
-9. Add questions which involves numbers and numeric questions about banking & account ratio calculations. Try to have calculations in answers
-
-Good generated examples:
-```
-[
-    {
-        "question": "What was Airbnb's revenue in 2023?",
-        "answer": "$9.9 billion",
-        "context": "In 2023, revenue increased by 18% to $9.9 billion compared to 2022, primarily due to a 14% increase in Nights and Experiences Booked of 54.5 million combined with higher average daily rates driving a 16% increase in Gross Booking Value of $10.0 billion."
-    },
-    {
-        "question": "By what percentage did Airbnb's net income increase in 2023 compared to the prior year?",
-        "answer": "The change in revenue if $4.8 - $2.4 billion = $2.4 billion. So the change percentage is $2.4/$4.8 * 100 = 200%",
-        "context": "Net income in 2023 increased to $4.8 billion for $2.4 billion, compared to the prior year, driven by our revenue growth, increased interest income, discipline in managing our cost structure, and the release of a portion of our valuation allowance on deferred tax assets of $2.9 billion."
-    }
-]
-```
-
-Bad generated examples:
-```
-[
-    {
-        "answer": "Part IV hereof refers to a specific section within the document that precedes the inclusion of the consolidated financial statements.",
-        "context": "The term Part IV hereof in the document refers to the section that directly precedes the consolidated financial statements.",
-        "question": "What does Part IV hereof refer to in the context of a document layout?",
-    },
-    {
-        "answer": "No, the consolidated financial statements are not presented directly within the body of the Annual Report on Form 10-K; they are incorporated by reference.",
-        "context": "The consolidated financial statements are incorporated by reference in the Annual Report on Form 10-K, meaning they are not presented in full directly within the documents body.",
-        "question": "Are the consolidated financial statements presented directly within the body of the Annual Report on form 10-K?"
-    }
-]
-```
-
-NEVER mention the words "document", "text", "layout", "filing", "text", "table" in your questions or answers.
-ALWAYS ensure all questions and answers are accurate, self-contained, and relevant without relying on or implying the existence of any original document or text
-while strictly avoiding any fabrication or speculation.
-"""
\ No newline at end of file
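
Reviewer note: the deleted prompt's final "NEVER mention" rule is the kind of constraint models often violate, and it is cheap to enforce mechanically after generation. A minimal post-filter sketch, assuming generated items are dicts with `question`/`answer` keys; the banned-word list is lifted straight from the prompt, everything else here is hypothetical:

```python
# Hypothetical post-filter enforcing the prompt's "NEVER mention" rule.
import re

BANNED = ("document", "text", "layout", "filing", "table")

def violates_prompt_rules(item: dict) -> bool:
    # Word-boundary match so e.g. "context" is not flagged for containing "text".
    blob = f"{item['question']} {item['answer']}".lower()
    return any(re.search(rf"\b{word}\b", blob) for word in BANNED)

def filter_items(items: list[dict]) -> list[dict]:
    # Drop generated pairs that leak references to the source document.
    return [item for item in items if not violates_prompt_rules(item)]
```
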
diff --git a/tools/data_generation/data_set_gen_tool.py b/tools/data_generation/data_set_gen_tool.py
deleted file mode 100644
index 6bd5e01c..00000000
--- a/tools/data_generation/data_set_gen_tool.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from typing import List
-
-def generate_dataset() -> List[dict]:
-    return [
-        {
-            "type": "function",
-            "function": {
-                "name": 'generate_dataset',
-                "description": 'This function generates a list of dataset items. Each dataset item has a question, answer, and context',
-                "parameters": {
-                    "type": 'object',
-                    "properties": {
-                        "dataset_items": {
-                            "type": 'array',
-                            "items": {
-                                "type": 'object',
-                                "properties": {
-                                    "question": {
-                                        "type": 'string',
-                                        "description": 'The generated question.',
-                                    },
-                                    "answer": {
-                                        "type": 'string',
-                                        "description": 'The generated answer.',
-                                    },
-                                    "context": {
-                                        "type": 'string',
-                                        "description": "The context that the question and answer are generated from.",
-                                    },
-                                },
-                                "required": ['question', 'answer', 'context'],
-                            },
-                        }
-                    },
-                    "required": ['dataset_items'],
-                },
-            }
-        }
-    ]
\ No newline at end of file
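
Reviewer note: this hand-written JSON schema duplicates the `DatasetItem` pydantic model deleted alongside it in `financial_data_generator.py`. If this tooling is ever resurrected, the schema can be derived from the model so the two cannot drift. A minimal sketch, assuming pydantic v2 (where `model_json_schema()` is the schema-export API); the wrapper model name is hypothetical:

```python
# Hypothetical alternative: derive the tool schema from pydantic models
# instead of hand-writing the nested JSON.
from typing import List
from pydantic import BaseModel

class DatasetItem(BaseModel):
    question: str
    answer: str
    context: str

class DatasetItems(BaseModel):
    dataset_items: List[DatasetItem]

def generate_dataset() -> List[dict]:
    return [{
        "type": "function",
        "function": {
            "name": "generate_dataset",
            "description": "Generates a list of dataset items, each with a question, answer, and context.",
            # Emits an equivalent JSON schema (nested models appear under $defs).
            "parameters": DatasetItems.model_json_schema(),
        },
    }]
```
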
diff --git a/tools/data_generation/financial_data_generator.py b/tools/data_generation/financial_data_generator.py
deleted file mode 100644
index f5b6f8c6..00000000
--- a/tools/data_generation/financial_data_generator.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import json
-import time
-import copy
-import ray
-import pickle
-from tqdm import tqdm
-from openai import OpenAI
-from tools.data_generation.data_gen_prompt import default_prompt
-from tools.data_generation.data_set_gen_tool import generate_dataset
-from typing import List
-from pydantic.main import BaseModel
-
-class DatasetItem(BaseModel):
-    question: str
-    answer: str
-    context: str
-
-
-class Dataset(BaseModel):
-    items: List[DatasetItem]
-
-from openai import AzureOpenAI
-
-@ray.remote(num_cpus=0)
-def chat_completion_request(model: str, messages, pidx, tools=None, tool_choice=None, max_retries=2):
-    client = AzureOpenAI(
-        api_key="",
-        api_version="2024-02-15-preview",
-        azure_endpoint="endpoint",
-        azure_deployment="gpt-4"
-    )
-    retry_count = 0
-    while retry_count <= max_retries:
-        try:
-            response = client.chat.completions.create(
-                model=model,
-                messages=messages,
-                tools=tools,
-                tool_choice=tool_choice,
-            )
-            return (pidx, response)
-        except Exception as e:
-            print("Unable to generate ChatCompletion response")
-            print(f"Exception: {e}")
-            retry_count + 1
-    return (pidx, "")
-
-class DatasetGenerator:
-
-    def __init__(self, client: OpenAI):
-        self.client = client
-        ray.init("auto")
-
-    def generate_from_texts(
-        self,
-        texts: List[str],
-        max_questions=10,
-        **kwargs,
-    ) -> Dataset:
-        # Get optional system prompt from kwargs
-        system_prompt = kwargs.get("system_prompt", default_prompt)
-
-        # Determine how many questions to generate per text
-        num_texts = len(texts)
-        questions_per_text = max_questions // num_texts
-
-        progress_bar = tqdm(total=max_questions, desc="Generating questions", colour='green')
-
-        # Generate dataset items
-        items: List[DatasetItem] = []
-        from typing import List
-        max_concurrent_queries = 25
-        queue: List = copy.copy(texts)
-        start_time = time.time()
-        in_progress, responses = [], []
-
-        while queue or in_progress:
-            try:
-                if len(in_progress) < max_concurrent_queries and queue:
-                    item = queue.pop()
-                    in_progress.append(
-                        chat_completion_request.remote(
-                            model="gpt-4-turbo",
-                            tools=generate_dataset(),
-                            pidx=1,
-                            tool_choice={"type": "function", "function": {"name": "generate_dataset"}},
-                            messages=[
-                                {"role": "system", "content": system_prompt},
-                                {"role": "user", "content": f"Generate {questions_per_text} questions for the following block of text: {item}"}
-                            ],
-                        )
-                    )
-
-                ready, in_progress = ray.wait(in_progress, timeout=0.5)
-                #if verbose:
-                print(
-                    f"# queries un-processed: {len(queue)}, in-progress: {len(in_progress)}, ready: {len(ready)}"
-                )
-                if ready:
-                    pdix, response = ray.get(ready)[0]
-                    tool_call = response.choices[0].message.tool_calls[0]
-                    if tool_call:
-                        function_params = json.loads(tool_call.function.arguments)
-                        dataset_items = function_params.get("dataset_items")
-                        dataset_items = [DatasetItem(**item) for item in dataset_items]
-                        items.extend(dataset_items)
-                        progress_bar.update(len(dataset_items))
-            except Exception as e:
-                print(f"Exception: {e}")
-                print(f"Done in {time.time() - start_time:.2f}sec.")
-                queue.append(item)
-
-        # Ensure the progress bar is closed
-        progress_bar.close()
-
-        dataset = Dataset(
-            items=items[:max_questions],
-        )
-
-        print("Total len: {}".format(len(dataset.items)))
-
-        with open("final_checkpoint_{}.pkl".format(time.time()), 'wb') as f:
-            pickle.dump(dataset.items, f)
-
-        return dataset
-
\ No newline at end of file
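
Reviewer note: the deleted retry loop has a latent bug worth recording before this code disappears: `retry_count + 1` is a bare expression, so the counter never advances and a persistently failing request loops forever instead of giving up after `max_retries`. A minimal corrected sketch with simple exponential backoff; the helper name and backoff schedule are hypothetical, not from the original:

```python
# Hypothetical corrected retry helper: the counter is actually incremented
# (via the for-loop), and failures back off before retrying.
import time

def with_retries(make_request, max_retries: int = 2):
    for attempt in range(max_retries + 1):
        try:
            return make_request()          # success: return immediately
        except Exception as e:
            print(f"Exception: {e}")
            if attempt < max_retries:
                time.sleep(2 ** attempt)   # back off 1s, 2s, 4s, ...
    return None                            # all attempts failed
```
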
diff --git a/tools/data_generation/generate.py b/tools/data_generation/generate.py
deleted file mode 100644
index 594494d7..00000000
--- a/tools/data_generation/generate.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import pickle
-from openai import AzureOpenAI
-from tools.data_generation.financial_data_generator import DatasetGenerator
-
-def get_all_file_names(folder):
-    from os import listdir
-    from os.path import isfile, join
-    file_paths = [folder + "/" + f for f in listdir(folder) if isfile(join(folder, f))]
-    return file_paths
-
-pk_files = get_all_file_names("bin")
-print(pk_files)
-
-
-sample_file = "bin/Elective_Paper_BIL_P.pdf.pkl"
-with open(sample_file, 'rb') as f:
-    account_ratios = pickle.load(f)
-
-
-client = AzureOpenAI(
-    api_key="< api key>",
-    api_version="2024-02-15-preview",
-    azure_endpoint="end point",
-    azure_deployment="gpt-4"
-)
-generator = DatasetGenerator(client)
-
-generator.generate_from_texts(
-    texts=account_ratios,
-    max_questions=len(account_ratios) * 10,
-)
diff --git a/tools/data_generation/generate_chunks.py b/tools/data_generation/generate_chunks.py
deleted file mode 100644
index d05d904c..00000000
--- a/tools/data_generation/generate_chunks.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from langchain_community.document_loaders import TextLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
-from tqdm import tqdm
-import pickle
-
-base_path = "dataset"
-sample_file_path = base_path + "/Elective_Paper_BIL_P.pdf"
-
-def load_chunck_and_get_lines(file_path: str):
-    text_splitter = RecursiveCharacterTextSplitter(
-        # Set a really small chunk size, just to show.
-        chunk_size=2000,
-        chunk_overlap=500,
-        length_function=len,
-        is_separator_regex=False,
-    )
-    if file_path.endswith(".pdf"):
-        loader = PyPDFLoader(file_path)
-        docs = loader.load()
-    else:
-        loader = TextLoader(file_path)
-        docs = loader.load()
-    texts = text_splitter.split_documents(docs)
-    return list(map(lambda x: x.page_content, texts))
-
-def get_all_file_names(folder):
-    from os import listdir
-    from os.path import isfile, join
-    file_paths = [folder + "/" + f for f in listdir(folder) if isfile(join(folder, f))]
-    return file_paths
-
-def create_text_data():
-    files = get_all_file_names(base_path)
-    progress_bar = tqdm(total=len(files), desc="Generating texts", colour='green')
-    for file_name in files:
-        print(file_name)
-        progress_bar.update(1)
-        lines = load_chunck_and_get_lines(file_name)
-        with open('{}.pkl'.format(file_name), 'wb') as f:
-            pickle.dump(lines, f)
-
-create_text_data()
\ No newline at end of file
diff --git a/tools/data_generation/pickle_to_csv.py b/tools/data_generation/pickle_to_csv.py
deleted file mode 100644
index de3a6dd5..00000000
--- a/tools/data_generation/pickle_to_csv.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import pickle
-
-
-def get_all_file_names(folder):
-    from os import listdir
-    from os.path import isfile, join
-    file_paths = [folder + "/" + f for f in listdir(folder) if isfile(join(folder, f))]
-    return file_paths
-
-pk_files = get_all_file_names("bin/data-bank")
-
-extended_list = []
-for fl in pk_files:
-    with open(fl, 'rb') as f:
-        data = pickle.load(f)
-        extended_list.extend(list(map(lambda x: { "question": x.question, "answer": x.answer, "context": x.context }, data)))
-
-
-print(len(extended_list))
-
-from pandas import DataFrame
-
-df = (DataFrame(extended_list))
-
-import pandas as pd
-from datasets import Dataset
-# Load data into a Pandas DataFrame
-# df = pd.read_csv('data.csv')
-# Convert the DataFrame into a Dataset
-df.to_csv("ind-finance.csv")
-
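
Reviewer note: the deleted `pickle_to_csv.py` carried dead imports (`pandas as pd`, `datasets.Dataset`) and wrote the CSV with an unnamed index column. If the consolidation step is needed again, a tighter sketch of the same behavior, assuming the pickled items still expose `question`/`answer`/`context` attributes and the same `bin/data-bank` layout:

```python
# Hypothetical tightened pickle-to-CSV step: same output, minus the unused
# imports, and without the unnamed index column.
import pickle
from pathlib import Path
from pandas import DataFrame

rows = []
for path in Path("bin/data-bank").glob("*.pkl"):
    with open(path, "rb") as f:
        items = pickle.load(f)
    rows.extend(
        {"question": i.question, "answer": i.answer, "context": i.context}
        for i in items
    )

DataFrame(rows).to_csv("ind-finance.csv", index=False)
```
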
"Keep the answer precise and within 300 words, everything should be in the context of India. Include specific information like dates, places or acts in needed")) - actual_output = llm_result.content - test_case = LLMTestCase( - input=golden.input, - actual_output=actual_output, - expected_output=golden.expected_output, - context=golden.context - ) - test_cases.append(test_case) - - -helpfulness_metric = GEval( - name="Helpfulness", - criteria="Helpfulness - determine if how helpful the actual output is in response with the input.", - evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT], - threshold=0.5 -) -bias_metric = BiasMetric(threshold=0.5) -toxicity_metric = ToxicityMetric(threshold=0.5) -evaluation_dataset = EvaluationDataset(test_cases=test_cases) -results = evaluation_dataset.evaluate([bias_metric, helpfulness_metric, toxicity_metric]) \ No newline at end of file