In [None]:
import pandas as pd
from datasets import load_dataset
# Download the four TAT-QA JSON files (train, dev, test, test_gold) from the upstream
# GitHub repository. `-O` pins the local filename, so a re-run overwrites the same files.
!wget https://raw.githubusercontent.com/NExTplusplus/TAT-QA/master/dataset_raw/tatqa_dataset_train.json -O tatqa_dataset_train.json
!wget https://raw.githubusercontent.com/NExTplusplus/TAT-QA/master/dataset_raw/tatqa_dataset_dev.json -O tatqa_dataset_dev.json
!wget https://raw.githubusercontent.com/NExTplusplus/TAT-QA/master/dataset_raw/tatqa_dataset_test.json -O tatqa_dataset_test.json
!wget https://raw.githubusercontent.com/NExTplusplus/TAT-QA/master/dataset_raw/tatqa_dataset_test_gold.json -O tatqa_dataset_test_gold.json


# Parse the train/dev/test files into DataFrames.
# NOTE(review): tatqa_dataset_test_gold.json is downloaded above but never loaded in
# this cell — confirm whether that is intentional.
tatqa_train_df = pd.read_json('tatqa_dataset_train.json')
tatqa_dev_df = pd.read_json('tatqa_dataset_dev.json')
tatqa_test_df = pd.read_json('tatqa_dataset_test.json')

# ------------------------------------fingpt-convfinQA------------------------------------

# Load 'FinGPT/fingpt-convfinqa' through `datasets.load_dataset` and convert its
# 'train' and 'test' splits to DataFrames.
# NOTE: the name `dataset` is rebound further down when the relation-extraction
# dataset is loaded, so it only refers to ConvFinQA within this section.
dataset = load_dataset('FinGPT/fingpt-convfinqa')
convfinqa_train_dataset = dataset['train']
convfinqa_test_dataset = dataset['test']
convfinqa_train_df = pd.DataFrame(convfinqa_train_dataset)
convfinqa_test_df = pd.DataFrame(convfinqa_test_dataset)

# ------------------------------------ fin-qa------------------------------------

# Download the FinQA train/dev/test JSON files from the upstream GitHub repository.
# `-O` renames them with a `finqa_dataset_` prefix so they cannot collide with the
# TAT-QA files saved in the same working directory.
!wget https://raw.githubusercontent.com/czyssrs/FinQA/main/dataset/train.json -O finqa_dataset_train.json
!wget https://raw.githubusercontent.com/czyssrs/FinQA/main/dataset/dev.json -O finqa_dataset_dev.json
!wget https://raw.githubusercontent.com/czyssrs/FinQA/main/dataset/test.json -O finqa_dataset_test.json


# Load the JSON files into DataFrames
finqa_train_df = pd.read_json('finqa_dataset_train.json')
finqa_dev_df = pd.read_json('finqa_dataset_dev.json')
finqa_test_df = pd.read_json('finqa_dataset_test.json')

# ------------------------------------rel-ex-------------------------------------

# Load 'FinGPT/fingpt-finred-re' through `datasets.load_dataset` and convert its
# 'train' and 'test' splits to DataFrames.
# NOTE: this rebinds `dataset`, which previously held the ConvFinQA dataset.
dataset = load_dataset('FinGPT/fingpt-finred-re')
relext_train_dataset = dataset['train']
relext_test_dataset = dataset['test']
relext_train_df = pd.DataFrame(relext_train_dataset)
relext_test_df = pd.DataFrame(relext_test_dataset)


def preprocess_tat_qa(df, is_test=False):
    """Flatten TAT-QA records into one context/question/answer example per question.

    Every row of ``df`` is read through three columns:

    * ``paragraphs`` - an iterable of dicts, each with a ``text`` key;
    * ``table`` - a dict whose ``table`` key holds a list of rows of cells;
    * ``questions`` - an iterable of dicts with a ``question`` key and, unless
      ``is_test`` is true, an ``answer`` key.

    The context shared by all questions of a row is the paragraph texts joined
    with single spaces, followed by ``"\\nTable:\\n"`` and the table rendered with
    tab-separated cells and newline-separated rows.

    Args:
        df: DataFrame with one row per TAT-QA record.
        is_test: When true the ``answer`` key is never read and every example
            gets the placeholder answer ``"unknown"``.

    Returns:
        A list of ``{'context': str, 'question': ..., 'answer': str}`` dicts, in
        row order and, within a row, in question order.
    """
    processed_data = []
    for _, record in df.iterrows():
        # Combine all paragraphs into a single text
        context_paragraphs = " ".join(para['text'] for para in record['paragraphs'])
        # Format table into a readable string: cells tab-separated, rows newline-separated
        table_str = "\n".join(
            "\t".join(map(str, table_row)) for table_row in record['table']['table']
        )
        context = f"{context_paragraphs}\nTable:\n{table_str}"
        for question in record['questions']:
            if is_test:
                answer_text = "unknown"
            else:
                answer = question['answer']
                if isinstance(answer, list):
                    # Stringify every element first: str.join() raises TypeError
                    # on non-string items such as numeric answers.
                    answer_text = ", ".join(map(str, answer))
                else:
                    answer_text = str(answer)
            processed_data.append(
                {'context': context, 'question': question['question'], 'answer': answer_text}
            )

    return processed_data

# tat_qa_processed = preprocess_tat_qa(tatqa_train_df)

def preprocess_convfinqa(df):
    """Convert ConvFinQA rows into context/question/answer examples.

    For each row, ``input`` becomes the context, ``output`` (converted with
    ``str``) becomes the answer, and the question is ``instruction`` followed by
    a fixed guidance sentence telling the model to answer the last question.

    A single space is inserted between the instruction and the guidance sentence
    when the instruction is non-empty and does not already end in whitespace, so
    the two never run together (e.g. ``"...carefully.Answer the last..."``).
    Instructions that already end in whitespace are left exactly as they are.

    Args:
        df: DataFrame with ``input``, ``instruction`` and ``output`` columns.

    Returns:
        A list of ``{'context', 'question', 'answer'}`` dicts, one per row, in
        row order.
    """
    guidance = "Answer the last question in the given context, if history is provided for previous questions-answers use that information (if required based on the context) to get the answer for the last question"
    processed_data = []
    for _, row in df.iterrows():
        instruction = row['instruction']
        # Only add a separator when one is actually missing.
        separator = "" if not instruction or instruction[-1].isspace() else " "
        processed_data.append({
            'context': row['input'],
            'question': instruction + separator + guidance,
            'answer': str(row['output']),
        })
    return processed_data


def preprocess_finqa(df):
    """Convert FinQA rows into context/question/answer examples.

    The context of a row is built from three parts separated by blank lines:
    the ``pre_text`` strings joined with spaces, the ``post_text`` strings joined
    with spaces, and a ``Table:`` section in which the cells of each ``table`` row
    are tab-separated and the rows are newline-separated. The question and the
    answer (converted with ``str``) are read from the row's ``qa`` dict.

    Args:
        df: DataFrame with ``pre_text``, ``post_text``, ``table`` and ``qa`` columns.

    Returns:
        A list of ``{'context', 'question', 'answer'}`` dicts, one per row, in
        row order.
    """
    def _build_example(record):
        # Text that precedes / follows the table in the source record.
        leading = " ".join(record['pre_text'])
        trailing = " ".join(record['post_text'])
        rendered_rows = ["\t".join(str(cell) for cell in cells) for cells in record['table']]
        table_block = "\n".join(rendered_rows)
        qa = record['qa']
        return {
            'context': f"{leading}\n\n{trailing}\n\nTable:\n{table_block}",
            'question': qa['question'],
            'answer': str(qa['answer']),
        }

    return [_build_example(record) for _, record in df.iterrows()]

def preprocess_relation_extraction(df):
    """Rename relation-extraction columns to the shared example schema.

    Each row's ``input``, ``instruction`` and ``output`` values are copied
    unchanged into the ``context``, ``question`` and ``answer`` keys respectively.

    Args:
        df: DataFrame with ``input``, ``instruction`` and ``output`` columns.

    Returns:
        A list of ``{'context', 'question', 'answer'}`` dicts, one per row, in
        row order.
    """
    return [
        {
            'context': record['input'],
            'question': record['instruction'],
            'answer': record['output'],
        }
        for _, record in df.iterrows()
    ]

# Run each preprocessor on the training split of its dataset.
tat_qa_processed = preprocess_tat_qa(tatqa_train_df)
convfinqa_processed = preprocess_convfinqa(convfinqa_train_df)
finqa_processed = preprocess_finqa(finqa_train_df)
relation_extraction_processed = preprocess_relation_extraction(relext_train_df)

# Merge the four example lists into one new list, keeping the order
# TAT-QA -> ConvFinQA -> FinQA -> relation extraction.
combined_data = [
    *tat_qa_processed,
    *convfinqa_processed,
    *finqa_processed,
    *relation_extraction_processed,
]

In [5]:
import json
# Serialise the merged examples to `merged_data.json` in the working directory.
# Mode 'w' truncates any existing file; `indent=4` pretty-prints the JSON.
file_path = "merged_data.json"
with open(file_path, 'w') as json_file:
    # json.dump streams the output to the file instead of building one big string.
    json.dump(combined_data, json_file, indent=4)