In [1]:
import os
import pandas as pd
import json

In [2]:
def reduce_dataframe_rows(df, n_rows):
    """
    Keep only the leading n_rows rows of a DataFrame.

    Parameters:
    - df: The pandas DataFrame to trim.
    - n_rows: How many leading rows to keep.

    Returns:
    - The positional slice df.iloc[:n_rows] when n_rows is smaller than the
      number of rows in df; otherwise df itself (the same object, not a copy),
      after printing a notice.
    """
    total_rows = len(df)

    # Guard clause: nothing to trim when the request covers the whole frame.
    if not (n_rows < total_rows):
        print("The requested number of rows is greater than or equal to the total rows in the DataFrame.")
        return df

    return df.iloc[:n_rows]

In [3]:
def import_data_from_csv(source_file, content_column, id_column=None, display_logs=False, max_rows=40):
    """
    Imports text data from a CSV file and organizes it into a corpus dictionary.

    Parameters:
    - source_file (str): The path to the CSV file to load the text data from.
    - content_column (str): The name of the column in the CSV that contains the text content.
    - id_column (str, optional): The name of the column that contains the unique IDs for each row.
      If None, or if the column is not present, the DataFrame indices are used instead.
    - display_logs (bool): If True, prints progress messages about loading and processing.
    - max_rows (int, optional): Maximum number of leading rows to keep. Defaults to 40, a
      conservative cap that keeps downstream LLM token usage small. Pass None to keep every row.

    Returns:
    - dict: A dictionary representing the corpus, with IDs (or indices) as keys and content as values.

    Raises:
    - ValueError: If max_rows is negative or content_column is not a column of the CSV.
    - IOError: If the CSV file cannot be read or parsed (the original error is chained as the cause).
    """
    if max_rows is not None and max_rows < 0:
        raise ValueError(f"max_rows must be None or a non-negative integer, got {max_rows}.")

    if display_logs:
        print(f"Initiating import from CSV: {source_file}")

    # Any read/parse failure is surfaced as IOError; `from e` keeps the original traceback.
    try:
        dataframe = pd.read_csv(source_file)
    except Exception as e:
        raise IOError(f"Failed to load data from the CSV file: {e}") from e

    if display_logs:
        print(f'Successfully imported {len(dataframe)} records from the CSV file.')

    # Validate that the content column exists before doing any further work.
    if content_column not in dataframe.columns:
        raise ValueError(f"The specified content column '{content_column}' does not exist in the CSV file.")

    # Optionally cap the number of rows. Messages are only printed when logging was requested.
    if max_rows is not None and max_rows < len(dataframe):
        dataframe = dataframe.iloc[:max_rows]
        if display_logs:
            print(f"Keeping only the first {max_rows} records.")

    # Use the specified id_column or default to the DataFrame index.
    if id_column and id_column in dataframe.columns:
        ids = dataframe[id_column]
    else:
        if display_logs and id_column:
            print(f"Specified ID column '{id_column}' not found. Defaulting to DataFrame indices.")
        ids = dataframe.index

    # Construct the corpus; on repeated IDs the later row overwrites the earlier one.
    corpus_content = dict(zip(ids, dataframe[content_column]))

    if display_logs and len(corpus_content) < len(dataframe):
        print(f"{len(dataframe) - len(corpus_content)} record(s) were dropped because their IDs collide.")

    return corpus_content

In [None]:
# Show the working directory: every path below is built relative to os.getcwd().
print(os.getcwd())

In [None]:
# Load the test CSV, which is expected in the current working directory.
file_path = os.path.join(os.getcwd(), 'medical_tc_test.csv')
# No id_column is passed, so the corpus is keyed by DataFrame index.
test_corpus = import_data_from_csv(source_file=file_path,
                                   content_column='medical_abstract',
                                   display_logs=True)
# Display the loaded {index: abstract} mapping.
test_corpus

In [6]:
# Sanity check: number of documents kept in the corpus after loading.
len(test_corpus)

40

In [7]:
import os
import json

# Define the base directory path.
base_dir = os.getcwd()

# Path to the directory where the processed corpus will be stored.
processed_dir = os.path.join(base_dir, 'processed')

# Create the directory (and any missing parents). exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs(processed_dir, exist_ok=True)

# Path to the JSON file within the processed directory.
PROCESSED_CORPUS_FPATH = os.path.join(processed_dir, 'data_bank.json')

# Write the `test_corpus` dictionary to the JSON file.
# JSON object keys are always strings, so non-string keys (e.g. integer row
# indices) come back as strings when this file is loaded again.
with open(PROCESSED_CORPUS_FPATH, 'w', encoding='utf-8') as f:
    json.dump(test_corpus, f)

In [8]:
import re
import uuid

from llama_index.llms import openai
from llama_index.core.schema import MetadataMode
from tqdm.notebook import tqdm

In [9]:
# Working directory of the notebook and the folder holding processed artifacts.
base_dir = os.getcwd()
processed_dir = os.path.join(base_dir, 'processed')

# Output files for the generated queries, the query -> document mapping, and the answers.
TEST_QUERIES_FPATH, TEST_RELEVANT_DOCS_FPATH, TEST_ANSWERS_FPATH = (
    os.path.join(processed_dir, file_name)
    for file_name in ('search_terms.json', 'pertinent_documents.json', 'responses.json')
)

In [10]:
# Reload the corpus from disk. Read-only mode is enough here; 'r+' would also
# demand write permission and fail on read-only files.
with open(PROCESSED_CORPUS_FPATH, 'r', encoding='utf-8') as f:
    test_corpus = json.load(f)

In [11]:
# Rebind test_corpus to a shallow copy of the loaded mapping (same keys, same order).
test_corpus = dict(test_corpus)
test_corpus

{'0': 'Obstructive sleep apnea following topical oropharyngeal anesthesia in loud snorers. Previous studies support the presence of an upper airway reflex mechanism that contributes to the maintenance of upper airway patency during sleep. We investigated the possibility that interference with this reflex mechanism contributes to the development of obstructive sleep apnea. Eight otherwise asymptomatic snorers (seven male and one female), age 39 +/- 5.3 yr (mean +/- SEM), underwent overnight sleep studies on three successive nights. An acclimatization night was followed by two study nights randomly assigned to control (C) and oropharyngeal anesthesia (OPA). On the OPA night topical anesthesia was induced using 10% lidocaine spray and 0.25% bupivacaine gargle. A saline placebo was used on night C. All subjects slept well on both study nights (mean sleep duration was 6.2 h on both study nights), and sleep stage distribution was similar on both nights. Obstructive apneas and hypopneas (OAH)

In [12]:
import uuid
import re

def fetch_qa_pairs_from_corpus(data_corpus, questions_per_section=2, custom_prompt=None, display_progress=False):
    """
    Generate question-answer pairs for every document of a corpus with an LLM.

    Each document text is inserted into a prompt asking the model for
    `questions_per_section` questions with brief answers. The completion is
    expected to alternate question and answer lines.

    Parameters:
    - data_corpus (dict): Mapping of document ID to document text.
    - questions_per_section (int): Number of questions requested per document.
    - custom_prompt (str, optional): Prompt template used instead of the built-in one. It is
      formatted with the `context` and `questions_per_section` fields.
    - display_progress (bool): If True, shows a tqdm progress bar.

    Returns:
    - tuple: (question_bank, answer_key, document_references), three dictionaries sharing
      the same freshly generated UUID string keys. Values are the question text, the
      answer text, and a one-element list holding the source document ID.

    Documents whose completion call raises are reported with print() and skipped.
    """
    artificial_intelligence = openai.OpenAI(model='gpt-3.5-turbo')
    prompt_template = custom_prompt or """\
    Below is the context for generating questions and answers.

    ---------------------
    {context}
    ---------------------

    Given the context above without using external information,
    develop {questions_per_section} question(s) with their brief answer(s),
    suitable for a quiz or examination. Keep answers concise, within 1-50 words.
    Ensure the generated content varies and aligns closely with the provided context.

    """

    question_bank = {}
    answer_key = {}
    document_references = {}

    for doc_id, context in tqdm(data_corpus.items(), disable=not display_progress):
        dynamic_prompt = prompt_template.format(context=context, questions_per_section=questions_per_section)

        # Best effort: a failed completion skips this document instead of aborting the run.
        try:
            ai_response = artificial_intelligence.complete(dynamic_prompt)
        except Exception as e:
            print(f"Failed to generate response for document ID {doc_id}: {e}")
            continue

        for question, answer in _parse_qa_pairs(str(ai_response)):
            unique_id = str(uuid.uuid4())
            question_bank[unique_id] = question
            answer_key[unique_id] = answer
            document_references[unique_id] = [doc_id]

    return question_bank, answer_key, document_references


def _parse_qa_pairs(response_text):
    """Split a completion into cleaned (question, answer) tuples, skipping incomplete pairs."""
    # Drop blank lines first: a blank separator between pairs would otherwise shift
    # the even/odd pairing and lose every pair that follows it.
    lines = [line.strip() for line in response_text.split("\n") if line.strip()]

    pairs = []
    for raw_question, raw_answer in zip(lines[0::2], lines[1::2]):
        # Remove leading numbering such as "1." or "2)" and the textual labels.
        question = re.sub(r"^\d+[\).\s]", "", raw_question).strip()
        question = question.replace("Question:", "").strip()
        answer = raw_answer.replace("Answer:", "").strip()
        # Check after cleaning so that label-only lines are not stored as empty strings.
        if question and answer:
            pairs.append((question, answer))
    return pairs


In [13]:
# Request one question per document. The three returned dicts share the same UUID keys:
# question text, answer text, and the list of source document IDs.
test_queries, test_answers, test_relevant_docs = fetch_qa_pairs_from_corpus(
    test_corpus,
    questions_per_section=1,
    display_progress=True,
)

  0%|          | 0/40 [00:00<?, ?it/s]

In [14]:
# Persist each generated mapping to its own JSON file, in the same order as before.
for output_path, payload in (
    (TEST_QUERIES_FPATH, test_queries),
    (TEST_ANSWERS_FPATH, test_answers),
    (TEST_RELEVANT_DOCS_FPATH, test_relevant_docs),
):
    with open(output_path, 'w+') as f:
        json.dump(payload, f)

In [15]:
# Path of the combined dataset JSON file inside the processed directory.
TEST_DATASET_FPATH = os.path.join(processed_dir, 'data.json')

In [16]:
# Bundle queries, answers, corpus and the query -> document mapping into one structure.
test_dataset = dict(
    search_terms=test_queries,
    responses=test_answers,
    data_bank=test_corpus,
    pertinent_documents=test_relevant_docs,
)

In [17]:
# Delete any dataset file left over from a previous run, then write the combined dataset as JSON.
# NOTE(review): opening with 'w+' already truncates an existing file, so the explicit
# removal looks redundant — confirm nothing relies on the file being recreated before simplifying.
if os.path.exists(TEST_DATASET_FPATH):
    os.remove(TEST_DATASET_FPATH)
with open(TEST_DATASET_FPATH, 'w+') as f:
    json.dump(test_dataset, f)