# Filter dataset

first we loop trough each training set for example BioASQ-trainingDataset2b.json and extract the pubmed IDS used to answers questions 

In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Define the directories
json_dir = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets'
csv_dir = os.path.expanduser(json_dir + '/csv')  # Ensure the path is expanded to the user's home directory

# Create the CSV directory if it doesn't exist
os.makedirs(csv_dir, exist_ok=True)

# Initialize a set to hold all unique PubMed IDs across files
all_pubmed_ids = set()

# List all JSON files in the directory
json_files = [f for f in os.listdir(os.path.expanduser(json_dir)) if f.endswith('.json')]  # Ensure the path is expanded

# Loop through files with a tqdm progress bar
for json_file in tqdm(json_files, desc="Processing JSON Files"):
    json_path = os.path.join(os.path.expanduser(json_dir), json_file)

    # Load JSON content
    with open(json_path, 'r') as file:
        data = json.load(file)

    # Initialize a set for this file's PubMed IDs
    file_pubmed_ids = set()

    # Extract unique PubMed IDs from the 'questions' section
    for question in data.get('questions', []):
        documents = question.get('documents', [])
        for url in documents:
            pubmed_id = int(url.split('/')[-1])
            file_pubmed_ids.add(pubmed_id)

    # Update the set of all PubMed IDs, since a set can only contain unique numbers the same  PUBMEDIDS wont be stored twice
    all_pubmed_ids.update(file_pubmed_ids)

    # Save to DataFrame and CSV for this file
    df = pd.DataFrame({'pubmedid': list(file_pubmed_ids)})
    csv_filename = os.path.splitext(json_file)[0] + '.csv'
    csv_path = os.path.join(csv_dir, csv_filename)
    df.to_csv(csv_path, index=False)

# Convert the set to a list with tqdm progress
all_pubmed_ids_list = list(tqdm(all_pubmed_ids, desc="Aggregating PubMed IDs"))

# Save all PubMed IDs to a DataFrame with an extra column and save to CSV
all_pubmed_ids_df = pd.DataFrame({'pubmedid': all_pubmed_ids_list, 'enthalten_in_dataset': 0})
complete_csv_path = os.path.join(csv_dir, 'csv_complete.csv')
all_pubmed_ids_df.to_csv(complete_csv_path, index=False)

Processing JSON Files: 100%|██████████| 11/11 [00:09<00:00,  1.21it/s]
Aggregating PubMed IDs: 100%|██████████| 43188/43188 [00:00<00:00, 3358304.77it/s]


now we read in all the pubmedids we currently have loaded into our 

this script reads in the before created list of all pubmed ids used to answer questions and all the pubmed ids used in our dataset

it then flags all  the pubmedids which are avaible in the questions and our data subset used (remember we created the latter in the previous stepp)

in the end we first update the csv_complete.csv on wether or not the pubmedid is containted flag afterwards we save the matched pubmed ids into a seperate file.


In [4]:
import pandas as pd

# Paths to your CSV files
complete_csv_path = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets/csv/csv_complete.csv' #csv file with all unique pubmed ids that are used to answer questions
rag_pubmed_csv_path = '~/data/faiss_indices/bioBERT/PMIDs/concatenated_pubmed_ids.csv' #csv file with all the pubmedids currently in our system
matched_ids_csv_path = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets/csv/matched_pubmed_ids.csv' # csv file that results from running this script which containes alll the pubmed ids that 
# should be able to answer questions 

# Read the DataFrames
complete_df = pd.read_csv(complete_csv_path)
# Read the RAGPubMed.csv file assuming it has no header and only one column of integers
rag_pubmed_df = pd.read_csv(rag_pubmed_csv_path, header=None, names=['PMID'], dtype={'PMID': int})


# Check for presence and update the column
complete_df['enthalten_in_dataset'] = complete_df['pubmedid'].isin(rag_pubmed_df['PMID']).astype(int)

# Save the updated DataFrame
complete_df.to_csv(complete_csv_path, index=False)

# Extract the PubMed IDs that have a match (1 in the 'enthalten_in_dataset' column)
matched_pubmed_ids = complete_df[complete_df['enthalten_in_dataset'] == 1]['pubmedid']

# Save the matched PubMed IDs to a separate CSV file
matched_pubmed_ids.to_csv(matched_ids_csv_path, index=False, header=['pubmedid'])

# Calculate the percentage
percentage = (complete_df['enthalten_in_dataset'].sum() / len(complete_df)) * 100

print(f"Percentage of PubMed IDs with a 1: {percentage}%")



Percentage of PubMed IDs with a 1: 3.035565434843012%


now we extract each questions that has at least one pubmed id as answer which is present in our dataset and save it into the json file. we also provide the count of how many questions should be answerable 

In [5]:
import os
import json
import pandas as pd

# Define the directories
json_dir = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets'
matched_ids_csv_path = '~/Questions_answers_data/DATEN_RAG_PM4/trainings_sets/csv/matched_pubmed_ids.csv'
output_json_path = '~/Questions_answers_data/all_questions_in_system.json'

# Read the matched PubMed IDs
matched_ids_df = pd.read_csv(matched_ids_csv_path)
matched_pubmed_ids = set(matched_ids_df['pubmedid'])

# List all JSON files in the directory
json_files = [f for f in os.listdir(os.path.expanduser(json_dir)) if f.endswith('.json')]

# Initialize a list to hold entries that meet the criteria and a counter for all entries
selected_entries = []
total_entries = 0

# Loop through each JSON file
for json_file in json_files:
    json_path = os.path.join(os.path.expanduser(json_dir), json_file)
    
    # Load JSON content
    with open(json_path, 'r') as file:
        data = json.load(file)
    
    # Increment total_entries by the number of questions in the current file
    total_entries += len(data.get('questions', []))
    
    # Check each entry for matched PubMed IDs
    for question in data.get('questions', []):
        documents = question.get('documents', [])
        pubmed_ids = [int(url.split('/')[-1]) for url in documents]
        # Count how many PubMed IDs in this question are in the matched list
        match_count = sum(id_ in matched_pubmed_ids for id_ in pubmed_ids)
        # If at least one (or two) match(es), save the entire entry
        if match_count >= 1:  # Change to `>= 2` if you need at least two matches
            selected_entries.append(question)

# Save the selected entries to a new JSON file
with open(os.path.expanduser(output_json_path), 'w') as file:
    json.dump({'questions': selected_entries}, file, indent=4)

# Print the total count of entries and the count of selected entries
print(f"Total entries in all JSON files: {total_entries}")
print(f"Total selected entries saved: {len(selected_entries)}")


FileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/Questions_answers_data/total/all_questions_in_system.json'