# Filter dataset

In [3]:
import os
import json
import pandas as pd
from tqdm import tqdm

# Define the directories
json_dir = '~/Dokumente/DATEN_RAG_PM4/trainings_sets'
csv_dir = os.path.expanduser(json_dir + '/csv')  # Ensure the path is expanded to the user's home directory

# Create the CSV directory if it doesn't exist
os.makedirs(csv_dir, exist_ok=True)

# Initialize a set to hold all unique PubMed IDs across files
all_pubmed_ids = set()

# List all JSON files in the directory
json_files = [f for f in os.listdir(os.path.expanduser(json_dir)) if f.endswith('.json')]  # Ensure the path is expanded

# Loop through files with a tqdm progress bar
for json_file in tqdm(json_files, desc="Processing JSON Files"):
    json_path = os.path.join(os.path.expanduser(json_dir), json_file)
    
    # Load JSON content
    with open(json_path, 'r') as file:
        data = json.load(file)
    
    # Initialize a set for this file's PubMed IDs
    file_pubmed_ids = set()
    
    # Extract unique PubMed IDs from the 'questions' section
    for question in data.get('questions', []):
        documents = question.get('documents', [])
        for url in documents:
            pubmed_id = int(url.split('/')[-1])
            file_pubmed_ids.add(pubmed_id)
    
    # Update the set of all PubMed IDs
    all_pubmed_ids.update(file_pubmed_ids)
    
    # Save to DataFrame and CSV for this file
    df = pd.DataFrame({'pubmedid': list(file_pubmed_ids)})
    csv_filename = os.path.splitext(json_file)[0] + '.csv'
    csv_path = os.path.join(csv_dir, csv_filename)
    df.to_csv(csv_path, index=False)

# Convert the set to a list with tqdm progress
all_pubmed_ids_list = list(tqdm(all_pubmed_ids, desc="Aggregating PubMed IDs"))

# Save all PubMed IDs to a DataFrame with an extra column and save to CSV
all_pubmed_ids_df = pd.DataFrame({'pubmedid': all_pubmed_ids_list, 'enthalten_in_dataset': 0})
complete_csv_path = os.path.join(csv_dir, 'csv_complete.csv')
all_pubmed_ids_df.to_csv(complete_csv_path, index=False)


Processing JSON Files: 100%|██████████| 11/11 [00:03<00:00,  3.46it/s]
Aggregating PubMed IDs: 100%|██████████| 43188/43188 [00:00<00:00, 3379607.85it/s]


# now we flag the pubmedids which are avaible in the questions and our data subset in the mongodb


In [6]:
import pandas as pd

# Paths to your CSV files
complete_csv_path = '~/Dokumente/DATEN_RAG_PM4/trainings_sets/csv/csv_complete.csv'
rag_pubmed_csv_path = '~/Dokumente/DATEN_RAG_PM4/pubmedidsindataset/RAGPubMed.csv'
matched_ids_csv_path = '~/Dokumente/DATEN_RAG_PM4/trainings_sets/csv/matched_pubmed_ids.csv'

# Read the DataFrames
complete_df = pd.read_csv(complete_csv_path)
rag_pubmed_df = pd.read_csv(rag_pubmed_csv_path)

# Convert PMID to integer if not already
rag_pubmed_df['PMID'] = rag_pubmed_df['PMID'].astype(int)

# Check for presence and update the column
complete_df['enthalten_in_dataset'] = complete_df['pubmedid'].isin(rag_pubmed_df['PMID']).astype(int)

# Save the updated DataFrame
complete_df.to_csv(complete_csv_path, index=False)

# Extract the PubMed IDs that have a match (1 in the 'enthalten_in_dataset' column)
matched_pubmed_ids = complete_df[complete_df['enthalten_in_dataset'] == 1]['pubmedid']

# Save the matched PubMed IDs to a separate CSV file
matched_pubmed_ids.to_csv(matched_ids_csv_path, index=False, header=['pubmedid'])

# Calculate the percentage
percentage = (complete_df['enthalten_in_dataset'].sum() / len(complete_df)) * 100

print(f"Percentage of PubMed IDs with a 1: {percentage}%")



Percentage of PubMed IDs with a 1: 2.373344447531722%


now we extract each questions

In [7]:
import os
import json
import pandas as pd

# Define the directories
json_dir = '~/Dokumente/DATEN_RAG_PM4/trainings_sets'
matched_ids_csv_path = '~/Dokumente/DATEN_RAG_PM4/trainings_sets/csv/matched_pubmed_ids.csv'
output_json_path = '~/Dokumente/DATEN_RAG_PM4/trainings_sets/total/all_questions_in_mongodb.json'

# Read the matched PubMed IDs
matched_ids_df = pd.read_csv(matched_ids_csv_path)
matched_pubmed_ids = set(matched_ids_df['pubmedid'])

# List all JSON files in the directory
json_files = [f for f in os.listdir(os.path.expanduser(json_dir)) if f.endswith('.json')]

# Initialize a list to hold entries that meet the criteria
selected_entries = []

# Loop through each JSON file
for json_file in json_files:
    json_path = os.path.join(os.path.expanduser(json_dir), json_file)
    
    # Load JSON content
    with open(json_path, 'r') as file:
        data = json.load(file)
    
    # Check each entry for matched PubMed IDs
    for question in data.get('questions', []):
        documents = question.get('documents', [])
        pubmed_ids = [int(url.split('/')[-1]) for url in documents]
        # Count how many PubMed IDs in this question are in the matched list
        match_count = sum(id_ in matched_pubmed_ids for id_ in pubmed_ids)
        # If at least one (or two) match(es), save the entire entry
        if match_count >= 1:  # Change to `>= 2` if you need at least two matches
            selected_entries.append(question)

# Save the selected entries to a new JSON file
with open(os.path.expanduser(output_json_path), 'w') as file:
    json.dump({'questions': selected_entries}, file, indent=4)

# Print the count of selected entries
print(f"Total entries saved: {len(selected_entries)}")


Total entries saved: 3779


In [8]:
import os
import json

# Define the directory
json_dir = '~/Dokumente/DATEN_RAG_PM4/trainings_sets'

# Expand the user's home directory symbol
json_dir_expanded = os.path.expanduser(json_dir)

# List all JSON files in the directory except all_questions_in_mongodb.json
json_files = [f for f in os.listdir(json_dir_expanded) if f.endswith('.json') and f != 'all_questions_in_mongodb.json']

# Initialize a counter
total_entries = 0

# Loop through each JSON file and count the entries
for json_file in json_files:
    json_path = os.path.join(json_dir_expanded, json_file)
    
    # Load JSON content
    with open(json_path, 'r') as file:
        data = json.load(file)
        # Count the number of entries in the 'questions' list
        total_entries += len(data.get('questions', []))

# Print the total count of entries
print(f"Total entries in all JSON files: {total_entries}")


Total entries in all JSON files: 30212
