In [None]:
import os
import pdfplumber
import json

def extract_text_from_pdfs(directory, output_file):
    """Extract the text of every PDF under a directory into one text file.

    The directory is walked recursively. Sub-directories and files are visited
    in sorted order so repeated runs produce the same output. Files are matched
    on a case-insensitive ``.pdf`` extension. Each page's text is written
    followed by a newline; pages with no extractable text are skipped.

    If ``output_file`` already exists, nothing is extracted and the existing
    file is reused as-is. To keep that cache trustworthy, the text is first
    written to ``<output_file>.partial`` and only moved into place once every
    PDF has been processed; on any failure the partial file is removed and
    ``output_file`` is not created.

    NOTE: the output is written with the platform's default text encoding.

    Args:
        directory (str): The path to the directory containing the PDF files.
        output_file (str): The path to the output file where the extracted text will be saved.

    Returns:
        str: The path to the output file.

    Raises:
        Exception: Whatever opening a PDF or writing the output raises is
            propagated after the partial file has been cleaned up.
    """
    if os.path.exists(output_file):
        print(f"Output file {output_file} already exists. Returning existing file.")
        return output_file

    # Never write straight to output_file: a crash halfway through would leave
    # a truncated file that the existence check above would treat as complete.
    partial_path = f"{output_file}.partial"
    try:
        with open(partial_path, 'w') as outfile:
            for root, dirs, files in os.walk(directory):
                # Sorting in place makes os.walk descend in a deterministic order.
                dirs.sort()
                for file in sorted(files):
                    if not file.lower().endswith('.pdf'):
                        continue
                    file_path = os.path.join(root, file)
                    print(f"Extracting text from: {file_path}")
                    with pdfplumber.open(file_path) as pdf:
                        for page in pdf.pages:
                            text = page.extract_text()
                            if text:
                                outfile.write(text)
                                outfile.write('\n')
        # Publish the finished file in a single step.
        os.replace(partial_path, output_file)
    except BaseException:
        # Clean up on every kind of failure (including Ctrl-C), then re-raise.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    return output_file

def check_string_in_file(file_path, search_string):
    """Report whether a file's text contains a given substring.

    The whole file is read into memory and searched with a plain,
    case-sensitive substring test. The file is opened with the platform's
    default text encoding.

    Args:
        file_path (str): The path to the file to be searched.
        search_string (str): The string to search for in the file.

    Returns:
        bool: True if the string is found, False otherwise.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return search_string in contents

def find_fields_in_directory(directory, field_names):
    """Collect the values of the named fields from JSON files under a directory.

    The directory is walked recursively and every file except ``.gitignore``
    and ``README.md`` is parsed as JSON (read as UTF-8, with undecodable bytes
    dropped). Files that are not valid JSON are reported on stdout and skipped.
    Files in which none of the fields occur are left out of the result.

    Args:
        directory (str): The path to the directory containing the JSON files.
        field_names (list): A list of field names to search for in the JSON files.

    Returns:
        dict: A dictionary mapping file paths to lists of found field values.
    """
    skipped_names = (".gitignore", "README.md")
    matches_by_path = {}

    for current_dir, _, entries in os.walk(directory):
        for entry in entries:
            if entry in skipped_names:
                continue

            candidate = os.path.join(current_dir, entry)
            with open(candidate, 'r', encoding='utf-8', errors='ignore') as handle:
                try:
                    document = json.load(handle)
                    collected = []
                    for wanted in field_names:
                        extract_fields(document, wanted, collected)
                    # Only record files that contributed at least one value.
                    if collected:
                        matches_by_path[candidate] = collected
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file: {candidate}")
                except UnicodeDecodeError:
                    print(f"Encoding error in file: {candidate}")

    return matches_by_path

def extract_fields(data, field_name, fields):
    """Walk parsed JSON and gather every value stored under a given key.

    Lists and dicts are searched depth-first, in order. When a dict key equals
    ``field_name``: a dict value contributes each of its own values, and any
    other value is appended as-is. The matched value is not searched further.
    Anything that is neither a dict nor a list is ignored.

    Args:
        data (dict or list): The JSON data to search through.
        field_name (str): The name of the field to extract.
        fields (list): The list to append found field values to (mutated in place).
    """
    if isinstance(data, list):
        for element in data:
            extract_fields(element, field_name, fields)
        return

    if not isinstance(data, dict):
        return

    for name, payload in data.items():
        if name != field_name:
            # Not the key we want: keep looking deeper.
            extract_fields(payload, field_name, fields)
        elif isinstance(payload, dict):
            fields.extend(payload.values())
        else:
            fields.append(payload)

def filter_fields_not_in_file(file_fields, check_file):
    """Keep only the field values that do not occur in a reference text file.

    The reference file is read once and each value is looked up with a plain,
    case-sensitive substring test. Values that are not strings (numbers, None,
    lists, nested dicts) cannot occur in text, so they are reported as not
    present instead of raising. Entries whose values are all present are
    dropped from the result.

    The reference file is only opened when there is at least one value to
    check. It is read with the platform's default text encoding.

    Args:
        file_fields (dict): A dictionary mapping file paths to lists of field values.
        check_file (str): The path to the file to check the field values against.

    Returns:
        dict: A filtered dictionary with entries that have at least one value not present in the file.
    """
    # Nothing to look up: skip the read so a missing check file is not an error.
    if not any(file_fields.values()):
        return {}

    # Load the reference text a single time rather than once per value.
    with open(check_file, 'r') as handle:
        reference_text = handle.read()

    filtered_fields = {}
    for file_path, fields in file_fields.items():
        not_in_file_fields = [
            field for field in fields
            if not (isinstance(field, str) and field in reference_text)
        ]
        if not_in_file_fields:
            filtered_fields[file_path] = not_in_file_fields

    return filtered_fields

In [None]:
# Extract text from PDFs and save to a file.
# NOTE: extract_text_from_pdfs returns early when output_file already exists,
# so delete parsed_redcap_pdfs.txt first to force a fresh extraction.
# NOTE(review): the absolute paths below look machine-specific - adjust them
# before running this cell elsewhere.
pdf_directory = '/Users/isaacbevers/sensein/reproschema-wrapper/bridge2ai-redcap/data/instrument_pdfs'
output_file = 'parsed_redcap_pdfs.txt'
output_path = extract_text_from_pdfs(pdf_directory, output_file)
print(f"Text extracted to: {output_path}")

# Find fields in JSON files and filter against the parsed text.
# Every value stored under one of these keys is collected, per file.
json_directory = '/Users/isaacbevers/sensein/reproschema-wrapper/b2aiprotocol'
field_names = ['question', 'preamble', 'name']
file_fields = find_fields_in_directory(json_directory, field_names)

# Same file name as output_file above: the extracted PDF text is the reference
# that the JSON field values are checked against.
check_file_path = 'parsed_redcap_pdfs.txt'
filtered_fields = filter_fields_not_in_file(file_fields, check_file_path)

# Persist the values that were not found in the PDF text.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
filtered_fields_output = 'filtered_fields.json'
with open(filtered_fields_output, 'w', encoding='utf-8') as f:
    json.dump(filtered_fields, f, ensure_ascii=False, indent=4)

# Print the filtered fields, one source file per line.
for file_path, fields in filtered_fields.items():
    print(f"{file_path}: {fields}")