In [1]:
import os
import fitz  # PyMuPDF
import pandas as pd
import re

# Function to extract TNR values from PDF
def extract_tnr_from_pdf(pdf_path):
    """Collect the TNR values found in the text of a PDF document.

    The text of every page is split into lines; each line containing the
    substring "TNR" is handed to ``parse_tnr`` and every result that is
    not ``None`` is kept.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        A list of the parsed values, in page order and, within a page,
        in line order. Empty when no line yields a value.
    """
    collected = []
    with fitz.open(pdf_path) as document:
        for index in range(document.page_count):
            page_lines = document[index].get_text().splitlines()
            # Lazily parse only the lines that mention TNR at all.
            candidates = (
                parse_tnr(text_line)
                for text_line in page_lines
                if "TNR" in text_line
            )
            # parse_tnr signals "no number on this line" with None.
            collected.extend(
                value for value in candidates if value is not None
            )
    return collected

# Helper function to parse a line for TNR value
def parse_tnr(line):
    """Return the last whitespace-separated token of *line* as a float.

    Args:
        line: A text line expected to end with the TNR number.

    Returns:
        The float value of the final token, or ``None`` when the line
        has no tokens or the final token is not accepted by ``float``.
    """
    tokens = line.split()
    if not tokens:
        # Blank or whitespace-only line: there is no last token to read.
        return None
    try:
        return float(tokens[-1])
    except ValueError:
        return None

# Main function to process PDF files and generate CSVs
def process_pdf_directory(pdf_directory_path, output_directory_path):
    """Extract TNR values from beam PDFs and write one CSV per beam type.

    Directory entries whose *entire* name has the form
    ``G4_<proton|gamma|ion>_<digits>L.pdf`` are read with
    ``extract_tnr_from_pdf``; every other entry is ignored. The digits in
    the file name are stored as the beam energy next to each TNR value.

    Three files are always written to ``output_directory_path``:
    ``ion.csv``, ``proton.csv`` and ``gamma.csv``. A beam type with no
    matching PDFs gets a CSV that contains only the header row. Rows are
    ordered by ascending beam energy; values from the same PDF keep the
    order in which they were extracted.

    Args:
        pdf_directory_path: Directory listed (non-recursively) for PDFs.
        output_directory_path: Directory that receives the CSV files;
            created if it does not exist.

    Returns:
        None. One "Generated <path>" line is printed per CSV written.
    """
    all_data = {'ion': [], 'proton': [], 'gamma': []}
    # Compiled once, outside the loop; groups are (beam type, energy digits).
    pattern = re.compile(r"G4_(proton|gamma|ion)_(\d+)L\.pdf")

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the output) reproducible.
    for filename in sorted(os.listdir(pdf_directory_path)):
        # fullmatch, not match: a name such as "G4_ion_5L.pdf.bak.pdf"
        # starts with a valid name but must not be treated as a data file.
        match = pattern.fullmatch(filename)
        if match is None:
            continue

        beam_type, energy_str = match.groups()
        energy = int(energy_str)
        pdf_path = os.path.join(pdf_directory_path, filename)

        all_data[beam_type].extend(
            (energy, tnr) for tnr in extract_tnr_from_pdf(pdf_path)
        )

    # Ensure output directory exists
    os.makedirs(output_directory_path, exist_ok=True)

    for beam_type, rows in all_data.items():
        # Stable sort: orders by numeric energy while preserving the
        # extraction order of values that share the same energy.
        rows.sort(key=lambda row: row[0])
        df = pd.DataFrame(rows, columns=['Beam Energy (MeV)', 'TNR'])
        csv_path = os.path.join(output_directory_path, f'{beam_type}.csv')
        df.to_csv(csv_path, index=False)
        print(f"Generated {csv_path}")

# Script entry: set the input/output locations, then run the conversion
pdf_directory_path = '/home/plewis/data/TNR_opt'  # Directory scanned for the input PDFs; change to suit your machine
output_directory_path = '/home/plewis'  # Directory that receives the generated CSV files; change to suit your machine
process_pdf_directory(
    pdf_directory_path=pdf_directory_path,
    output_directory_path=output_directory_path,
)


Generated /home/plewis/ion.csv
Generated /home/plewis/proton.csv
Generated /home/plewis/gamma.csv
