In [1]:
import os
import re
import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom

# Install required packages if not available.
# The install goes through `sys.executable -m pip` so the package lands in the
# environment of the interpreter that is actually running this code; a shell
# escape such as `!pip` is IPython-only syntax and may invoke a `pip` that
# belongs to a different Python installation.
try:
    import PyPDF2
except ImportError:
    import importlib
    import subprocess
    import sys

    print("Installing PyPDF2...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2"])
    # Make the running interpreter notice the freshly installed package.
    importlib.invalidate_caches()
    import PyPDF2

# Anything outside the XML 1.0 "Char" production. PDF text extraction often
# yields control characters (form feeds, NULs) that make the XML unparsable.
_XML_ILLEGAL_CHARS = re.compile(
    r'[^\t\n\r\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]'
)
# Paragraphs are assumed to be separated by blank lines.
_PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
# Simple section detection: a line starting with "<digit>." or a common section name.
_SECTION_HEADING = re.compile(
    r'(?:\n\s*\d\.|\n\s*(?:Introduction|Methods|Results|Discussion|Conclusion))[^\n]*\n'
)
# "References" alone on a line (optionally numbered and/or followed by a colon).
_REFERENCES_HEADING = re.compile(
    r'(?im)^[ \t]*(?:\d+\.?[ \t]*)?references[ \t]*:?[ \t]*\r?$'
)
# Fallback: the whole word anywhere (the word boundary keeps "preferences" out).
_REFERENCES_WORD = re.compile(r'(?i)\breferences\b')
# Each reference is assumed to start with "<number>." on a new line or "[<number>]".
_REFERENCE_SPLIT = re.compile(r'\n\s*\d+\.|\[\d+\]')


def _xml_safe(text):
    """Return *text* as a str with characters that XML 1.0 forbids removed."""
    return _XML_ILLEGAL_CHARS.sub('', str(text))


def _add_paragraphs(parent, text):
    """Append one <paragraph> child to *parent* per non-empty blank-line-separated chunk."""
    for chunk in _PARAGRAPH_BREAK.split(text):
        chunk = chunk.strip()
        if chunk:
            ET.SubElement(parent, "paragraph").text = chunk


def _find_references_text(page_texts):
    """Return the text that follows the references heading, or "" if there is none.

    A heading on its own line is preferred and the result then runs to the end
    of the document, so reference lists spanning several pages stay complete.
    If no such heading exists, the first whole-word occurrence of "references"
    is used and only the remainder of that page is returned.
    """
    full_text = "\n".join(page_texts)
    heading = _REFERENCES_HEADING.search(full_text)
    if heading:
        tail = full_text[heading.end():].lstrip(": \t\r\n")
    else:
        tail = ""
        for page_text in page_texts:
            word = _REFERENCES_WORD.search(page_text)
            if word:
                tail = page_text[word.end():].lstrip(": \t\r\n")
                if tail:
                    break
    # The leading newline lets the "<number>." splitter also strip the marker
    # of the very first reference.
    return "\n" + tail if tail else ""


def convert_pdf_to_xml(pdf_path, output_xml_path):
    """
    Convert a PDF research article to XML format using PyPDF2.

    The output has an <article> root containing:
      * <metadata>: one <meta name="..."> per non-empty document-info entry;
      * <content>: an optional <title> (first line of page 1 up to the first
        period), an optional <abstract>, a <paragraphs> element holding text
        that does not belong to a detected section (including text that
        precedes the first heading on a page), and one <section> per detected
        heading with its <heading> and <paragraph> children;
      * <references> (only if a references part is found): one
        <reference id="ref_N"> per entry, numbered consecutively from 1.

    Each page's text is extracted once and stripped of characters that are
    illegal in XML 1.0 before it is used.

    Args:
        pdf_path (str): Path to the input PDF file
        output_xml_path (str): Path to save the output XML file

    Returns:
        bool: True if conversion was successful, False otherwise. Any
        exception (missing file, unreadable PDF, PDF without pages, write
        failure) is reported on stdout with a traceback and yields False.
    """
    try:
        print(f"Loading PDF from {pdf_path}...")

        # Everything that needs the open file handle happens inside this block.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            num_pages = len(pdf_reader.pages)
            print(f"PDF has {num_pages} pages")

            doc_info = getattr(pdf_reader, 'metadata', None) or {}
            meta_items = [
                (_xml_safe(key), _xml_safe(value))
                for key, value in doc_info.items()
                if value
            ]

            # Extract each page exactly once; treat a missing result as empty text.
            page_texts = [
                _xml_safe(page.extract_text() or "") for page in pdf_reader.pages
            ]

        if not page_texts:
            raise ValueError("PDF contains no pages")

        # Create the root XML element and add metadata
        root = ET.Element("article")
        metadata = ET.SubElement(root, "metadata")
        for key, value in meta_items:
            ET.SubElement(metadata, "meta", {"name": key}).text = value

        content = ET.SubElement(root, "content")
        first_page_text = page_texts[0]

        # Try to extract the title from the first page
        title_match = re.search(r'^([^\n.]+)', first_page_text)
        if title_match:
            ET.SubElement(content, "title").text = title_match.group(1).strip()

        # Try to identify abstract (often after title and before section headings)
        abstract_match = re.search(
            r'(?i)abstract[:\s]*([\s\S]+?)(?:\n\s*\n|\n\s*\d\.|\n\s*introduction)',
            first_page_text,
        )
        if abstract_match:
            ET.SubElement(content, "abstract").text = abstract_match.group(1).strip()

        # Process all pages
        paragraphs_section = ET.SubElement(content, "paragraphs")
        for page_text in page_texts:
            headings = _SECTION_HEADING.findall(page_text)
            bodies = _SECTION_HEADING.split(page_text)

            # bodies[0] is the text before the first heading (the whole page
            # when no heading was found); keep it instead of dropping it.
            _add_paragraphs(paragraphs_section, bodies[0])

            for heading_text, body in zip(headings, bodies[1:]):
                section_elem = ET.SubElement(content, "section")
                ET.SubElement(section_elem, "heading").text = heading_text.strip()
                _add_paragraphs(section_elem, body)

        # Try to extract references (often start with "References" heading)
        references_text = _find_references_text(page_texts)
        if references_text:
            refs_section = ET.SubElement(root, "references")
            entries = (item.strip() for item in _REFERENCE_SPLIT.split(references_text))
            # Number only the non-empty entries so ids start at ref_1 without gaps.
            for number, entry in enumerate(filter(None, entries), start=1):
                ref_elem = ET.SubElement(refs_section, "reference", {"id": f"ref_{number}"})
                ref_elem.text = entry

        # Pretty print the XML and save to file
        xml_str = ET.tostring(root, encoding='utf-8')
        xml_pretty = minidom.parseString(xml_str).toprettyxml(indent="  ")

        with open(output_xml_path, 'w', encoding='utf-8') as f:
            f.write(xml_pretty)

        print(f"Successfully converted PDF to XML. Output saved to {output_xml_path}")
        return True

    except Exception as e:
        # Top-level boundary: the contract is a bool result, so report and return False.
        print(f"Error converting PDF to XML: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Jupyter Notebook compatibility: anchor all paths at the current working directory
current_dir = os.getcwd()

# Input PDF, looked up in the working directory
pdf_filename = 'applsci-3508831-peer-review-v1.pdf'
pdf_path = os.path.join(current_dir, pdf_filename)

# Output XML reuses the PDF's base name with an .xml extension
output_filename = f"{os.path.splitext(pdf_filename)[0]}.xml"
output_path = os.path.join(current_dir, output_filename)

# Run the conversion and report the outcome
success = convert_pdf_to_xml(pdf_path, output_path)
print("Conversion completed successfully!" if success else "Conversion failed.")

Loading PDF from C:\Courses\my-study-python\[01]_reviews\Review_Applied-Sciences_01\applsci-3508831-peer-review-v1.pdf...
PDF has 15 pages
Successfully converted PDF to XML. Output saved to C:\Courses\my-study-python\[01]_reviews\Review_Applied-Sciences_01\applsci-3508831-peer-review-v1.xml
Conversion completed successfully!
