In [80]:
import json
import re

# Function to clean the text
def cleanse_text(text, preserve_paragraphs=False):
    """Strip boilerplate from extracted article text and normalise whitespace.

    Steps, in order:
      1. Delete each unwanted section, from the section word up to the next
         blank line (or the end of the text).
      2. Delete "Figure N", "Page N [of M]" and "Table N" markers and the
         repeated "Gates Open Research <year>, ... Last updated: ..." line.
      3. Replace runs of non-ASCII characters with a single space.
      4. Collapse whitespace.
      5. Strip leading zeros from numbers (via ``remove_leading_zeros``),
         trim the result, and turn ". ." into ".".

    Args:
        text: Raw text to clean.
        preserve_paragraphs: When False (the default) every run of whitespace,
            newlines included, becomes one space, so the result is a single
            line. When True, blank-line paragraph breaks are kept as exactly
            one empty line and whitespace is collapsed only inside paragraphs.

    Returns:
        The cleaned text.
    """
    unwanted_sections = ["references"]
    for section in unwanted_sections:
        # NOTE(review): this matches the word anywhere in the text, not only
        # as a heading, so a body sentence containing it is also cut up to the
        # next blank line - confirm this is intended.
        pattern = r'(?i)\b{}\b.*?(?=\n\n|\Z)'.format(re.escape(section))
        text = re.sub(pattern, '', text, flags=re.DOTALL)

    # Remove references to figures, tables, and page numbers
    text = re.sub(r'Figure\s*\d+', '', text)  # "Figure X"
    text = re.sub(r'Page\s*\d+\s*(of\s*\d+)?', '', text)  # "Page X of Y"
    text = re.sub(r'Table\s*\d+', '', text)  # "Table X"

    # Repeated journal info. The pattern ends on a newline, so it has to run
    # before whitespace is collapsed.
    text = re.sub(r'Gates Open Research\s*\d{4},.*?Last updated:.*?\n', '', text)

    # Replace non-ASCII characters BEFORE collapsing whitespace; doing it
    # afterwards (as before) left runs of spaces in the output.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    if preserve_paragraphs:
        # Split on blank lines, tidy each paragraph, and drop empty ones.
        paragraphs = (
            re.sub(r'\s+', ' ', chunk).strip()
            for chunk in re.split(r'\n\s*\n', text)
        )
        text = '\n\n'.join(chunk for chunk in paragraphs if chunk)
    else:
        text = re.sub(r'\s+', ' ', text)

    text = remove_leading_zeros(text)
    text = text.strip()
    text = re.sub(r'\. \.', '.', text)  # Fix any erroneous spaces between periods

    return text


# Function to remove leading zeros from numbers
def remove_leading_zeros(text):
    """Strip redundant leading zeros from numbers in ``text``.

    "007" becomes "7" and "00.50" becomes "0.50". Digits that follow a
    decimal point or another digit are left alone, so fractional parts such
    as the "05" in "0.05" are not altered.

    Args:
        text: Text to process.

    Returns:
        The text with leading zeros removed from whole-number parts.
    """
    # The lookbehind rejects a match that starts right after "." or a digit.
    # Without it, \b matches between "." and "0" and "0.05" turns into "0.5".
    pattern = r'(?<![\d.])\b0+(\d+(\.\d+)?)\b'
    return re.sub(pattern, r'\1', text)

In [81]:
# Function to fix encoding issues
def fix_encoding_issues(text):
    """Repair text whose UTF-8 bytes were mistakenly decoded as Latin-1.

    The text is encoded back to Latin-1 bytes and decoded as UTF-8. If either
    step fails, the input is returned unchanged.

    Args:
        text: Possibly mis-decoded text.

    Returns:
        The repaired text, or the original text when no repair is possible.
    """
    try:
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # UnicodeEncodeError: the text has characters outside Latin-1.
        # UnicodeDecodeError: the Latin-1 bytes are not valid UTF-8, i.e. the
        # text was correct to begin with (e.g. a genuine "e-acute").
        return text

In [82]:
# Function to extract the title if it's the first sentence before author names

def extract_title(text):
    """Return everything that precedes the first "[version" marker.

    The captured text is stripped of surrounding whitespace; inner line
    breaks are kept. Returns None when no "[version" marker with at least one
    character in front of it is found.
    """
    found = re.search(r'^(.+?)\s*\[version.*$', text, re.DOTALL)
    return found.group(1).strip() if found else None


# Function to extract DOI
def extract_doi(text):
    """Return the first DOI found in ``text``, or "" when there is none.

    The text is first passed through ``fix_encoding_issues``. A DOI is taken
    to be "10.", 4-9 digits, "/", then a run of letters, digits and the
    characters ``-._;()/:`` (case-insensitive).

    Args:
        text: Text to search.

    Returns:
        The DOI with trailing punctuation removed, or an empty string.
    """
    text = fix_encoding_issues(text)

    match = re.search(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', text, re.IGNORECASE)
    if not match:
        return ""

    # The suffix pattern also swallows punctuation that follows the DOI in
    # running text ("... doi:10.1234/abc)."). Strip every trailing character
    # that is not a letter, digit, "/" or "-". The previous cleanup allowed
    # "." and ":" and so kept a sentence-final period.
    return re.sub(r'[^a-zA-Z0-9/-]+$', '', match.group(0))

# Function to extract authors
def extract_authors(text):
    """Return the first comma-separated run of capitalised words, or "".

    After ``fix_encoding_issues`` is applied, the search looks at the start of
    any line for a capitalised word (one uppercase letter followed by
    lowercase letters), optionally followed by a single initial such as
    " T.", and then any number of further ", Word" items of the same shape.
    Each item is a single word, so a multi-word name ends the match after its
    first word.
    """
    decoded = fix_encoding_issues(text)
    name_list = re.search(
        r'(?m)^\s*[A-Z][a-z]+(?: [A-Z]\.)?(?:, [A-Z][a-z]+(?: [A-Z]\.)?)*',
        decoded,
    )
    if name_list is None:
        return ""
    return name_list.group(0).strip()

# Function to extract FullTextURL
def extract_fulltexturl(text):
    """Return the first http(s) URL found in ``text``, or "" when there is none.

    The text is first passed through ``fix_encoding_issues``. The URL is the
    first "http://" or "https://" followed by non-whitespace characters; no
    particular label (such as "FulltextUrl:") is required in front of it.

    Args:
        text: Text to search.

    Returns:
        The URL with trailing punctuation removed, or an empty string.
    """
    text = fix_encoding_issues(text)  # Fix any encoding issues first

    match = re.search(r'https?://[^\s]+', text)
    if not match:
        return ""

    # "[^\s]+" also captures punctuation that follows the URL in running text
    # ("... see https://example.org/x)."). Strip every trailing character that
    # is not a word character, "/", "=", "&" or "-". The previous cleanup
    # allowed ".", ":" and "?" and so kept a sentence-final period.
    return re.sub(r'[^\w/=&-]+$', '', match.group(0))

In [83]:
# Number of characters of the cleaned text to show in the preview below.
# The label and the slice both use it, so they cannot drift apart (the label
# used to say 1000 while the slice took 10000).
PREVIEW_CHARS = 1000

# Load the JSON file
with open('10.12688_gatesopenres.12751.2.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract all text content. `or ""` also covers a page whose "TextContent" is
# present but null, which would otherwise make join() raise TypeError.
content = " ".join(page.get("TextContent") or "" for page in data.get("Content", []))

# Extract the required fields using the provided functions
title = extract_title(content)
doi = extract_doi(content)
full_text_url = extract_fulltexturl(content)
cleaned_content = cleanse_text(content)

# Collect the extracted information
extracted_data = {
    "Title": title,
    "DOI": doi,
    "FullTextURL": full_text_url,
    "FullTextContent": cleaned_content
}

# Printing the results
print("Title:", extracted_data["Title"])
print("DOI:", extracted_data["DOI"])
print("FullTextURL:", extracted_data["FullTextURL"])
print(
    f"\nFullTextContent (first {PREVIEW_CHARS} characters):\n",
    extracted_data["FullTextContent"][:PREVIEW_CHARS],
)


Title: SOFTWARE TOOL ARTICLE
Point-of-contact Interactive Record Linkage (PIRL): A
software tool to prospectively link demographic surveillance
and health facility data
DOI: 10.12688/gatesopenres.12751.1
FullTextURL: https://doi.org/10.12688/gatesopenres.12751.1

FullTextContent (first 1000 characters):
 SOFTWARE TOOL ARTICLE Point-of-contact Interactive Record Linkage (PIRL): A software tool to prospectively link demographic surveillance and health facility data [version 2; peer review: 2 approved] Christopher T. Rentsch 1, Chodziwadziwa Whiteson Kabudula2, Jason Catlett3, David Beckles 4, Richard Machemba5, Baltazar Mtenga5, Nkosinathi Masilela2, Denna Michael5, Redempta Natalis6, Mark Urassa5, Jim Todd1,5, Basia Zaba 1, Georges Reniers1,2 1Department of Population Health, London School of Hygiene & Tropical Medicine, London, WC1E 7HT, UK 2MRC/Wits Rural Public Health and Health Transitions Research Unit (Agincourt), School of Public Health, Faculty of Health Sciences, University of 

In [84]:
# Save the extracted data into a PDF

from fpdf import FPDF


def _latin1_safe(value):
    """Return ``value`` as text that the built-in Arial font can encode.

    None becomes an empty string (extract_title returns None when no title is
    found). Characters outside Latin-1 are replaced with "?", because the
    built-in fonts are limited to Latin-1 and writing the file would otherwise
    fail with an encoding error.
    """
    return str(value or "").encode("latin-1", "replace").decode("latin-1")


pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()

# Title, centred, in bold 16pt
pdf.set_font("Arial", 'B', 16)
pdf.multi_cell(0, 10, _latin1_safe(extracted_data["Title"]), align='C')

# Metadata lines in bold 12pt
pdf.set_font("Arial", 'B', 12)
pdf.cell(0, 10, _latin1_safe(f"DOI: {extracted_data['DOI']}"), ln=True)
pdf.cell(0, 10, _latin1_safe(f"FullTextURL: {extracted_data['FullTextURL']}"), ln=True)

# Body text in regular 12pt
pdf.set_font("Arial", '', 12)
pdf.multi_cell(0, 10, _latin1_safe(extracted_data["FullTextContent"]))

# Save the PDF file
pdf_file_path = "extracted_data.pdf"
pdf.output(pdf_file_path)
print(f"PDF saved successfully to: {pdf_file_path}")

PDF saved successfully to: extracted_data.pdf
