# TL;DR – Too Long, Doctor

TL;DR is an ML model designed to synthesize and cluster scientific papers. Tailored for both students and researchers seeking to optimize their study time, TL;DR provides a tool to quickly grasp the essence of complex scientific material. Additionally, it caters to those who desire a concise summary or a preliminary overview of a paper before delving into a detailed reading.

# Importing libraries

In [1]:
# Import library to extract data from XML file
import xml.etree.ElementTree as ET
import os
import json
import re
import logging
import string

In [2]:
import torch
from transformers import BertTokenizer

In [3]:
# Emit INFO-and-above records, each prefixed with a timestamp and its severity level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Dataset Generation (from XML to JSON)

In [4]:
def extract_information_from_xml(xml_path):
    """
    Extract the title, abstract, keywords and body sections of a paper from an XML file.

    Parameters:
    xml_path (str): Path to the XML file.

    Returns:
    dict: A dictionary containing the extracted information, with separated abstract sub-layers:
        'Title' (str), 'Abstract' (dict with 'Simple Summary' and 'Detailed Abstract' strings),
        'Sections' (list of {'Title': str, 'Content': str}) and 'Keywords' (list of str).
        None is returned when the file cannot be read or parsed (the error is printed).
    """
    try:
        # Parse the XML file
        root = ET.parse(xml_path).getroot()

        # Initialize a dictionary to hold the extracted information
        extracted_info = {
            'Title': '',
            'Abstract': {'Simple Summary': '', 'Detailed Abstract': ''},
            'Sections': [],
            'Keywords': []
        }

        # Extract Title (itertext keeps text nested in inline markup such as <italic>)
        title_element = root.find('.//article-title')
        if title_element is not None:
            extracted_info['Title'] = ''.join(title_element.itertext())

        # Extract Abstracts
        abstract_element = root.find('.//abstract')
        if abstract_element is not None:
            sec_elements = abstract_element.findall('.//sec')
            for sec in sec_elements:
                sec_title_element = sec.find('.//title')
                if sec_title_element is not None:
                    section_title = ''.join(sec_title_element.itertext()).strip()
                else:
                    section_title = ""
                section_text = ''.join(sec.itertext()).strip()

                # Remove the section title from the beginning of the section text
                if section_text.startswith(section_title):
                    section_text = section_text[len(section_title):].strip()

                if 'simple summary' in section_title.lower():
                    extracted_info['Abstract']['Simple Summary'] = section_text
                elif extracted_info['Abstract']['Detailed Abstract']:
                    # Append further sections to the text collected so far
                    extracted_info['Abstract']['Detailed Abstract'] += ' ' + section_text
                else:
                    extracted_info['Abstract']['Detailed Abstract'] = section_text

            if not sec_elements:
                # Unstructured abstract (paragraphs directly under <abstract>):
                # without this branch its text would be dropped entirely.
                abstract_text = ''.join(abstract_element.itertext()).strip()
                heading_element = abstract_element.find('title')
                if heading_element is not None:
                    heading = ''.join(heading_element.itertext()).strip()
                    if heading and abstract_text.startswith(heading):
                        abstract_text = abstract_text[len(heading):].strip()
                extracted_info['Abstract']['Detailed Abstract'] = abstract_text

        # Extract Keywords; itertext is used because kwd.text is None (or partial)
        # when the keyword is wrapped in inline markup such as <italic>.
        for kwd_group in root.findall('.//kwd-group'):
            for kwd in kwd_group.findall('.//kwd'):
                keyword = ''.join(kwd.itertext()).strip()
                if keyword:
                    extracted_info['Keywords'].append(keyword)

        # Extract Sections (nested <sec> elements are visited as well)
        for sec in root.findall('.//body//sec'):
            # Only a direct child <title> belongs to this section. A descendant
            # search could return a nested title (e.g. a figure caption), which
            # sec.remove() rejects with ValueError because it is not a direct child.
            section_title_element = sec.find('title')
            if section_title_element is not None:
                section_title = ''.join(section_title_element.itertext())
                # Remove the title element to avoid repetition in the content
                sec.remove(section_title_element)
            else:
                section_title = "No Title"

            # Extracting content after removing the title
            section_content = ''.join(sec.itertext()).strip()
            extracted_info['Sections'].append({
                'Title': section_title,
                'Content': section_content
            })

        return extracted_info
    except ET.ParseError as e:
        print(f"XML parsing error: {e}")
        return None
    except Exception as e:
        # Best-effort contract: any other failure (e.g. missing file) also yields None
        print(f"Unexpected error: {e}")
        return None

In [8]:
def extract_information_and_write_to_json_file(xml_path, target_directory='./data/json/'):
    """
    Extracts Title, Abstract, Body, and Keyword Group from scientific papers in XML format and writes the information to a JSON file.
    The JSON file will have the same name as the XML file, but with a .json extension.

    Parameters:
    xml_path (str): Path to the XML file.
    target_directory (str): Directory the JSON file is written to; created if missing.
        Defaults to './data/json/'.

    Returns:
    str: Path of the written JSON file, or None if extraction or writing failed.
    """
    try:
        # Use the previously defined function to extract information
        extracted_info = extract_information_from_xml(xml_path)

        # Extraction signals failure with None. Writing it out would produce a
        # JSON file containing just `null`, which later stages cannot process.
        if extracted_info is None:
            print(f"Skipping {xml_path}: no information could be extracted")
            return None

        os.makedirs(target_directory, exist_ok=True)

        # Generate the output JSON file path by changing the extension
        base_filename = os.path.basename(os.path.splitext(xml_path)[0])
        output_json_path = os.path.join(target_directory, base_filename + '.json')

        # Write the dictionary to a JSON file
        with open(output_json_path, 'w', encoding='utf-8') as json_file:
            json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)

        print(f"JSON file has been written to {output_json_path}")
        return output_json_path
    except Exception as e:
        # Best-effort batch step: report the problem and let the caller continue
        print(f"Error during JSON file writing: {e}")
        return None

# Folder that holds the source papers in XML format
xml_directory = "./data/xml/"

# Keep only the directory entries whose name ends in .xml
xml_files = [entry for entry in os.listdir(xml_directory) if entry.endswith('.xml')]

# Convert every XML paper into its JSON counterpart
for xml_file in xml_files:
    extract_information_and_write_to_json_file(os.path.join(xml_directory, xml_file))

JSON file has been written to ./data/json/PMC10000009.json
JSON file has been written to ./data/json/PMC10000035.json
JSON file has been written to ./data/json/PMC10000034.json
JSON file has been written to ./data/json/PMC10000036.json
JSON file has been written to ./data/json/PMC10000022.json
JSON file has been written to ./data/json/PMC10000023.json
JSON file has been written to ./data/json/PMC10000037.json
JSON file has been written to ./data/json/PMC10000033.json
JSON file has been written to ./data/json/PMC10000027.json
JSON file has been written to ./data/json/PMC10000026.json
JSON file has been written to ./data/json/PMC10000032.json
JSON file has been written to ./data/json/PMC10000024.json
JSON file has been written to ./data/json/PMC10000030.json
JSON file has been written to ./data/json/PMC10000031.json
JSON file has been written to ./data/json/PMC10000025.json
JSON file has been written to ./data/json/PMC10000081.json
JSON file has been written to ./data/json/PMC10000095.js

# Preprocessing (from JSON to cleaned JSON that can be used to train the model)

In [9]:
def clean_html_tags(text):
    """
    Remove HTML/XML tags from the given text.

    Only spans that look like markup are removed: comments (<!-- ... -->) and
    tags whose name starts with a letter, optionally preceded by '/', '!' or '?'.
    Comparison operators in running text (e.g. "p < 0.05 and n > 3") are kept.

    Parameters:
    - text (str): Text to clean.

    Returns:
    str: Text with HTML/XML tags removed.
    """
    # Requiring a letter right after '<' stops "a < b ... c > d" from being
    # treated as one giant tag and deleted, which a plain '<[^>]+>' would do.
    clean_text = re.sub(r'<!--.*?-->|<[/!?]?[A-Za-z][^<>]*>', '', text, flags=re.DOTALL)
    return clean_text

In [10]:
def standardize_special_characters(text):
    """
    Map typographic punctuation onto plain ASCII equivalents.

    Curly double quotes become '"', curly single quotes/apostrophes become "'",
    and the long (em) dash becomes a hyphen. All other characters are left as is.

    Parameters:
    - text (str): Text to process.

    Returns:
    str: Text with standardized special characters.
    """
    # One-to-one character substitutions, applied in a single pass
    substitutions = {
        '“': '"',
        '”': '"',
        '‘': "'",
        '’': "'",
        '—': '-',
    }
    return text.translate(str.maketrans(substitutions))

In [11]:
def replace_figures_tables_references(text):
    """
    Strip in-text mentions of numbered figures and tables.

    A mention is the word "Figure", "Fig." or "Table" followed by whitespace
    and a number; each one is deleted, surrounding text is left untouched.

    Parameters:
    - text (str): Text to process.

    Returns:
    str: Text with references to figures and tables handled.
    """
    # Matches e.g. "Figure 1", "Fig. 1", "Table 1"
    reference_pattern = re.compile(r'(Figure|Fig\.|Table)\s+\d+')
    return reference_pattern.sub('', text)

In [12]:
def remove_bibliography_references(text):
    """
    Remove references to the bibliography in the text, typically formatted as [1] or [1,2] or [1-3].

    Whitespace inside the brackets (e.g. "[1, 2]") and typographic range dashes
    (en dash "[1–3]", em dash) are accepted as well. Bracketed content that is
    not purely a list of numbers/ranges (e.g. "[n=3]") is left untouched.

    Parameters:
    - text (str): Text to process.

    Returns:
    str: Text with bibliography references removed.
    """
    # A single number or a range such as 1-3 / 1–3
    number_or_range = r'\d+(?:\s*[-\u2013\u2014]\s*\d+)?'
    # One or more of those, comma-separated, inside square brackets
    citation_pattern = r'\[\s*' + number_or_range + r'(?:\s*,\s*' + number_or_range + r')*\s*\]'
    text_without_references = re.sub(citation_pattern, '', text)

    return text_without_references

In [13]:
def normalize_whitespace(text):
    """
    Collapse every run of whitespace (spaces, tabs, line breaks) into one space
    and drop whitespace at the start and end of the text.

    Parameters:
    - text (str): Text to process.

    Returns:
    str: Text with normalized whitespace.
    """
    # split() with no arguments cuts on whitespace runs and ignores leading and
    # trailing whitespace, so re-joining with single spaces normalizes the text.
    return ' '.join(text.split())

In [14]:
def preprocess_json(original_json_path, output_folder):
    """
    Turn an extracted-paper JSON file into a flattened, cleaned JSON file.

    The contents of all entries in 'Sections' are concatenated, passed through
    the text-cleaning helpers and stored under 'Body'. 'Title' and 'Abstract'
    are copied over, and 'Keywords' too when present. The result is written to
    output_folder under the same file name as the input.

    Parameters:
    - original_json_path: Path to the original JSON file.
    - output_folder: Folder where the processed JSON should be saved.

    Returns:
    Path of the written file, or None if reading or processing failed
    (the failure is logged at ERROR level).
    """
    # Stage 1: ensure the destination folder exists and load the source document
    try:
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        with open(original_json_path, 'r', encoding='utf-8') as source:
            data = json.load(source)
    except Exception as e:
        logging.error(f"Error reading {original_json_path}: {e}")
        return None

    # Stage 2: build the cleaned record and write it out
    try:
        # Join all section contents, separated by double newlines
        body = "\n\n".join(section['Content'] for section in data['Sections'])

        # Cleaning steps, applied in this order
        cleaning_steps = (
            clean_html_tags,
            standardize_special_characters,
            replace_figures_tables_references,
            remove_bibliography_references,
            normalize_whitespace,
        )
        for step in cleaning_steps:
            body = step(body)

        # The list of sections is replaced by the single cleaned 'Body' string
        processed_data = {
            'Title': data['Title'],
            'Abstract': data['Abstract'],
            'Body': body,
        }

        # Keywords are optional in the input
        if 'Keywords' in data:
            processed_data['Keywords'] = data['Keywords']

        # Same file name as the input, placed in the output folder
        file_name = os.path.basename(original_json_path)
        output_path = os.path.join(output_folder, file_name)

        with open(output_path, 'w', encoding='utf-8') as outfile:
            json.dump(processed_data, outfile, ensure_ascii=False, indent=4)

        logging.info(f"Processed file saved to {output_path}")
    except Exception as e:
        logging.error(f"Error processing {original_json_path}: {e}")
        return None

    return output_path

In [15]:
# Input folder (extracted JSON) and output folder (cleaned JSON)
json_directory = "./data/json/"
json_processed_output_directory = "./data/json_processed"

# Keep only the directory entries whose name ends in .json
json_files = [entry for entry in os.listdir(json_directory) if entry.endswith('.json')]

# Clean every extracted paper and store the result in the output folder
for json_file in json_files:
    preprocess_json(os.path.join(json_directory, json_file), json_processed_output_directory)

2024-03-07 18:09:12,123 - INFO - Processed file saved to ./data/json_processed/PMC10000054.json
2024-03-07 18:09:12,125 - INFO - Processed file saved to ./data/json_processed/PMC10000111.json
2024-03-07 18:09:12,126 - INFO - Processed file saved to ./data/json_processed/PMC10000003.json
2024-03-07 18:09:12,128 - INFO - Processed file saved to ./data/json_processed/PMC10000015.json
2024-03-07 18:09:12,130 - INFO - Processed file saved to ./data/json_processed/PMC10000042.json
2024-03-07 18:09:12,134 - INFO - Processed file saved to ./data/json_processed/PMC10000107.json
2024-03-07 18:09:12,136 - INFO - Processed file saved to ./data/json_processed/PMC10000039.json
2024-03-07 18:09:12,139 - INFO - Processed file saved to ./data/json_processed/PMC10000081.json
2024-03-07 18:09:12,144 - INFO - Processed file saved to ./data/json_processed/PMC10000097.json
2024-03-07 18:09:12,148 - INFO - Processed file saved to ./data/json_processed/PMC10000078.json
2024-03-07 18:09:12,151 - INFO - Process