In [1]:
# Import libraries
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import nltk
from scispacy.linking import EntityLinker
import scispacy
from tqdm import tqdm
import re
import unicodedata
import os
import logging
import chardet
from scipy.spatial.distance import cosine

%matplotlib inline
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm
Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [2]:
# nltk.download('punkt')
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_sci_md")
nlp.add_pipe("scispacy_linker", last=True)

print("Setup completed.")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Setup completed.


In [3]:
def reassemble_hyphenated_words(text):
    return re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', text)

def remove_figures_tables(text):
    return re.sub(r'\b(figures?|tables?)\b', '', text)

def remove_numerical_references(text):
    return re.sub(r'\[\d+\]', '', text)

def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return re.sub(email_pattern, '', text, flags=re.IGNORECASE)

def remove_citations(text):
    patterns = [
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+[A-Z][a-z]+\s+et\s+al\.',
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+et\s+al\.',
        r'\b[A-Z][a-z]+\s+et\s+al\.',
        r'\(.*?et al\..*?\d{4}.*?\)',
        r'\[.*?\]',
        r'\(\d{4}[a-z]?(?:,\s*\d{4}[a-z]?)*\)',
        r'^.*?\d{4};.*?:\s*\d+.*?$',
        r'^.*?\d{4};.*?:\s*\d+.*?$',  # Matches journal info like "2024;258: 119– 129"
    ]

    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    return text

def remove_headers(text):
    # Remove lines that are all uppercase and end with a colon
    text = re.sub(r'^[A-Z\s]+:$', '', text, flags=re.MULTILINE)
    # Remove lines that start with bullet points
    text = re.sub(r'^\s*•.*$', '', text, flags=re.MULTILINE)
    return text

def remove_metadata(text):
    # Remove headers, copyright info, DOI, received/accepted dates
    patterns = [
        r'^.*?©Copyright.*$',
        r'^DOI:.*$',
        r'^Received:.*$',
        r'^Accepted:.*$',
        r'^Address for Correspondence:.*$',
        r'^E-mail:.*$',
        r'^ORCID-ID:.*$',
        r'^\s*\d+\s*$',  # Page numbers
        r'^.*?ORCID:.*$',
        r'^Cite this article as:.*$',
        r'\[\s*[^\w\s]*\s*\]'
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text

def remove_institution_names(text):
    # Remove institution names (this is a simplified approach and may need refinement)
    pattern = r'\*+[A-Z][A-Za-z\s,]+(University|Institute|Hospital|Clinic|Department|Faculty)[^\n]*'
    return re.sub(pattern, '', text, flags=re.MULTILINE)

def remove_copyright_info(text):
    patterns = [
        r'^©.*$',
        r'Copyright.*$',
        r'This is an open access article.*$',
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    return text

def remove_doi_and_journal_info(text):
    patterns = [
        r'DOI:.*$',
        r'^.*?\d{4};\d+:\d+–\d+',  # Matches journal info like "2024;258: 119– 129"
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text

def remove_references(text):
    # Remove lines that look like references
    text = re.sub(r'^.*?\d{4};.*?:\s*\d+.*?$', '', text, flags=re.MULTILINE)
    return text

def remove_artifacts(text):
    # Remove license and DOI information
    text = re.sub(r'BY license \(.*?\)\..*?commons\.org/licenses/by/\d\.\d/\s*\)\.', '', text)
    text = re.sub(r'://doi\.org/\d+\.\d+/[^\s]+', '', text)
    text = re.sub(r'://creativecommons\.org/licenses/by/\d\.\d/', '', text)
    text = re.sub(r'://creativecommons\.org/licenses/by-\w+/\d\.\d/', '', text)
    
    
    # Remove unnecessary symbols
    text = re.sub(r'[⁎\]]', '', text)
    
    # Remove empty parentheses and brackets
    text = re.sub(r'\(\s*\)|\[\s*\]', '', text)
    
    # Remove isolated semicolons
    text = re.sub(r'\s*;\s*', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

In [4]:
def detect_sentence_boundaries(text):
    """
    Detect sentence boundaries using spaCy.
    """
    doc = nlp(text)
    spacy_sentences = [sent.text for sent in doc.sents]
    return ' '.join(spacy_sentences)

In [5]:
def tokenize_text(text):
    """
    Tokenize the text using spaCy.
    """
    doc = nlp(text)
    return [token.text for token in doc]

In [6]:
def preprocess_text(text, tokenize=False):
    try:

        print(f"Starting preprocessing. Initial text length: {len(text)}")

        # Step 1: Initial text cleaning
        text = reassemble_hyphenated_words(text)
        print(f"After reassemble_hyphenated_words: {len(text)} chars")

        text = remove_figures_tables(text)
        print(f"After remove_figures_tables: {len(text)} chars")

        text = remove_citations(text)
        print(f"After remove_citations: {len(text)} chars")

        text = remove_urls(text)
        print(f"After remove_urls: {len(text)} chars")

        text = remove_emails(text)
        print(f"After remove_emails: {len(text)} chars")

        text = remove_numerical_references(text)
        print(f"After remove_numerical_references: {len(text)} chars")

        text = remove_headers(text)
        print(f"After remove_headers: {len(text)} chars")

        text = remove_references(text)
        print(f"After remove_references: {len(text)} chars")

        text = remove_metadata(text)
        print(f"After remove_metadata: {len(text)} chars")

        text = remove_institution_names(text)
        print(f"After remove_institution_names: {len(text)} chars")
        
        text = remove_copyright_info(text)
        print(f"After remove_copyright_info: {len(text)} chars")

        text = remove_doi_and_journal_info(text)
        print(f"After remove_doi_and_journal_info: {len(text)} chars")

        text = remove_artifacts(text)
        print(f"After remove_artifacts: {len(text)} chars")

        # Step 2: Sentence boundary detection
        text = detect_sentence_boundaries(text)
        print(f"After detect_sentence_boundaries: {len(text)} chars")

        # Remove extra whitespace
        text = ' '.join(text.split())
        print(f"After removing extra whitespace: {len(text)} chars")
    
        if tokenize:
            tokens = tokenize_text(text)
            print(f"After tokenization: {len(tokens)} tokens")
            return tokens
        else:
            print("Text preprocessing complete.")
            return text.strip()

    except Exception as e:
        logging.error(f"Error in preprocess_text: {e}")
        print(f"Error in preprocess_text: {e}")
        return ""

In [7]:
def count_words(text):
    """Count the number of words in the given text."""
    return len(re.findall(r'\w+', text))

In [8]:
def split_document(text):
    # Process the document with spaCy
    doc = nlp(text)
    # Split the document into paragraphs
    paragraphs = [para.text for para in doc.sents]
    return paragraphs

In [9]:
def calculate_similarity(para, target):
    # Convert the paragraph and target text to spaCy tokens
    para_doc = nlp(para)
    target_doc = nlp(target)
    # Calculate the cosine similarity between the embeddings
    similarity = para_doc.similarity(target_doc)
    return similarity


In [10]:
def detect_abstract_section(paragraphs, start_targets=["Abstract", "Background", "Methods", "Results", "Conclusion"], end_targets=["Keywords", "Introduction"], threshold=0.9):
    start_index = None
    end_index = None
    min_abstract_length = 120  # Minimum number of words in the abstract
    max_start_search = 3  # Limit the number of paragraphs to consider if no heading is found

    for i, para in enumerate(paragraphs):
        # Check for any of the start targets
        for start_target in start_targets:
            start_similarity = calculate_similarity(para, start_target)
            if start_similarity > threshold and start_index is None:
                start_index = i
                print(f"Start of abstract detected at paragraph {i}: {para[:30]} with start target '{start_target}'...")
                break  # Found the start, no need to check other start targets

        # If no heading is found within the first few paragraphs, assume the first paragraph is the start
        if start_index is None and i < max_start_search:
            start_index = 0
            print(f"No start heading detected, assuming paragraph 0 as the start of the abstract.")
            break

        # Once start is detected, look for the end target
        if start_index is not None and i >= start_index:
            for end_target in end_targets:
                end_similarity = calculate_similarity(para, end_target)
                if end_similarity > threshold:
                    end_index = i
                    print(f"End of abstract detected at paragraph {i}: {para[:30]} with end target '{end_target}'...")
                    break  # Found the end, no need to check other end targets
            if end_index is not None:
                break

    # Extract and validate the abstract
    if start_index is not None and end_index is not None:
        abstract = "\n".join(paragraphs[start_index:end_index])
        # Check if the abstract is too short
        if count_words(abstract) < min_abstract_length:
            print("Detected abstract is too short. Rechecking...")
            abstract = ""  # Discard or handle short abstracts
    elif start_index is not None:
        # If no end target is found, but start is identified, take a few paragraphs as abstract
        abstract = "\n".join(paragraphs[start_index:start_index + max_start_search])
        print(f"Taking first {max_start_search} paragraphs as abstract since no end target found.")
    else:
        abstract = ""  # No abstract found
    return abstract


In [11]:
def extract_abstract_from_document(text):
    # Preprocess the text
    cleaned_text = preprocess_text(text)
    # Split the document into paragraphs
    paragraphs = split_document(cleaned_text)
    # Detect and extract the abstract section
    abstract = detect_abstract_section(paragraphs)
    return abstract

In [12]:
def process_files(input_folder_path: str, output_folder_path: str, batch_size: int = 10, tokenize=False) -> None:
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
    
    files = [f for f in os.listdir(input_folder_path) if f.endswith('.txt')]
    total_files = len(files)
    
    for i in range(0, total_files, batch_size):
        batch = files[i:i+batch_size]
        print(f"\nProcessing batch {i//batch_size + 1} of {(total_files-1)//batch_size + 1}")
        
        for filename in tqdm(batch, desc=f"Batch {i//batch_size + 1}"):
            input_file_path = os.path.join(input_folder_path, filename)
            output_file_path = os.path.join(output_folder_path, f"preprocessed_{filename}")
            abstract_output_path = os.path.join(output_folder_path, f"abstract_{filename}")
            
            print(f"\nProcessing file: {filename}")
            
            try:
                # Detect file encoding
                with open(input_file_path, 'rb') as file:
                    raw_data = file.read()
                    result = chardet.detect(raw_data)
                    detected_encoding = result['encoding']
                    confidence = result['confidence']
                
                print(f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})")
                
                # Try reading with detected encoding
                try:
                    with open(input_file_path, 'r', encoding=detected_encoding) as file:
                        original_text = file.read()
                    print(f"Successfully read file with {detected_encoding} encoding.")
                except UnicodeDecodeError:
                    print(f"Failed to read with {detected_encoding}. Trying UTF-8...")
                    with open(input_file_path, 'r', encoding='utf-8') as file:
                        original_text = file.read()
                    print("Successfully read file with UTF-8 encoding.")
                
                original_word_count = count_words(original_text)
                print(f"Original word count: {original_word_count}")
                
                cleaned_text = preprocess_text(original_text, tokenize=tokenize)
                
                # Extract the abstract
                abstract = extract_abstract_from_document(original_text)
                print(f"Extracted abstract: {len(abstract)} characters")

                if tokenize:
                    cleaned_word_count = len(cleaned_text)  # cleaned_text is now a list of tokens
                    # Save tokens, one per line
                    with open(output_file_path, 'w', encoding='utf-8') as file:
                        file.write('\n'.join(cleaned_text))
                else:
                    cleaned_word_count = count_words(cleaned_text)
                    with open(output_file_path, 'w', encoding='utf-8') as file:
                        file.write(cleaned_text)
                
                # Save the abstract to a separate file
                with open(abstract_output_path, 'w', encoding='utf-8') as file:
                    file.write(abstract)
                
                print(f"Abstract saved to: {abstract_output_path}")
                print(f"Cleaned text saved to: {output_file_path}")

                # Calculate and print the percentage of cleaned text
                if original_word_count > 0:
                    percentage_retained = (cleaned_word_count / original_word_count) * 100
                    print(f"Percentage of words retained after cleaning: {percentage_retained:.2f}%")
                else:
                    print("Original text contains no words; cannot calculate percentage.")
            
            except FileNotFoundError:
                print(f"File not found: {input_file_path}")
            except Exception as e:
                print(f"Error processing file {filename}: {str(e)}")
            
            print("=" * 100)
        
        print(f"Completed processing batch {i//batch_size + 1}")
    
    print("All batches processed.")


In [14]:
if __name__ == "__main__":
    input_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/data_text_sample"
    output_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed"
    abstract_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/abstracts"
    process_files(input_folder, output_folder, batch_size=10, tokenize=False)
    print("Preprocessing complete.")


Processing batch 1 of 1


Batch 1:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: 13287_2023_Article_3526.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 7885
Starting preprocessing. Initial text length: 47767
After reassemble_hyphenated_words: 47713 chars
After remove_figures_tables: 47713 chars
After remove_citations: 43117 chars
After remove_urls: 42951 chars
After remove_emails: 42929 chars
After remove_numerical_references: 42929 chars
After remove_headers: 42920 chars
After remove_references: 42920 chars
After remove_metadata: 42866 chars
After remove_institution_names: 42866 chars
After remove_copyright_info: 42696 chars
After remove_doi_and_journal_info: 42696 chars
After remove_artifacts: 41770 chars
After detect_sentence_boundaries: 41773 chars
After removing extra whitespace: 41773 chars
Text preprocessing complete.
Starting preprocessing. Initial text length: 47767
After reassemble_hyphenated_words: 47713 chars
After remove_figures_tables: 47713 chars
After remove_citation

Batch 1:  10%|█         | 1/10 [00:04<00:42,  4.75s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 626 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_13287_2023_Article_3526.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_13287_2023_Article_3526.txt
Percentage of words retained after cleaning: 86.79%

Processing file: EMMM-14-e15941-compressed.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 15087
Starting preprocessing. Initial text length: 93780
After reassemble_hyphenated_words: 93624 chars
After remove_figures_tables: 93606 chars
After remove_citations: 91854 chars
After remove_urls: 91709 chars
After remove_emails: 91642 chars
After remove_numerical_references: 91642 chars
After remove_headers: 91642 chars
After remove_references: 91642 chars
After remove_metadata: 916

Batch 1:  20%|██        | 2/10 [00:12<00:53,  6.70s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 968 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_EMMM-14-e15941-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_EMMM-14-e15941-compressed.txt
Percentage of words retained after cleaning: 97.29%

Processing file: 12886_2023_Article_2772-compressed.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 5451
Starting preprocessing. Initial text length: 32368
After reassemble_hyphenated_words: 32344 chars
After remove_figures_tables: 32344 chars
After remove_citations: 30858 chars
After remove_urls: 30795 chars
After remove_emails: 30782 chars
After remove_numerical_references: 30782 chars
After remove_headers: 30772 chars
After remove_references: 30772 chars
After remove_m

Batch 1:  30%|███       | 3/10 [00:15<00:34,  4.92s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 736 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_12886_2023_Article_2772-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_12886_2023_Article_2772-compressed.txt
Percentage of words retained after cleaning: 93.45%

Processing file: 13023_2021_Article_2145.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6776
Starting preprocessing. Initial text length: 44620
After reassemble_hyphenated_words: 44570 chars
After remove_figures_tables: 44570 chars
After remove_citations: 41943 chars
After remove_urls: 41771 chars
After remove_emails: 41753 chars
After remove_numerical_references: 41753 chars
After remove_headers: 41743 chars
After remove_references: 41743 chars
After r

Batch 1:  40%|████      | 4/10 [00:19<00:26,  4.38s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 620 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_13023_2021_Article_2145.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_13023_2021_Article_2145.txt
Percentage of words retained after cleaning: 91.12%

Processing file: 41433_2022_Article_2262.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 3707
Starting preprocessing. Initial text length: 22967
After reassemble_hyphenated_words: 22945 chars
After remove_figures_tables: 22945 chars
After remove_citations: 21575 chars
After remove_urls: 21461 chars
After remove_emails: 21439 chars
After remove_numerical_references: 21439 chars
After remove_headers: 21439 chars
After remove_references: 21439 chars
After remove_metadata: 21351 

Batch 1:  50%|█████     | 5/10 [00:20<00:17,  3.46s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 508 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_41433_2022_Article_2262.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_41433_2022_Article_2262.txt
Percentage of words retained after cleaning: 91.99%

Processing file: emss-80329.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 15309
Starting preprocessing. Initial text length: 100603
After reassemble_hyphenated_words: 100554 chars
After remove_figures_tables: 100554 chars
After remove_citations: 87764 chars
After remove_urls: 87764 chars
After remove_emails: 87745 chars
After remove_numerical_references: 87745 chars
After remove_headers: 87745 chars
After remove_references: 87745 chars
After remove_metadata: 87745 chars
Aft

Batch 1:  60%|██████    | 6/10 [00:29<00:20,  5.10s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 854 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_emss-80329.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_emss-80329.txt
Percentage of words retained after cleaning: 85.66%

Processing file: 41436_2020_Article_759.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6909
Starting preprocessing. Initial text length: 44036
After reassemble_hyphenated_words: 43988 chars
After remove_figures_tables: 43988 chars
After remove_citations: 41017 chars
After remove_urls: 40797 chars
After remove_emails: 40764 chars
After remove_numerical_references: 40764 chars
After remove_headers: 40764 chars
After remove_references: 40764 chars
After remove_metadata: 40705 chars
After remove_institut

Batch 1:  70%|███████   | 7/10 [00:36<00:17,  5.68s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 851 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_41436_2020_Article_759.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_41436_2020_Article_759.txt
Percentage of words retained after cleaning: 91.61%

Processing file: 13023_2023_Article_2798.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 9875
Starting preprocessing. Initial text length: 65921
After reassemble_hyphenated_words: 65855 chars
After remove_figures_tables: 65842 chars
After remove_citations: 60900 chars
After remove_urls: 60793 chars
After remove_emails: 60763 chars
After remove_numerical_references: 60763 chars
After remove_headers: 60228 chars
After remove_references: 60228 chars
After remove_metadata: 60179 ch

Batch 1:  80%|████████  | 8/10 [00:41<00:11,  5.59s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 736 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_13023_2023_Article_2798.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_13023_2023_Article_2798.txt
Percentage of words retained after cleaning: 88.61%

Processing file: diagnostics-13-00850.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6913
Starting preprocessing. Initial text length: 41753
After reassemble_hyphenated_words: 41729 chars
After remove_figures_tables: 41729 chars
After remove_citations: 40843 chars
After remove_urls: 40629 chars
After remove_emails: 40595 chars
After remove_numerical_references: 40595 chars
After remove_headers: 40595 chars
After remove_references: 40595 chars
After remove_metadata: 40543 cha

Batch 1:  90%|█████████ | 9/10 [00:45<00:05,  5.02s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 311 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_diagnostics-13-00850.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_diagnostics-13-00850.txt
Percentage of words retained after cleaning: 97.08%

Processing file: diagnostics-13-02413-compressed.txt
Detected encoding: MacRoman (confidence: 0.68)
Successfully read file with MacRoman encoding.
Original word count: 18516
Starting preprocessing. Initial text length: 116515
After reassemble_hyphenated_words: 116351 chars
After remove_figures_tables: 116351 chars
After remove_citations: 113399 chars
After remove_urls: 113265 chars
After remove_emails: 113098 chars
After remove_numerical_references: 113098 chars
After remove_headers: 113098 chars
After remove_references: 113098 chars
After remov

Batch 1: 100%|██████████| 10/10 [00:55<00:00,  5.52s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 581 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_diagnostics-13-02413-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_diagnostics-13-02413-compressed.txt
Percentage of words retained after cleaning: 96.62%
Completed processing batch 1
All batches processed.
Preprocessing complete.



