In [42]:
# Import libraries
import spacy
from nltk.corpus import stopwords
from collections import Counter
import nltk
from scispacy.linking import EntityLinker
import scispacy
from tqdm import tqdm
import re
import unicodedata
import os
import logging
import chardet
from collections import OrderedDict

%matplotlib inline
import matplotlib.pyplot as plt

In [43]:
# nltk.download('punkt')
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_sci_md")
nlp.add_pipe("scispacy_linker", last=True)

print("Setup completed.")

Setup completed.


In [44]:
# reassemble_hyphenated_words
def reassemble_hyphenated_words(text):
    return re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', text)

def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return re.sub(email_pattern, '', text, flags=re.IGNORECASE)

def remove_figures_tables(text):
    return re.sub(r'\b(figures?|tables?)\b', '', text)

def remove_numerical_references(text):
    return re.sub(r'\[\d+\]', '', text)

def remove_citations(text):
    patterns = [
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+[A-Z][a-z]+\s+et\s+al\.',
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+et\s+al\.',
        r'\b[A-Z][a-z]+\s+et\s+al\.',
        r'\(.*?et al\..*?\d{4}.*?\)',
        r'\[.*?\]',
        r'\(\d{4}[a-z]?(?:,\s*\d{4}[a-z]?)*\)',
        r'^.*?\d{4};.*?:\s*\d+.*?$',
        r'^.*?\d{4};.*?:\s*\d+.*?$',  # Matches journal info like "2024;258: 119– 129"
    ]

    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    return text

def remove_headers(text):
    # Remove lines that are all uppercase and end with a colon
    text = re.sub(r'^[A-Z\s]+:$', '', text, flags=re.MULTILINE)
    # Remove lines that start with bullet points
    text = re.sub(r'^\s*•.*$', '', text, flags=re.MULTILINE)
    return text

def remove_metadata(text):
    # Remove headers, copyright info, DOI, received/accepted dates
    patterns = [
        r'^.*?©Copyright.*$',
        r'^DOI:.*$',
        r'^Received:.*$',
        r'^Accepted:.*$',
        r'^Address for Correspondence:.*$',
        r'^E-mail:.*$',
        r'^ORCID-ID:.*$',
        r'^\s*\d+\s*$',  # Page numbers
        r'^.*?ORCID:.*$',
        r'^Cite this article as:.*$',
        r'\[\s*[^\w\s]*\s*\]'
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text

def remove_institution_names(text):
    # Remove institution names (this is a simplified approach and may need refinement)
    pattern = r'\*+[A-Z][A-Za-z\s,]+(University|Institute|Hospital|Department|Faculty)[^\n]*'
    return re.sub(pattern, '', text, flags=re.MULTILINE)

def handle_special_characters(text):
    """Handle special characters and Unicode normalization."""
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    char_map = {
        '´': "'", '‘': "'", '’': "'", '“': '"', '”': '"', '–': '-', '—': '-', '…': '...'
    }
    for char, replacement in char_map.items():
        text = text.replace(char, replacement)
    return text

def remove_copyright_info(text):
    patterns = [
        r'^©.*$',
        r'Copyright.*$',
        r'This is an open access article.*$',
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    return text

def remove_doi_and_journal_info(text):
    patterns = [
        r'DOI:.*$',
        r'^.*?\d{4};\d+:\d+–\d+',  # Matches journal info like "2024;258: 119– 129"
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text

def remove_artifacts(text):
    # Remove license and DOI information
    text = re.sub(r'BY license \(.*?\)\..*?commons\.org/licenses/by/\d\.\d/\s*\)\.', '', text)
    text = re.sub(r'://doi\.org/\d+\.\d+/[^\s]+', '', text )
    text = re.sub(r'://creativecommons\.org/licenses/by/\d\.\d/', '', text)
    text = re.sub(r'://creativecommons\.org/licenses/by-\w+/\d\.\d/', '', text)
    
    
    # Remove unnecessary symbols
    text = re.sub(r'[⁎\]]', '', text)
    
    # Remove empty parentheses and brackets
    text = re.sub(r'\(\s*\)|\[\s*\]', '', text)
    
    # Remove isolated semicolons
    text = re.sub(r'\s*;\s*', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

def detect_sentence_boundaries(text):
    """
    Detect sentence boundaries using spaCy.
    """
    doc = nlp(text)
    spacy_sentences = [sent.text for sent in doc.sents]
    return ' '.join(spacy_sentences)

In [45]:
def count_words(text):
    """Count the number of words in the given text."""
    # Split on word boundaries
    words = re.findall(r'\b[\w\'-]+\b', text)
    # Filter out purely numeric "words"
    return len([word for word in words if not word.isdigit()])

In [46]:
def extract_abstract_and_main_text(text):
    # Possible sections indicating the beginning of the abstract
    abstract_start_patterns = [
        r'\bAbstract\b',
        r'\bBackground\b',
        r'\bPurpose\b'
    ]
    
    # Possible sections indicating the end of the abstract
    abstract_end_patterns = [
        r'\bKeywords\b',
        r'\bIntroduction\b'
    ]
    
    # Initialize abstract boundaries
    abstract_start = None
    abstract_end = None
    
    # Search for the start of the abstract using standard patterns
    for pattern in abstract_start_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            abstract_start = match.end()
            break
    
    # If no start pattern is found, assume the abstract starts after any metadata or headers
    if not abstract_start:
        lines = text.splitlines()
        for line in lines:
            if re.search(r'\bAuthor[s]?|Acknowledgment[s]?|Declaration[s]?\b', line, re.IGNORECASE):
                continue
            if len(line.strip()) > 30 and not re.search(r'\[\d+\]', line):
                abstract_start = text.find(line)
                break

    # Exit if we cannot determine the start of the abstract
    if abstract_start is None:
        return "Abstract not found.", text
    
    # Search for the end of the abstract
    for pattern in abstract_end_patterns:
        match = re.search(pattern, text[abstract_start:], re.IGNORECASE)
        if match:
            abstract_end = abstract_start + match.start()
            break
    
    # Default to the end of the text if no clear end pattern is found
    if not abstract_end:
        abstract_end = len(text)
    
    # Extract the abstract content
    abstract = text[abstract_start:abstract_end].strip()
    
    # Extract the main text starting from the end of the abstract
    main_text = text[abstract_end:].strip()

    # Optionally, limit the abstract to a certain number of words
    max_words = 250
    words = abstract.split()
    if len(words) > max_words:
        abstract = ' '.join(words[:max_words])
    
    return abstract, main_text


In [47]:
def preprocess_text(text):
    try:
        print(f"Starting preprocessing...")
        initial_word_count = count_words(text)
        print(f"Initial word count: {initial_word_count}")

        # Extract abstract
        abstract, main_text = extract_abstract_and_main_text(text)
        print(f"Abstract extracted. Abstract word count: {count_words(abstract)}")
        print(f"Main text word count: {count_words(main_text)}")

        # Step 1: Initial text cleaning
        main_text = reassemble_hyphenated_words(main_text)
        print(f"After reassemble_hyphenated_words: {count_words(main_text)} words")

        main_text = remove_figures_tables(main_text)
        print(f"After remove_figures_tables: {count_words(main_text)} words")

        main_text = remove_citations(main_text)
        print(f"After remove_citations: {count_words(main_text)} words")

        main_text = remove_urls(main_text)
        print(f"After remove_urls: {count_words(main_text)} words")

        main_text = remove_emails(main_text)
        print(f"After remove_emails: {count_words(main_text)} words")

        main_text = remove_numerical_references(main_text)
        print(f"After remove_numerical_references: {count_words(main_text)} words")

        main_text = remove_headers(main_text)
        print(f"After remove_headers: {count_words(main_text)} words")

        main_text = remove_metadata(main_text)
        print(f"After remove_metadata: {count_words(main_text)} words")

        main_text = remove_institution_names(main_text)
        print(f"After remove_institution_names: {count_words(main_text)} words")
        
        main_text = remove_copyright_info(main_text)
        print(f"After remove_copyright_info: {count_words(main_text)} words")

        main_text = remove_doi_and_journal_info(main_text)
        print(f"After remove_doi_and_journal_info: {count_words(main_text)} words")

        main_text = remove_artifacts(main_text)
        print(f"After remove_artifacts: {count_words(main_text)} words")

        # Remove extra whitespace
        main_text = ' '.join(main_text.split())
        print(f"After removing extra whitespace: {count_words(main_text)} words")
    
        main_text = handle_special_characters(main_text)
        print(f"After handle_special_characters: {count_words(main_text)} words")

        main_text = detect_sentence_boundaries(main_text)
        print(f"After detect_sentence_boundaries: {count_words(main_text)} words")

        final_word_count = count_words(main_text)
        print(f"Final preprocessed word count: {final_word_count}")

        if initial_word_count > 0:
            percentage_retained = (final_word_count / initial_word_count) * 100
            print(f"Percentage of words retained after cleaning: {percentage_retained:.2f}%")
        else:
            print("Original text contains no words; cannot calculate percentage.")

        return abstract, main_text.strip()

    except Exception as e:
        logging.error(f"Error in preprocess_text: {e}")
        return None, ""

In [48]:
import os
import chardet
from tqdm import tqdm
import logging

def process_files(input_folder_path: str, output_folder_path: str, batch_size: int = 10) -> None:
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    files = [f for f in os.listdir(input_folder_path) if f.endswith('.txt')]
    total_files = len(files)

    for i in range(0, total_files, batch_size):
        batch = files[i:i+batch_size]
        print(f"\nProcessing batch {i//batch_size + 1} of {(total_files-1)//batch_size + 1}")

        for filename in tqdm(batch, desc=f"Batch {i//batch_size + 1}"):
            input_file_path = os.path.join(input_folder_path, filename)
            output_file_path = os.path.join(output_folder_path, f"processed_{filename}")

            print(f"\nProcessing file: {filename}")

            try:
                with open(input_file_path, 'rb') as file:
                    raw_data = file.read()
                    result = chardet.detect(raw_data)
                    detected_encoding = result['encoding']
                    confidence = result['confidence']

                with open(input_file_path, 'r', encoding=detected_encoding or 'utf-8') as file:
                    original_text = file.read()

                # Extract abstract and clean main text
                abstract, cleaned_text = preprocess_text(original_text)

                # Prepare output text
                output_text = f"Abstract:\n{abstract}\n\nMain Text:\n{cleaned_text}"

                # Write the cleaned text and abstract to the output file
                with open(output_file_path, 'w', encoding='utf-8') as file:
                    file.write(output_text)

            except Exception as e:
                logging.error(f"Error processing file {filename}: {str(e)}")

            print("=" * 100)

        print(f"Completed processing batch {i//batch_size + 1}")

    print("All batches processed.")


In [49]:
def identify_frequent_terms(text, n=10, min_length=3):

    # Tokenize the text into words
    words = re.findall(r'\b[a-zA-Z]{' + str(min_length) + r',}\b', text.lower())
    
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    
    # Count word frequencies
    word_freq = Counter(words)
    
    # Return the n most common words with their counts
    return word_freq.most_common(n)

In [50]:
if __name__ == "__main__":
    # input_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/txt_data"
    input_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/txt_data"
    output_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed"
    process_files(input_folder, output_folder, batch_size=10)
    print("Preprocessing complete.")


Processing batch 1 of 6


Batch 1:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: 10.1177_2515841420954592.txt
Starting preprocessing...
Initial word count: 14885
Abstract extracted. Abstract word count: 177
Main text word count: 14313
After reassemble_hyphenated_words: 14281 words
After remove_figures_tables: 14280 words
After remove_citations: 13116 words
After remove_urls: 12923 words
After remove_emails: 12918 words
After remove_numerical_references: 12918 words
After remove_headers: 12475 words
After remove_metadata: 12475 words
After remove_institution_names: 12475 words
After remove_copyright_info: 12475 words
After remove_doi_and_journal_info: 12471 words
After remove_artifacts: 12471 words
After removing extra whitespace: 12471 words
After handle_special_characters: 12447 words


Batch 1:  10%|█         | 1/10 [00:02<00:23,  2.58s/it]

After detect_sentence_boundaries: 12447 words
Final preprocessed word count: 12447
Percentage of words retained after cleaning: 83.62%

Processing file: 13287_2023_Article_3526.txt
Starting preprocessing...
Initial word count: 6492
Abstract extracted. Abstract word count: 199
Main text word count: 6159
After reassemble_hyphenated_words: 6135 words
After remove_figures_tables: 6135 words
After remove_citations: 5663 words
After remove_urls: 5648 words
After remove_emails: 5645 words
After remove_numerical_references: 5645 words
After remove_headers: 5645 words
After remove_metadata: 5641 words
After remove_institution_names: 5641 words
After remove_copyright_info: 5616 words
After remove_doi_and_journal_info: 5616 words
After remove_artifacts: 5616 words
After removing extra whitespace: 5616 words
After handle_special_characters: 5600 words


Batch 1:  20%|██        | 2/10 [00:03<00:14,  1.83s/it]

After detect_sentence_boundaries: 5600 words
Final preprocessed word count: 5600
Percentage of words retained after cleaning: 86.26%

Processing file: ijms-22-05684.txt
Starting preprocessing...
Initial word count: 14383
Abstract extracted. Abstract word count: 195
Main text word count: 14025
After reassemble_hyphenated_words: 13986 words
After remove_figures_tables: 13986 words
After remove_citations: 13737 words
After remove_urls: 13687 words
After remove_emails: 13687 words
After remove_numerical_references: 13687 words
After remove_headers: 13687 words
After remove_metadata: 13687 words
After remove_institution_names: 13687 words
After remove_copyright_info: 13687 words
After remove_doi_and_journal_info: 13687 words
After remove_artifacts: 13687 words
After removing extra whitespace: 13687 words
After handle_special_characters: 13652 words


Batch 1:  30%|███       | 3/10 [00:06<00:15,  2.26s/it]

After detect_sentence_boundaries: 13652 words
Final preprocessed word count: 13652
Percentage of words retained after cleaning: 94.92%

Processing file: bjophthalmol-2020-315878.txt
Starting preprocessing...
Initial word count: 4383
Abstract extracted. Abstract word count: 198
Main text word count: 4012
After reassemble_hyphenated_words: 3992 words
After remove_figures_tables: 3987 words
After remove_citations: 3716 words
After remove_urls: 3709 words
After remove_emails: 3709 words
After remove_numerical_references: 3709 words
After remove_headers: 3709 words
After remove_metadata: 3709 words
After remove_institution_names: 3709 words
After remove_copyright_info: 3698 words
After remove_doi_and_journal_info: 3698 words
After remove_artifacts: 3698 words
After removing extra whitespace: 3698 words
After handle_special_characters: 3693 words


Batch 1:  40%|████      | 4/10 [00:07<00:10,  1.70s/it]

After detect_sentence_boundaries: 3693 words
Final preprocessed word count: 3693
Percentage of words retained after cleaning: 84.26%

Processing file: iovs-63-2-11.txt
Starting preprocessing...
Initial word count: 5734
Abstract extracted. Abstract word count: 261
Main text word count: 5332
After reassemble_hyphenated_words: 5233 words
After remove_figures_tables: 5233 words
After remove_citations: 4887 words
After remove_urls: 4887 words
After remove_emails: 4887 words
After remove_numerical_references: 4887 words
After remove_headers: 4887 words
After remove_metadata: 4887 words
After remove_institution_names: 4887 words
After remove_copyright_info: 4884 words
After remove_doi_and_journal_info: 4884 words
After remove_artifacts: 4884 words
After removing extra whitespace: 4884 words
After handle_special_characters: 4878 words


Batch 1:  50%|█████     | 5/10 [00:08<00:07,  1.47s/it]

After detect_sentence_boundaries: 4878 words
Final preprocessed word count: 4878
Percentage of words retained after cleaning: 85.07%

Processing file: EMMM-14-e15941-compressed.txt
Starting preprocessing...
Initial word count: 13488
Abstract extracted. Abstract word count: 195
Main text word count: 13237
After reassemble_hyphenated_words: 13162 words
After remove_figures_tables: 13159 words
After remove_citations: 12949 words
After remove_urls: 12925 words
After remove_emails: 12914 words
After remove_numerical_references: 12914 words
After remove_headers: 12914 words
After remove_metadata: 12914 words
After remove_institution_names: 12914 words
After remove_copyright_info: 12906 words
After remove_doi_and_journal_info: 12906 words
After remove_artifacts: 12906 words
After removing extra whitespace: 12906 words
After handle_special_characters: 12837 words


Batch 1:  60%|██████    | 6/10 [00:11<00:07,  1.87s/it]

After detect_sentence_boundaries: 12837 words
Final preprocessed word count: 12837
Percentage of words retained after cleaning: 95.17%

Processing file: jci-133-171356.txt
Starting preprocessing...
Initial word count: 11482
Abstract extracted. Abstract word count: 232
Main text word count: 2129
After reassemble_hyphenated_words: 2109 words
After remove_figures_tables: 2109 words
After remove_citations: 2000 words
After remove_urls: 1995 words
After remove_emails: 1991 words
After remove_numerical_references: 1991 words
After remove_headers: 1991 words
After remove_metadata: 1991 words
After remove_institution_names: 1991 words
After remove_copyright_info: 1991 words
After remove_doi_and_journal_info: 1991 words
After remove_artifacts: 1991 words
After removing extra whitespace: 1991 words
After handle_special_characters: 1974 words


Batch 1:  70%|███████   | 7/10 [00:14<00:07,  2.40s/it]

After detect_sentence_boundaries: 1974 words
Final preprocessed word count: 1974
Percentage of words retained after cleaning: 17.19%

Processing file: 12886_2023_Article_2772-compressed.txt
Starting preprocessing...
Initial word count: 4683
Abstract extracted. Abstract word count: 220
Main text word count: 4078
After reassemble_hyphenated_words: 4067 words
After remove_figures_tables: 4067 words
After remove_citations: 3931 words
After remove_urls: 3930 words
After remove_emails: 3928 words
After remove_numerical_references: 3928 words
After remove_headers: 3928 words
After remove_metadata: 3924 words
After remove_institution_names: 3924 words
After remove_copyright_info: 3924 words
After remove_doi_and_journal_info: 3924 words
After remove_artifacts: 3924 words
After removing extra whitespace: 3924 words
After handle_special_characters: 3913 words


Batch 1:  80%|████████  | 8/10 [00:15<00:03,  1.90s/it]

After detect_sentence_boundaries: 3913 words
Final preprocessed word count: 3913
Percentage of words retained after cleaning: 83.56%

Processing file: genes-12-00147.txt
Starting preprocessing...
Initial word count: 8521
Abstract extracted. Abstract word count: 149
Main text word count: 8123
After reassemble_hyphenated_words: 8091 words
After remove_figures_tables: 8091 words
After remove_citations: 7951 words
After remove_urls: 7934 words
After remove_emails: 7934 words
After remove_numerical_references: 7934 words
After remove_headers: 7934 words
After remove_metadata: 7934 words
After remove_institution_names: 7934 words
After remove_copyright_info: 7934 words
After remove_doi_and_journal_info: 7934 words
After remove_artifacts: 7934 words
After removing extra whitespace: 7934 words
After handle_special_characters: 7895 words


Batch 1:  90%|█████████ | 9/10 [00:17<00:01,  1.81s/it]

After detect_sentence_boundaries: 7895 words
Final preprocessed word count: 7895
Percentage of words retained after cleaning: 92.65%

Processing file: nihms-1927912.txt
Starting preprocessing...
Initial word count: 9916
Abstract extracted. Abstract word count: 109
Main text word count: 9670
After reassemble_hyphenated_words: 9662 words
After remove_figures_tables: 9660 words
After remove_citations: 9498 words
After remove_urls: 9490 words
After remove_emails: 9486 words
After remove_numerical_references: 9486 words
After remove_headers: 9486 words
After remove_metadata: 9486 words
After remove_institution_names: 9486 words
After remove_copyright_info: 9486 words
After remove_doi_and_journal_info: 9486 words
After remove_artifacts: 9486 words
After removing extra whitespace: 9486 words
After handle_special_characters: 9477 words


Batch 1: 100%|██████████| 10/10 [00:19<00:00,  1.92s/it]


After detect_sentence_boundaries: 9477 words
Final preprocessed word count: 9477
Percentage of words retained after cleaning: 95.57%
Completed processing batch 1

Processing batch 2 of 6


Batch 2:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: 13023_2021_Article_2145.txt
Starting preprocessing...
Initial word count: 6208
Abstract extracted. Abstract word count: 133
Main text word count: 6038
After reassemble_hyphenated_words: 6017 words
After remove_figures_tables: 6017 words
After remove_citations: 5779 words
After remove_urls: 5759 words
After remove_emails: 5755 words
After remove_numerical_references: 5755 words
After remove_headers: 5755 words
After remove_metadata: 5751 words
After remove_institution_names: 5751 words
After remove_copyright_info: 5726 words
After remove_doi_and_journal_info: 5726 words
After remove_artifacts: 5726 words
After removing extra whitespace: 5726 words
After handle_special_characters: 5709 words


Batch 2:  10%|█         | 1/10 [00:01<00:10,  1.20s/it]

After detect_sentence_boundaries: 5709 words
Final preprocessed word count: 5709
Percentage of words retained after cleaning: 91.96%

Processing file: jcm-12-06953.txt
Starting preprocessing...
Initial word count: 11423
Abstract extracted. Abstract word count: 176
Main text word count: 11133
After reassemble_hyphenated_words: 11103 words
After remove_figures_tables: 11103 words
After remove_citations: 10854 words
After remove_urls: 10844 words
After remove_emails: 10844 words
After remove_numerical_references: 10844 words
After remove_headers: 10844 words
After remove_metadata: 10844 words
After remove_institution_names: 10844 words
After remove_copyright_info: 10844 words
After remove_doi_and_journal_info: 10844 words
After remove_artifacts: 10844 words
After removing extra whitespace: 10844 words
After handle_special_characters: 10823 words


Batch 2:  20%|██        | 2/10 [00:03<00:14,  1.82s/it]

After detect_sentence_boundaries: 10823 words
Final preprocessed word count: 10823
Percentage of words retained after cleaning: 94.75%

Processing file: IJO-70-2316.txt
Starting preprocessing...
Initial word count: 7953
Abstract extracted. Abstract word count: 133
Main text word count: 7820
After reassemble_hyphenated_words: 7820 words
After remove_figures_tables: 7820 words
After remove_citations: 7508 words
After remove_urls: 7491 words
After remove_emails: 7485 words
After remove_numerical_references: 7485 words
After remove_headers: 7485 words
After remove_metadata: 7456 words
After remove_institution_names: 7456 words
After remove_copyright_info: 7456 words
After remove_doi_and_journal_info: 7456 words
After remove_artifacts: 7456 words
After removing extra whitespace: 7456 words
After handle_special_characters: 7217 words


Batch 2:  30%|███       | 3/10 [00:04<00:11,  1.67s/it]

After detect_sentence_boundaries: 7217 words
Final preprocessed word count: 7217
Percentage of words retained after cleaning: 90.75%

Processing file: TJO-53-44.txt
Starting preprocessing...
Initial word count: 9202
Abstract extracted. Abstract word count: 71
Main text word count: 8960
After reassemble_hyphenated_words: 8951 words
After remove_figures_tables: 8951 words
After remove_citations: 8619 words
After remove_urls: 8619 words
After remove_emails: 8616 words
After remove_numerical_references: 8616 words
After remove_headers: 8169 words
After remove_metadata: 8127 words
After remove_institution_names: 8127 words
After remove_copyright_info: 8125 words
After remove_doi_and_journal_info: 8125 words
After remove_artifacts: 8125 words
After removing extra whitespace: 8125 words
After handle_special_characters: 8122 words


Batch 2:  40%|████      | 4/10 [00:06<00:10,  1.68s/it]

After detect_sentence_boundaries: 8122 words
Final preprocessed word count: 8122
Percentage of words retained after cleaning: 88.26%

Processing file: biomolecules-12-00455-compressed.txt
Starting preprocessing...
Initial word count: 14007
Abstract extracted. Abstract word count: 253
Main text word count: 13463
After reassemble_hyphenated_words: 13366 words
After remove_figures_tables: 13366 words
After remove_citations: 13239 words
After remove_urls: 13223 words
After remove_emails: 13223 words
After remove_numerical_references: 13223 words
After remove_headers: 13223 words
After remove_metadata: 13223 words
After remove_institution_names: 13223 words
After remove_copyright_info: 13223 words
After remove_doi_and_journal_info: 13223 words
After remove_artifacts: 13223 words
After removing extra whitespace: 13223 words
After handle_special_characters: 13092 words


Batch 2:  50%|█████     | 5/10 [00:09<00:10,  2.12s/it]

After detect_sentence_boundaries: 13092 words
Final preprocessed word count: 13092
Percentage of words retained after cleaning: 93.47%

Processing file: 1-s2.0-S1350946223000447-main.txt
Starting preprocessing...
Initial word count: 34518
Abstract extracted. Abstract word count: 182
Main text word count: 34215
After reassemble_hyphenated_words: 34025 words
After remove_figures_tables: 34025 words
After remove_citations: 32695 words
After remove_urls: 32690 words
After remove_emails: 32680 words
After remove_numerical_references: 32680 words
After remove_headers: 32680 words
After remove_metadata: 32680 words
After remove_institution_names: 32680 words
After remove_copyright_info: 32680 words
After remove_doi_and_journal_info: 32680 words
After remove_artifacts: 32680 words
After removing extra whitespace: 32680 words
After handle_special_characters: 32634 words


Batch 2:  60%|██████    | 6/10 [00:18<00:17,  4.32s/it]

After detect_sentence_boundaries: 32634 words
Final preprocessed word count: 32634
Percentage of words retained after cleaning: 94.54%

Processing file: jmedgenet-2016-103837.txt
Starting preprocessing...
Initial word count: 6064
Abstract extracted. Abstract word count: 214
Main text word count: 5720
After reassemble_hyphenated_words: 5693 words
After remove_figures_tables: 5674 words
After remove_citations: 5347 words
After remove_urls: 5338 words
After remove_emails: 5338 words
After remove_numerical_references: 5338 words
After remove_headers: 5338 words
After remove_metadata: 5338 words
After remove_institution_names: 5338 words
After remove_copyright_info: 5327 words
After remove_doi_and_journal_info: 5327 words
After remove_artifacts: 5327 words
After removing extra whitespace: 5327 words
After handle_special_characters: 5322 words


Batch 2:  70%|███████   | 7/10 [00:19<00:09,  3.27s/it]

After detect_sentence_boundaries: 5322 words
Final preprocessed word count: 5322
Percentage of words retained after cleaning: 87.76%

Processing file: 41433_2022_Article_2262.txt
Starting preprocessing...
Initial word count: 3318
Abstract extracted. Abstract word count: 245
Main text word count: 3026
After reassemble_hyphenated_words: 3015 words
After remove_figures_tables: 3015 words
After remove_citations: 2862 words
After remove_urls: 2853 words
After remove_emails: 2848 words
After remove_numerical_references: 2848 words
After remove_headers: 2848 words
After remove_metadata: 2842 words
After remove_institution_names: 2842 words
After remove_copyright_info: 2842 words
After remove_doi_and_journal_info: 2842 words
After remove_artifacts: 2842 words
After removing extra whitespace: 2842 words
After handle_special_characters: 2841 words


Batch 2:  80%|████████  | 8/10 [00:19<00:04,  2.41s/it]

After detect_sentence_boundaries: 2841 words
Final preprocessed word count: 2841
Percentage of words retained after cleaning: 85.62%

Processing file: emss-80329.txt
Starting preprocessing...
Initial word count: 13597
Abstract extracted. Abstract word count: 143
Main text word count: 13399
After reassemble_hyphenated_words: 13378 words
After remove_figures_tables: 13378 words
After remove_citations: 11852 words
After remove_urls: 11852 words
After remove_emails: 11847 words
After remove_numerical_references: 11847 words
After remove_headers: 11847 words
After remove_metadata: 11847 words
After remove_institution_names: 11847 words
After remove_copyright_info: 11847 words
After remove_doi_and_journal_info: 11847 words
After remove_artifacts: 11847 words
After removing extra whitespace: 11847 words
After handle_special_characters: 11814 words


Batch 2:  90%|█████████ | 9/10 [00:22<00:02,  2.48s/it]

After detect_sentence_boundaries: 11814 words
Final preprocessed word count: 11814
Percentage of words retained after cleaning: 86.89%

Processing file: nihms-1747988.txt
Starting preprocessing...
Initial word count: 2102
Abstract extracted. Abstract word count: 191
Main text word count: 1584
After reassemble_hyphenated_words: 1583 words
After remove_figures_tables: 1583 words
After remove_citations: 1495 words
After remove_urls: 1490 words
After remove_emails: 1490 words
After remove_numerical_references: 1490 words
After remove_headers: 1490 words
After remove_metadata: 1490 words
After remove_institution_names: 1490 words
After remove_copyright_info: 1490 words
After remove_doi_and_journal_info: 1490 words
After remove_artifacts: 1490 words
After removing extra whitespace: 1490 words
After handle_special_characters: 1488 words


Batch 2: 100%|██████████| 10/10 [00:22<00:00,  2.27s/it]


After detect_sentence_boundaries: 1488 words
Final preprocessed word count: 1488
Percentage of words retained after cleaning: 70.79%
Completed processing batch 2

Processing batch 3 of 6


Batch 3:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: TJP-600-4623.txt
Starting preprocessing...
Initial word count: 6963
Abstract extracted. Abstract word count: 258
Main text word count: 39
After reassemble_hyphenated_words: 39 words
After remove_figures_tables: 39 words
After remove_citations: 39 words
After remove_urls: 39 words
After remove_emails: 39 words
After remove_numerical_references: 39 words
After remove_headers: 39 words
After remove_metadata: 39 words
After remove_institution_names: 39 words
After remove_copyright_info: 21 words
After remove_doi_and_journal_info: 21 words
After remove_artifacts: 21 words
After removing extra whitespace: 21 words
After handle_special_characters: 21 words
After detect_sentence_boundaries: 21 words
Final preprocessed word count: 21
Percentage of words retained after cleaning: 0.30%

Processing file: main (1).txt
Starting preprocessing...
Initial word count: 5884
Abstract extracted. Abstract word count: 251
Main text word count: 5252
After reassemble_hyphenated_words: 5111 wo

Batch 3:  20%|██        | 2/10 [00:01<00:04,  1.88it/s]

After detect_sentence_boundaries: 4709 words
Final preprocessed word count: 4709
Percentage of words retained after cleaning: 80.03%

Processing file: ijms-22-04534.txt
Starting preprocessing...
Initial word count: 11997
Abstract extracted. Abstract word count: 198
Main text word count: 11401
After reassemble_hyphenated_words: 11346 words
After remove_figures_tables: 11346 words
After remove_citations: 11141 words
After remove_urls: 11131 words
After remove_emails: 11131 words
After remove_numerical_references: 11131 words
After remove_headers: 11131 words
After remove_metadata: 11131 words
After remove_institution_names: 11131 words
After remove_copyright_info: 11131 words
After remove_doi_and_journal_info: 11131 words
After remove_artifacts: 11131 words
After removing extra whitespace: 11131 words
After handle_special_characters: 11101 words


Batch 3:  30%|███       | 3/10 [00:03<00:08,  1.24s/it]

After detect_sentence_boundaries: 11101 words
Final preprocessed word count: 11101
Percentage of words retained after cleaning: 92.53%

Processing file: 41436_2020_Article_759.txt
Starting preprocessing...
Initial word count: 6235
Abstract extracted. Abstract word count: 212
Main text word count: 5964
After reassemble_hyphenated_words: 5941 words
After remove_figures_tables: 5941 words
After remove_citations: 5613 words
After remove_urls: 5587 words
After remove_emails: 5583 words
After remove_numerical_references: 5583 words
After remove_headers: 5583 words
After remove_metadata: 5583 words
After remove_institution_names: 5583 words
After remove_copyright_info: 5567 words
After remove_doi_and_journal_info: 5567 words
After remove_artifacts: 5567 words
After removing extra whitespace: 5567 words
After handle_special_characters: 5566 words


Batch 3:  40%|████      | 4/10 [00:04<00:07,  1.20s/it]

After detect_sentence_boundaries: 5566 words
Final preprocessed word count: 5566
Percentage of words retained after cleaning: 89.27%

Processing file: JCO-34-80.txt
Starting preprocessing...
Initial word count: 4720
Abstract extracted. Abstract word count: 214
Main text word count: 4390
After reassemble_hyphenated_words: 4390 words
After remove_figures_tables: 4389 words
After remove_citations: 4227 words
After remove_urls: 4224 words
After remove_emails: 4218 words
After remove_numerical_references: 4218 words
After remove_headers: 4218 words
After remove_metadata: 4217 words
After remove_institution_names: 4217 words
After remove_copyright_info: 4217 words
After remove_doi_and_journal_info: 4217 words
After remove_artifacts: 4217 words
After removing extra whitespace: 4217 words
After handle_special_characters: 4166 words


Batch 3:  50%|█████     | 5/10 [00:05<00:05,  1.10s/it]

After detect_sentence_boundaries: 4166 words
Final preprocessed word count: 4166
Percentage of words retained after cleaning: 88.26%

Processing file: opth-16-1127.txt
Starting preprocessing...
Initial word count: 5248
Abstract extracted. Abstract word count: 215
Main text word count: 4937
After reassemble_hyphenated_words: 4930 words
After remove_figures_tables: 4928 words
After remove_citations: 4686 words
After remove_urls: 4555 words
After remove_emails: 4555 words
After remove_numerical_references: 4555 words
After remove_headers: 4555 words
After remove_metadata: 4551 words
After remove_institution_names: 4551 words
After remove_copyright_info: 4531 words
After remove_doi_and_journal_info: 4531 words
After remove_artifacts: 4531 words
After removing extra whitespace: 4531 words
After handle_special_characters: 4507 words


Batch 3:  60%|██████    | 6/10 [00:06<00:04,  1.08s/it]

After detect_sentence_boundaries: 4507 words
Final preprocessed word count: 4507
Percentage of words retained after cleaning: 85.88%

Processing file: MGG3-9-e1663.txt
Starting preprocessing...
Initial word count: 5569
Abstract extracted. Abstract word count: 25
Main text word count: 5299
After reassemble_hyphenated_words: 5216 words
After remove_figures_tables: 5216 words
After remove_citations: 5126 words
After remove_urls: 5032 words
After remove_emails: 5032 words
After remove_numerical_references: 5032 words
After remove_headers: 5032 words
After remove_metadata: 5032 words
After remove_institution_names: 5032 words
After remove_copyright_info: 5032 words
After remove_doi_and_journal_info: 5032 words
After remove_artifacts: 5032 words
After removing extra whitespace: 5032 words
After handle_special_characters: 5028 words


Batch 3:  80%|████████  | 8/10 [00:07<00:01,  1.26it/s]

After detect_sentence_boundaries: 5028 words
Final preprocessed word count: 5028
Percentage of words retained after cleaning: 90.29%

Processing file: main copy.txt
Starting preprocessing...
Initial word count: 6262
Abstract extracted. Abstract word count: 224
Main text word count: 854
After reassemble_hyphenated_words: 841 words
After remove_figures_tables: 841 words
After remove_citations: 623 words
After remove_urls: 617 words
After remove_emails: 612 words
After remove_numerical_references: 612 words
After remove_headers: 612 words
After remove_metadata: 612 words
After remove_institution_names: 612 words
After remove_copyright_info: 612 words
After remove_doi_and_journal_info: 612 words
After remove_artifacts: 612 words
After removing extra whitespace: 612 words
After handle_special_characters: 612 words
After detect_sentence_boundaries: 612 words
Final preprocessed word count: 612
Percentage of words retained after cleaning: 9.77%

Processing file: 1-s2.0-S1350946221000367-main.t

Batch 3:  90%|█████████ | 9/10 [00:11<00:01,  1.81s/it]

After detect_sentence_boundaries: 18932 words
Final preprocessed word count: 18932
Percentage of words retained after cleaning: 91.49%

Processing file: fphar-12-654445.txt
Starting preprocessing...
Initial word count: 6259
Abstract extracted. Abstract word count: 210
Main text word count: 6049
After reassemble_hyphenated_words: 6037 words
After remove_figures_tables: 6037 words
After remove_citations: 5751 words
After remove_urls: 5727 words
After remove_emails: 5721 words
After remove_numerical_references: 5721 words
After remove_headers: 5721 words
After remove_metadata: 5717 words
After remove_institution_names: 5717 words
After remove_copyright_info: 5695 words
After remove_doi_and_journal_info: 5695 words
After remove_artifacts: 5695 words
After removing extra whitespace: 5695 words
After handle_special_characters: 5687 words


Batch 3: 100%|██████████| 10/10 [00:12<00:00,  1.30s/it]


After detect_sentence_boundaries: 5687 words
Final preprocessed word count: 5687
Percentage of words retained after cleaning: 90.86%
Completed processing batch 3

Processing batch 4 of 6


Batch 4:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: nihpp-rs3011096v1-compressed.txt
Starting preprocessing...
Initial word count: 14360
Abstract extracted. Abstract word count: 252
Main text word count: 1516
After reassemble_hyphenated_words: 1514 words
After remove_figures_tables: 1514 words
After remove_citations: 1514 words
After remove_urls: 1514 words
After remove_emails: 1514 words
After remove_numerical_references: 1514 words
After remove_headers: 1514 words
After remove_metadata: 1514 words
After remove_institution_names: 1514 words
After remove_copyright_info: 1514 words
After remove_doi_and_journal_info: 1514 words
After remove_artifacts: 1514 words
After removing extra whitespace: 1514 words
After handle_special_characters: 1511 words


Batch 4:  10%|█         | 1/10 [00:00<00:03,  2.46it/s]

After detect_sentence_boundaries: 1511 words
Final preprocessed word count: 1511
Percentage of words retained after cleaning: 10.52%

Processing file: nihms-1914935.txt
Starting preprocessing...
Initial word count: 12092
Abstract extracted. Abstract word count: 251
Main text word count: 1736
After reassemble_hyphenated_words: 1734 words
After remove_figures_tables: 1734 words
After remove_citations: 1699 words
After remove_urls: 1699 words
After remove_emails: 1699 words
After remove_numerical_references: 1699 words
After remove_headers: 1699 words
After remove_metadata: 1699 words
After remove_institution_names: 1699 words
After remove_copyright_info: 1699 words
After remove_doi_and_journal_info: 1699 words
After remove_artifacts: 1699 words
After removing extra whitespace: 1699 words
After handle_special_characters: 1682 words


Batch 4:  20%|██        | 2/10 [00:00<00:03,  2.34it/s]

After detect_sentence_boundaries: 1682 words
Final preprocessed word count: 1682
Percentage of words retained after cleaning: 13.91%

Processing file: genes-14-00074.txt
Starting preprocessing...
Initial word count: 10641
Abstract extracted. Abstract word count: 173
Main text word count: 10281
After reassemble_hyphenated_words: 10239 words
After remove_figures_tables: 10239 words
After remove_citations: 10089 words
After remove_urls: 10038 words
After remove_emails: 10038 words
After remove_numerical_references: 10038 words
After remove_headers: 10038 words
After remove_metadata: 10038 words
After remove_institution_names: 10038 words
After remove_copyright_info: 10038 words
After remove_doi_and_journal_info: 10038 words
After remove_artifacts: 10038 words
After removing extra whitespace: 10038 words
After handle_special_characters: 10012 words


Batch 4:  30%|███       | 3/10 [00:05<00:17,  2.51s/it]

After detect_sentence_boundaries: 10012 words
Final preprocessed word count: 10012
Percentage of words retained after cleaning: 94.09%

Processing file: 1-s2.0-S1350946220300707-main-compressed.txt
Starting preprocessing...
Initial word count: 34597
Abstract extracted. Abstract word count: 249
Main text word count: 34176
After reassemble_hyphenated_words: 34130 words
After remove_figures_tables: 34127 words
After remove_citations: 32814 words
After remove_urls: 32800 words
After remove_emails: 32795 words
After remove_numerical_references: 32795 words
After remove_headers: 32795 words
After remove_metadata: 32795 words
After remove_institution_names: 32795 words
After remove_copyright_info: 32795 words
After remove_doi_and_journal_info: 32795 words
After remove_artifacts: 32795 words
After removing extra whitespace: 32795 words
After handle_special_characters: 32745 words


Batch 4:  40%|████      | 4/10 [00:14<00:29,  4.88s/it]

After detect_sentence_boundaries: 32745 words
Final preprocessed word count: 32745
Percentage of words retained after cleaning: 94.65%

Processing file: Acta Ophthalmologica - 2019 - Holtan - Inherited retinal disease in Norway   a characterization of current clinical and.txt
Starting preprocessing...
Initial word count: 7041
Abstract extracted. Abstract word count: 226
Main text word count: 6707
After reassemble_hyphenated_words: 6600 words
After remove_figures_tables: 6599 words
After remove_citations: 6426 words
After remove_urls: 6318 words
After remove_emails: 6312 words
After remove_numerical_references: 6312 words
After remove_headers: 6312 words
After remove_metadata: 6312 words
After remove_institution_names: 6312 words
After remove_copyright_info: 6312 words
After remove_doi_and_journal_info: 6312 words
After remove_artifacts: 6312 words
After removing extra whitespace: 6312 words
After handle_special_characters: 6295 words


Batch 4:  50%|█████     | 5/10 [00:15<00:18,  3.61s/it]

After detect_sentence_boundaries: 6295 words
Final preprocessed word count: 6295
Percentage of words retained after cleaning: 89.40%

Processing file: ijms-22-07207.txt
Starting preprocessing...
Initial word count: 8901
Abstract extracted. Abstract word count: 193
Main text word count: 8361
After reassemble_hyphenated_words: 8326 words
After remove_figures_tables: 8326 words
After remove_citations: 8168 words
After remove_urls: 8116 words
After remove_emails: 8116 words
After remove_numerical_references: 8116 words
After remove_headers: 8116 words
After remove_metadata: 8116 words
After remove_institution_names: 8116 words
After remove_copyright_info: 8116 words
After remove_doi_and_journal_info: 8116 words
After remove_artifacts: 8116 words
After removing extra whitespace: 8116 words
After handle_special_characters: 8102 words


Batch 4:  60%|██████    | 6/10 [00:17<00:11,  2.96s/it]

After detect_sentence_boundaries: 8102 words
Final preprocessed word count: 8102
Percentage of words retained after cleaning: 91.02%

Processing file: cells-12-02579-compressed.txt
Starting preprocessing...
Initial word count: 14763
Abstract extracted. Abstract word count: 224
Main text word count: 14370
After reassemble_hyphenated_words: 14227 words
After remove_figures_tables: 14227 words
After remove_citations: 14037 words
After remove_urls: 13901 words
After remove_emails: 13901 words
After remove_numerical_references: 13901 words
After remove_headers: 13901 words
After remove_metadata: 13901 words
After remove_institution_names: 13901 words
After remove_copyright_info: 13901 words
After remove_doi_and_journal_info: 13901 words
After remove_artifacts: 13901 words
After removing extra whitespace: 13901 words
After handle_special_characters: 13860 words


Batch 4:  70%|███████   | 7/10 [00:20<00:08,  2.92s/it]

After detect_sentence_boundaries: 13860 words
Final preprocessed word count: 13860
Percentage of words retained after cleaning: 93.88%

Processing file: nihms-1567493.txt
Starting preprocessing...
Initial word count: 5167
Abstract extracted. Abstract word count: 139
Main text word count: 4958
After reassemble_hyphenated_words: 4952 words
After remove_figures_tables: 4952 words
After remove_citations: 4673 words
After remove_urls: 4673 words
After remove_emails: 4669 words
After remove_numerical_references: 4669 words
After remove_headers: 4669 words
After remove_metadata: 4669 words
After remove_institution_names: 4669 words
After remove_copyright_info: 4669 words
After remove_doi_and_journal_info: 4669 words
After remove_artifacts: 4669 words
After removing extra whitespace: 4669 words
After handle_special_characters: 4656 words


Batch 4:  80%|████████  | 8/10 [00:21<00:04,  2.29s/it]

After detect_sentence_boundaries: 4656 words
Final preprocessed word count: 4656
Percentage of words retained after cleaning: 90.11%

Processing file: NRR-18-701.txt
Starting preprocessing...
Initial word count: 9592
Abstract extracted. Abstract word count: 238
Main text word count: 2979
After reassemble_hyphenated_words: 2973 words
After remove_figures_tables: 2973 words
After remove_citations: 2971 words
After remove_urls: 2965 words
After remove_emails: 2965 words
After remove_numerical_references: 2965 words
After remove_headers: 2965 words
After remove_metadata: 2965 words
After remove_institution_names: 2965 words
After remove_copyright_info: 2965 words
After remove_doi_and_journal_info: 2965 words
After remove_artifacts: 2965 words
After removing extra whitespace: 2965 words
After handle_special_characters: 2961 words


Batch 4:  90%|█████████ | 9/10 [00:21<00:01,  1.80s/it]

After detect_sentence_boundaries: 2961 words
Final preprocessed word count: 2961
Percentage of words retained after cleaning: 30.87%

Processing file: 13023_2023_Article_2798.txt
Starting preprocessing...
Initial word count: 8984
Abstract extracted. Abstract word count: 249
Main text word count: 8349
After reassemble_hyphenated_words: 8321 words
After remove_figures_tables: 8319 words
After remove_citations: 7897 words
After remove_urls: 7889 words
After remove_emails: 7889 words
After remove_numerical_references: 7889 words
After remove_headers: 7818 words
After remove_metadata: 7814 words
After remove_institution_names: 7814 words
After remove_copyright_info: 7814 words
After remove_doi_and_journal_info: 7814 words
After remove_artifacts: 7814 words
After removing extra whitespace: 7814 words
After handle_special_characters: 7791 words


Batch 4: 100%|██████████| 10/10 [00:23<00:00,  2.35s/it]


After detect_sentence_boundaries: 7791 words
Final preprocessed word count: 7791
Percentage of words retained after cleaning: 86.72%
Completed processing batch 4

Processing batch 5 of 6


Batch 5:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: PIIS0039625723001030.txt
Starting preprocessing...
Initial word count: 13222
Abstract extracted. Abstract word count: 242
Main text word count: 5202
After reassemble_hyphenated_words: 5189 words
After remove_figures_tables: 5189 words
After remove_citations: 4701 words
After remove_urls: 4645 words
After remove_emails: 4645 words
After remove_numerical_references: 4645 words
After remove_headers: 4645 words
After remove_metadata: 4645 words
After remove_institution_names: 4645 words
After remove_copyright_info: 4645 words
After remove_doi_and_journal_info: 4645 words
After remove_artifacts: 4645 words
After removing extra whitespace: 4645 words
After handle_special_characters: 4633 words


Batch 5:  10%|█         | 1/10 [00:01<00:09,  1.10s/it]

After detect_sentence_boundaries: 4633 words
Final preprocessed word count: 4633
Percentage of words retained after cleaning: 35.04%

Processing file: diagnostics-13-00850.txt
Starting preprocessing...
Initial word count: 5979
Abstract extracted. Abstract word count: 203
Main text word count: 5540
After reassemble_hyphenated_words: 5530 words
After remove_figures_tables: 5530 words
After remove_citations: 5446 words
After remove_urls: 5423 words
After remove_emails: 5423 words
After remove_numerical_references: 5423 words
After remove_headers: 5423 words
After remove_metadata: 5423 words
After remove_institution_names: 5423 words
After remove_copyright_info: 5423 words
After remove_doi_and_journal_info: 5423 words
After remove_artifacts: 5423 words
After removing extra whitespace: 5423 words
After handle_special_characters: 5413 words


Batch 5:  20%|██        | 2/10 [00:02<00:08,  1.11s/it]

After detect_sentence_boundaries: 5413 words
Final preprocessed word count: 5413
Percentage of words retained after cleaning: 90.53%

Processing file: nihms880229.txt
Starting preprocessing...
Initial word count: 11916
Abstract extracted. Abstract word count: 232
Main text word count: 11076
After reassemble_hyphenated_words: 11064 words
After remove_figures_tables: 11063 words
After remove_citations: 10618 words
After remove_urls: 10609 words
After remove_emails: 10609 words
After remove_numerical_references: 10609 words
After remove_headers: 10609 words
After remove_metadata: 10609 words
After remove_institution_names: 10609 words
After remove_copyright_info: 10609 words
After remove_doi_and_journal_info: 10609 words
After remove_artifacts: 10609 words
After removing extra whitespace: 10609 words
After handle_special_characters: 10609 words


Batch 5:  30%|███       | 3/10 [00:04<00:10,  1.54s/it]

After detect_sentence_boundaries: 10609 words
Final preprocessed word count: 10609
Percentage of words retained after cleaning: 89.03%

Processing file: 41598_2021_Article_81093.txt
Starting preprocessing...
Initial word count: 9355
Abstract extracted. Abstract word count: 258
Main text word count: 4741
After reassemble_hyphenated_words: 4736 words
After remove_figures_tables: 4736 words
After remove_citations: 4732 words
After remove_urls: 4629 words
After remove_emails: 4629 words
After remove_numerical_references: 4629 words
After remove_headers: 4629 words
After remove_metadata: 4629 words
After remove_institution_names: 4629 words
After remove_copyright_info: 4609 words
After remove_doi_and_journal_info: 4609 words
After remove_artifacts: 4609 words
After removing extra whitespace: 4609 words
After handle_special_characters: 4541 words


Batch 5:  40%|████      | 4/10 [00:05<00:07,  1.31s/it]

After detect_sentence_boundaries: 4541 words
Final preprocessed word count: 4541
Percentage of words retained after cleaning: 48.54%

Processing file: main.txt
Starting preprocessing...
Initial word count: 10486
Abstract extracted. Abstract word count: 273
Main text word count: 10212
After reassemble_hyphenated_words: 10088 words
After remove_figures_tables: 10087 words
After remove_citations: 10082 words
After remove_urls: 10014 words
After remove_emails: 10014 words
After remove_numerical_references: 10014 words
After remove_headers: 10014 words
After remove_metadata: 10010 words
After remove_institution_names: 10010 words
After remove_copyright_info: 10010 words
After remove_doi_and_journal_info: 10010 words
After remove_artifacts: 10010 words
After removing extra whitespace: 10010 words
After handle_special_characters: 9993 words


Batch 5:  50%|█████     | 5/10 [00:07<00:07,  1.59s/it]

After detect_sentence_boundaries: 9993 words
Final preprocessed word count: 9993
Percentage of words retained after cleaning: 95.30%

Processing file: nihms-1933615.txt
Starting preprocessing...
Initial word count: 4011
Abstract extracted. Abstract word count: 146
Main text word count: 3796
After reassemble_hyphenated_words: 3793 words
After remove_figures_tables: 3793 words
After remove_citations: 3570 words
After remove_urls: 3552 words
After remove_emails: 3549 words
After remove_numerical_references: 3549 words
After remove_headers: 3549 words
After remove_metadata: 3549 words
After remove_institution_names: 3549 words
After remove_copyright_info: 3549 words
After remove_doi_and_journal_info: 3549 words
After remove_artifacts: 3549 words
After removing extra whitespace: 3549 words
After handle_special_characters: 3538 words


Batch 5:  60%|██████    | 6/10 [00:08<00:05,  1.28s/it]

After detect_sentence_boundaries: 3538 words
Final preprocessed word count: 3538
Percentage of words retained after cleaning: 88.21%

Processing file: nihms-1685213.txt
Starting preprocessing...
Initial word count: 10197
Abstract extracted. Abstract word count: 212
Main text word count: 9641
After reassemble_hyphenated_words: 9627 words
After remove_figures_tables: 9627 words
After remove_citations: 8539 words
After remove_urls: 8539 words
After remove_emails: 8539 words
After remove_numerical_references: 8539 words
After remove_headers: 8539 words
After remove_metadata: 8539 words
After remove_institution_names: 8539 words
After remove_copyright_info: 8539 words
After remove_doi_and_journal_info: 8539 words
After remove_artifacts: 8539 words
After removing extra whitespace: 8539 words
After handle_special_characters: 8531 words


Batch 5:  70%|███████   | 7/10 [00:09<00:04,  1.47s/it]

After detect_sentence_boundaries: 8531 words
Final preprocessed word count: 8531
Percentage of words retained after cleaning: 83.66%

Processing file: diagnostics-13-02413-compressed.txt
Starting preprocessing...
Initial word count: 16898
Abstract extracted. Abstract word count: 186
Main text word count: 16455
After reassemble_hyphenated_words: 16376 words
After remove_figures_tables: 16376 words
After remove_citations: 16136 words
After remove_urls: 16124 words
After remove_emails: 16124 words
After remove_numerical_references: 16124 words
After remove_headers: 16124 words
After remove_metadata: 16124 words
After remove_institution_names: 16124 words
After remove_copyright_info: 16117 words
After remove_doi_and_journal_info: 16117 words
After remove_artifacts: 16117 words
After removing extra whitespace: 16117 words
After handle_special_characters: 16064 words


Batch 5:  80%|████████  | 8/10 [00:13<00:04,  2.05s/it]

After detect_sentence_boundaries: 16064 words
Final preprocessed word count: 16064
Percentage of words retained after cleaning: 95.06%

Processing file: 41525_2021_Article_180.txt
Starting preprocessing...
Initial word count: 6287
Abstract extracted. Abstract word count: 248
Main text word count: 6025
After reassemble_hyphenated_words: 6000 words
After remove_figures_tables: 6000 words
After remove_citations: 5970 words
After remove_urls: 5890 words
After remove_emails: 5887 words
After remove_numerical_references: 5887 words
After remove_headers: 5887 words
After remove_metadata: 5887 words
After remove_institution_names: 5887 words
After remove_copyright_info: 5872 words
After remove_doi_and_journal_info: 5872 words
After remove_artifacts: 5872 words
After removing extra whitespace: 5872 words
After handle_special_characters: 5872 words


Batch 5:  90%|█████████ | 9/10 [00:14<00:01,  1.80s/it]

After detect_sentence_boundaries: 5872 words
Final preprocessed word count: 5872
Percentage of words retained after cleaning: 93.40%

Processing file: fgene-13-858556.txt
Starting preprocessing...
Initial word count: 6769
Abstract extracted. Abstract word count: 256
Main text word count: 5521
After reassemble_hyphenated_words: 5518 words
After remove_figures_tables: 5518 words
After remove_citations: 5306 words
After remove_urls: 5285 words
After remove_emails: 5285 words
After remove_numerical_references: 5285 words
After remove_headers: 5285 words
After remove_metadata: 5285 words
After remove_institution_names: 5285 words
After remove_copyright_info: 5256 words
After remove_doi_and_journal_info: 5256 words
After remove_artifacts: 5256 words
After removing extra whitespace: 5256 words
After handle_special_characters: 5255 words


Batch 5: 100%|██████████| 10/10 [00:15<00:00,  1.55s/it]


After detect_sentence_boundaries: 5255 words
Final preprocessed word count: 5255
Percentage of words retained after cleaning: 77.63%
Completed processing batch 5

Processing batch 6 of 6


Batch 6:   0%|          | 0/1 [00:00<?, ?it/s]


Processing file: biomolecules-13-00271.txt
Starting preprocessing...
Initial word count: 31269
Abstract extracted. Abstract word count: 118
Main text word count: 30950
After reassemble_hyphenated_words: 30866 words
After remove_figures_tables: 30866 words
After remove_citations: 30230 words
After remove_urls: 30212 words
After remove_emails: 30212 words
After remove_numerical_references: 30212 words
After remove_headers: 30212 words
After remove_metadata: 30212 words
After remove_institution_names: 30212 words
After remove_copyright_info: 30212 words
After remove_doi_and_journal_info: 30212 words
After remove_artifacts: 30212 words
After removing extra whitespace: 30212 words
After handle_special_characters: 30125 words


Batch 6: 100%|██████████| 1/1 [00:06<00:00,  6.57s/it]

After detect_sentence_boundaries: 30125 words
Final preprocessed word count: 30125
Percentage of words retained after cleaning: 96.34%
Completed processing batch 6
All batches processed.
Preprocessing complete.



