In [12]:
# Import libraries
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import nltk
from scispacy.linking import EntityLinker
import scispacy
from tqdm import tqdm
import re
import unicodedata
import os
import logging
import chardet
from scipy.spatial.distance import cosine

%matplotlib inline
import matplotlib.pyplot as plt

In [13]:
# nltk.download('punkt')
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_sci_md")
nlp.add_pipe("scispacy_linker", last=True)

print("Setup completed.")

Setup completed.


In [14]:
def reassemble_hyphenated_words(text):
    return re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', text)

def remove_figures_tables(text):
    return re.sub(r'\b(figures?|tables?)\b', '', text)

def remove_numerical_references(text):
    return re.sub(r'\[\d+\]', '', text)

def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return re.sub(email_pattern, '', text, flags=re.IGNORECASE)

def remove_citations(text):
    patterns = [
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+[A-Z][a-z]+\s+et\s+al\.',
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+et\s+al\.',
        r'\b[A-Z][a-z]+\s+et\s+al\.',
        r'\(.*?et al\..*?\d{4}.*?\)',
        r'\[.*?\]',
        r'\(\d{4}[a-z]?(?:,\s*\d{4}[a-z]?)*\)',
        r'^.*?\d{4};.*?:\s*\d+.*?$',
        r'^.*?\d{4};.*?:\s*\d+.*?$',  # Matches journal info like "2024;258: 119– 129"
    ]

    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    return text

def remove_headers(text):
    # Remove lines that are all uppercase and end with a colon
    text = re.sub(r'^[A-Z\s]+:$', '', text, flags=re.MULTILINE)
    # Remove lines that start with bullet points
    text = re.sub(r'^\s*•.*$', '', text, flags=re.MULTILINE)
    return text

def remove_metadata(text):
    # Remove headers, copyright info, DOI, received/accepted dates
    patterns = [
        r'^.*?©Copyright.*$',
        r'^DOI:.*$',
        r'^Received:.*$',
        r'^Accepted:.*$',
        r'^Address for Correspondence:.*$',
        r'^E-mail:.*$',
        r'^ORCID-ID:.*$',
        r'^\s*\d+\s*$',  # Page numbers
        r'^.*?ORCID:.*$',
        r'^Cite this article as:.*$',
        r'\[\s*[^\w\s]*\s*\]'
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text

def remove_institution_names(text):
    # Remove institution names (this is a simplified approach and may need refinement)
    pattern = r'\*+[A-Z][A-Za-z\s,]+(University|Institute|Hospital|Clinic|Department|Faculty)[^\n]*'
    return re.sub(pattern, '', text, flags=re.MULTILINE)

def remove_copyright_info(text):
    patterns = [
        r'^©.*$',
        r'Copyright.*$',
        r'This is an open access article.*$',
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    return text

def remove_doi_and_journal_info(text):
    patterns = [
        r'DOI:.*$',
        r'^.*?\d{4};\d+:\d+–\d+',  # Matches journal info like "2024;258: 119– 129"
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text

def remove_references(text):
    # Remove lines that look like references
    text = re.sub(r'^.*?\d{4};.*?:\s*\d+.*?$', '', text, flags=re.MULTILINE)
    return text

def remove_artifacts(text):
    # Remove license and DOI information
    text = re.sub(r'BY license \(.*?\)\..*?commons\.org/licenses/by/\d\.\d/\s*\)\.', '', text)
    text = re.sub(r'://doi\.org/\d+\.\d+/[^\s]+', '', text)
    text = re.sub(r'://creativecommons\.org/licenses/by/\d\.\d/', '', text)
    text = re.sub(r'://creativecommons\.org/licenses/by-\w+/\d\.\d/', '', text)
    
    
    # Remove unnecessary symbols
    text = re.sub(r'[⁎\]]', '', text)
    
    # Remove empty parentheses and brackets
    text = re.sub(r'\(\s*\)|\[\s*\]', '', text)
    
    # Remove isolated semicolons
    text = re.sub(r'\s*;\s*', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

In [15]:
def detect_sentence_boundaries(text):
    """
    Detect sentence boundaries using spaCy.
    """
    doc = nlp(text)
    spacy_sentences = [sent.text for sent in doc.sents]
    return ' '.join(spacy_sentences)

In [16]:
def tokenize_text(text):
    """
    Tokenize the text using spaCy.
    """
    doc = nlp(text)
    return [token.text for token in doc]

In [17]:
def preprocess_text(text, tokenize=False):
    try:

        print(f"Starting preprocessing. Initial text length: {len(text)}")

        # Step 1: Initial text cleaning
        text = reassemble_hyphenated_words(text)
        print(f"After reassemble_hyphenated_words: {len(text)} chars")

        text = remove_figures_tables(text)
        print(f"After remove_figures_tables: {len(text)} chars")

        text = remove_citations(text)
        print(f"After remove_citations: {len(text)} chars")

        text = remove_urls(text)
        print(f"After remove_urls: {len(text)} chars")

        text = remove_emails(text)
        print(f"After remove_emails: {len(text)} chars")

        text = remove_numerical_references(text)
        print(f"After remove_numerical_references: {len(text)} chars")

        text = remove_headers(text)
        print(f"After remove_headers: {len(text)} chars")

        text = remove_references(text)
        print(f"After remove_references: {len(text)} chars")

        text = remove_metadata(text)
        print(f"After remove_metadata: {len(text)} chars")

        text = remove_institution_names(text)
        print(f"After remove_institution_names: {len(text)} chars")
        
        text = remove_copyright_info(text)
        print(f"After remove_copyright_info: {len(text)} chars")

        text = remove_doi_and_journal_info(text)
        print(f"After remove_doi_and_journal_info: {len(text)} chars")

        text = remove_artifacts(text)
        print(f"After remove_artifacts: {len(text)} chars")

        # Step 2: Sentence boundary detection
        text = detect_sentence_boundaries(text)
        print(f"After detect_sentence_boundaries: {len(text)} chars")

        # Remove extra whitespace
        text = ' '.join(text.split())
        print(f"After removing extra whitespace: {len(text)} chars")
    
        if tokenize:
            tokens = tokenize_text(text)
            print(f"After tokenization: {len(tokens)} tokens")
            return tokens
        else:
            print("Text preprocessing complete.")
            return text.strip()

    except Exception as e:
        logging.error(f"Error in preprocess_text: {e}")
        print(f"Error in preprocess_text: {e}")
        return ""

In [18]:
def count_words(text):
    """Count the number of words in the given text."""
    return len(re.findall(r'\w+', text))

In [19]:
def split_document(text):
    # Process the document with spaCy
    doc = nlp(text)
    # Split the document into paragraphs
    paragraphs = [para.text for para in doc.sents]
    return paragraphs

In [20]:
def calculate_similarity(para, target):
    # Convert the paragraph and target text to spaCy tokens
    para_doc = nlp(para)
    target_doc = nlp(target)
    # Calculate the cosine similarity between the embeddings
    similarity = para_doc.similarity(target_doc)
    return similarity


In [21]:
def detect_abstract_section(paragraphs, start_targets=["Abstract", "Background", "Methods", "Results", "Conclusion"], end_targets=["Keywords", "Introduction"], threshold=0.9):
    start_index = None
    end_index = None
    min_abstract_length = 120  # Minimum number of words in the abstract
    max_start_search = 3  # Limit the number of paragraphs to consider if no heading is found

    for i, para in enumerate(paragraphs):
        # Check for any of the start targets
        for start_target in start_targets:
            start_similarity = calculate_similarity(para, start_target)
            if start_similarity > threshold and start_index is None:
                start_index = i
                print(f"Start of abstract detected at paragraph {i}: {para[:30]} with start target '{start_target}'...")
                break  # Found the start, no need to check other start targets

        # If no heading is found within the first few paragraphs, assume the first paragraph is the start
        if start_index is None and i < max_start_search:
            start_index = 0
            print(f"No start heading detected, assuming paragraph 0 as the start of the abstract.")
            break

        # Once start is detected, look for the end target
        if start_index is not None and i >= start_index:
            for end_target in end_targets:
                end_similarity = calculate_similarity(para, end_target)
                if end_similarity > threshold:
                    end_index = i
                    print(f"End of abstract detected at paragraph {i}: {para[:30]} with end target '{end_target}'...")
                    break  # Found the end, no need to check other end targets
            if end_index is not None:
                break

    # Extract and validate the abstract
    if start_index is not None and end_index is not None:
        abstract = "\n".join(paragraphs[start_index:end_index])
        # Check if the abstract is too short
        if count_words(abstract) < min_abstract_length:
            print("Detected abstract is too short. Rechecking...")
            abstract = ""  # Discard or handle short abstracts
    elif start_index is not None:
        # If no end target is found, but start is identified, take a few paragraphs as abstract
        abstract = "\n".join(paragraphs[start_index:start_index + max_start_search])
        print(f"Taking first {max_start_search} paragraphs as abstract since no end target found.")
    else:
        abstract = ""  # No abstract found
    return abstract


In [22]:
def extract_abstract_from_document(text):
    # Preprocess the text
    cleaned_text = preprocess_text(text)
    # Split the document into paragraphs
    paragraphs = split_document(cleaned_text)
    # Detect and extract the abstract section
    abstract = detect_abstract_section(paragraphs)
    return abstract

In [23]:
def process_files(input_folder_path: str, output_folder_path: str, batch_size: int = 10, tokenize=False) -> None:
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
    
    files = [f for f in os.listdir(input_folder_path) if f.endswith('.txt')]
    total_files = len(files)
    
    for i in range(0, total_files, batch_size):
        batch = files[i:i+batch_size]
        print(f"\nProcessing batch {i//batch_size + 1} of {(total_files-1)//batch_size + 1}")
        
        for filename in tqdm(batch, desc=f"Batch {i//batch_size + 1}"):
            input_file_path = os.path.join(input_folder_path, filename)
            output_file_path = os.path.join(output_folder_path, f"preprocessed_{filename}")
            abstract_output_path = os.path.join(output_folder_path, f"abstract_{filename}")
            
            print(f"\nProcessing file: {filename}")
            
            try:
                # Detect file encoding
                with open(input_file_path, 'rb') as file:
                    raw_data = file.read()
                    result = chardet.detect(raw_data)
                    detected_encoding = result['encoding']
                    confidence = result['confidence']
                
                print(f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})")
                
                # Try reading with detected encoding
                try:
                    with open(input_file_path, 'r', encoding=detected_encoding) as file:
                        original_text = file.read()
                    print(f"Successfully read file with {detected_encoding} encoding.")
                except UnicodeDecodeError:
                    print(f"Failed to read with {detected_encoding}. Trying UTF-8...")
                    with open(input_file_path, 'r', encoding='utf-8') as file:
                        original_text = file.read()
                    print("Successfully read file with UTF-8 encoding.")
                
                original_word_count = count_words(original_text)
                print(f"Original word count: {original_word_count}")
                
                cleaned_text = preprocess_text(original_text, tokenize=tokenize)
                
                # Extract the abstract
                abstract = extract_abstract_from_document(original_text)
                print(f"Extracted abstract: {len(abstract)} characters")

                if tokenize:
                    cleaned_word_count = len(cleaned_text)  # cleaned_text is now a list of tokens
                    # Save tokens, one per line
                    with open(output_file_path, 'w', encoding='utf-8') as file:
                        file.write('\n'.join(cleaned_text))
                else:
                    cleaned_word_count = count_words(cleaned_text)
                    with open(output_file_path, 'w', encoding='utf-8') as file:
                        file.write(cleaned_text)
                
                # Save the abstract to a separate file
                with open(abstract_output_path, 'w', encoding='utf-8') as file:
                    file.write(abstract)
                
                print(f"Abstract saved to: {abstract_output_path}")
                print(f"Cleaned text saved to: {output_file_path}")

                # Calculate and print the percentage of cleaned text
                if original_word_count > 0:
                    percentage_retained = (cleaned_word_count / original_word_count) * 100
                    print(f"Percentage of words retained after cleaning: {percentage_retained:.2f}%")
                else:
                    print("Original text contains no words; cannot calculate percentage.")
            
            except FileNotFoundError:
                print(f"File not found: {input_file_path}")
            except Exception as e:
                print(f"Error processing file {filename}: {str(e)}")
            
            print("=" * 100)
        
        print(f"Completed processing batch {i//batch_size + 1}")
    
    print("All batches processed.")


In [24]:
if __name__ == "__main__":
    input_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/data_txt"
    output_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed"
    abstract_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/abstracts"
    process_files(input_folder, output_folder, batch_size=10, tokenize=False)
    print("Preprocessing complete.")


Processing batch 1 of 5


Batch 1:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: 10.1177_2515841420954592.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 16402
Starting preprocessing. Initial text length: 105320
After reassemble_hyphenated_words: 105250 chars
After remove_figures_tables: 105245 chars
After remove_citations: 95236 chars
After remove_urls: 93717 chars
After remove_emails: 93697 chars
After remove_numerical_references: 93697 chars
After remove_headers: 90845 chars
After remove_references: 90845 chars
After remove_metadata: 90762 chars
After remove_institution_names: 90762 chars
After remove_copyright_info: 90696 chars
After remove_doi_and_journal_info: 90631 chars
After remove_artifacts: 89740 chars
After detect_sentence_boundaries: 89755 chars
After removing extra whitespace: 89755 chars
Text preprocessing complete.
Starting preprocessing. Initial text length: 105320
After reassemble_hyphenated_words: 105250 chars
After remove_figures_tables: 105245 chars
After remove_

Batch 1:  10%|█         | 1/10 [00:07<01:03,  7.06s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 691 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_10.1177_2515841420954592.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_10.1177_2515841420954592.txt
Percentage of words retained after cleaning: 84.92%

Processing file: 13287_2023_Article_3526.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 7885
Starting preprocessing. Initial text length: 47767
After reassemble_hyphenated_words: 47713 chars
After remove_figures_tables: 47713 chars
After remove_citations: 43117 chars
After remove_urls: 42951 chars
After remove_emails: 42929 chars
After remove_numerical_references: 42929 chars
After remove_headers: 42920 chars
After remove_references: 42920 chars
After remove_metadata: 4286

Batch 1:  20%|██        | 2/10 [00:10<00:40,  5.03s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 626 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_13287_2023_Article_3526.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_13287_2023_Article_3526.txt
Percentage of words retained after cleaning: 86.79%

Processing file: ijms-22-05684.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 16125
Starting preprocessing. Initial text length: 100326
After reassemble_hyphenated_words: 100246 chars
After remove_figures_tables: 100246 chars
After remove_citations: 97199 chars
After remove_urls: 96771 chars
After remove_emails: 96702 chars
After remove_numerical_references: 96702 chars
After remove_headers: 96702 chars
After remove_references: 96702 chars
After remove_metadata: 96658 chars


Batch 1:  30%|███       | 3/10 [00:18<00:43,  6.15s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 491 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_ijms-22-05684.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_ijms-22-05684.txt
Percentage of words retained after cleaning: 96.50%

Processing file: bjophthalmol-2020-315878.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6001
Starting preprocessing. Initial text length: 33663
After reassemble_hyphenated_words: 33622 chars
After remove_figures_tables: 33595 chars
After remove_citations: 31336 chars
After remove_urls: 31285 chars
After remove_emails: 31285 chars
After remove_numerical_references: 31285 chars
After remove_headers: 31285 chars
After remove_references: 31285 chars
After remove_metadata: 31278 chars
After remove_

Batch 1:  40%|████      | 4/10 [00:20<00:28,  4.69s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 555 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_bjophthalmol-2020-315878.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_bjophthalmol-2020-315878.txt
Percentage of words retained after cleaning: 92.38%

Processing file: iovs-63-2-11.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6651
Starting preprocessing. Initial text length: 43804
After reassemble_hyphenated_words: 43596 chars
After remove_figures_tables: 43596 chars
After remove_citations: 40175 chars
After remove_urls: 40129 chars
After remove_emails: 40116 chars
After remove_numerical_references: 40116 chars
After remove_headers: 40116 chars
After remove_references: 40116 chars
After remove_metadata: 40069 chars
Aft

Batch 1:  50%|█████     | 5/10 [00:23<00:20,  4.11s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1515 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_iovs-63-2-11.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_iovs-63-2-11.txt
Percentage of words retained after cleaning: 89.61%

Processing file: EMMM-14-e15941-compressed.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 15063
Starting preprocessing. Initial text length: 93723
After reassemble_hyphenated_words: 93567 chars
After remove_figures_tables: 93549 chars
After remove_citations: 91797 chars
After remove_urls: 91652 chars
After remove_emails: 91585 chars
After remove_numerical_references: 91585 chars
After remove_headers: 91585 chars
After remove_references: 91585 chars
After remove_metadata: 91584 chars
After remove

Batch 1:  60%|██████    | 6/10 [00:33<00:24,  6.08s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 931 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_EMMM-14-e15941-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_EMMM-14-e15941-compressed.txt
Percentage of words retained after cleaning: 97.28%

Processing file: jci-133-171356.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 13131
Starting preprocessing. Initial text length: 80850
After reassemble_hyphenated_words: 80599 chars
After remove_figures_tables: 80599 chars
After remove_citations: 78792 chars
After remove_urls: 78240 chars
After remove_emails: 78202 chars
After remove_numerical_references: 78202 chars
After remove_headers: 78202 chars
After remove_references: 78202 chars
After remove_metadata: 78202 char

Batch 1:  70%|███████   | 7/10 [00:39<00:18,  6.11s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 652 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_jci-133-171356.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_jci-133-171356.txt
Percentage of words retained after cleaning: 95.36%

Processing file: 12886_2023_Article_2772-compressed.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 5451
Starting preprocessing. Initial text length: 32368
After reassemble_hyphenated_words: 32344 chars
After remove_figures_tables: 32344 chars
After remove_citations: 30858 chars
After remove_urls: 30795 chars
After remove_emails: 30782 chars
After remove_numerical_references: 30782 chars
After remove_headers: 30772 chars
After remove_references: 30772 chars
After remove_metadata: 30721 chars
A

Batch 1:  80%|████████  | 8/10 [00:42<00:09,  4.97s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 736 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_12886_2023_Article_2772-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_12886_2023_Article_2772-compressed.txt
Percentage of words retained after cleaning: 93.45%

Processing file: genes-12-00147.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 9530
Starting preprocessing. Initial text length: 62297
After reassemble_hyphenated_words: 62231 chars
After remove_figures_tables: 62231 chars
After remove_citations: 60363 chars
After remove_urls: 60196 chars
After remove_emails: 60124 chars
After remove_numerical_references: 60124 chars
After remove_headers: 60124 chars
After remove_references: 60124 chars
After remove_met

Batch 1:  90%|█████████ | 9/10 [00:46<00:04,  4.80s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 684 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_genes-12-00147.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_genes-12-00147.txt
Percentage of words retained after cleaning: 96.19%

Processing file: nihms-1927912.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 11442
Starting preprocessing. Initial text length: 74437
After reassemble_hyphenated_words: 74421 chars
After remove_figures_tables: 74409 chars
After remove_citations: 71374 chars
After remove_urls: 71329 chars
After remove_emails: 71303 chars
After remove_numerical_references: 71303 chars
After remove_headers: 71303 chars
After remove_references: 71303 chars
After remove_metadata: 71303 chars
After remove_institut

Batch 1: 100%|██████████| 10/10 [00:52<00:00,  5.20s/it]


No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1212 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_nihms-1927912.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_nihms-1927912.txt
Percentage of words retained after cleaning: 95.74%
Completed processing batch 1

Processing batch 2 of 5


Batch 2:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: 13023_2021_Article_2145.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6776
Starting preprocessing. Initial text length: 44619
After reassemble_hyphenated_words: 44569 chars
After remove_figures_tables: 44569 chars
After remove_citations: 41942 chars
After remove_urls: 41770 chars
After remove_emails: 41752 chars
After remove_numerical_references: 41752 chars
After remove_headers: 41742 chars
After remove_references: 41742 chars
After remove_metadata: 41691 chars
After remove_institution_names: 41691 chars
After remove_copyright_info: 41521 chars
After remove_doi_and_journal_info: 41521 chars
After remove_artifacts: 40778 chars
After detect_sentence_boundaries: 40779 chars
After removing extra whitespace: 40779 chars
Text preprocessing complete.
Starting preprocessing. Initial text length: 44619
After reassemble_hyphenated_words: 44569 chars
After remove_figures_tables: 44569 chars
After remove_citation

Batch 2:  10%|█         | 1/10 [00:03<00:29,  3.22s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 620 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_13023_2021_Article_2145.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_13023_2021_Article_2145.txt
Percentage of words retained after cleaning: 91.12%

Processing file: jcm-12-06953.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 12975
Starting preprocessing. Initial text length: 83756
After reassemble_hyphenated_words: 83692 chars
After remove_figures_tables: 83692 chars
After remove_citations: 80369 chars
After remove_urls: 80284 chars
After remove_emails: 80257 chars
After remove_numerical_references: 80257 chars
After remove_headers: 80257 chars
After remove_references: 80257 chars
After remove_metadata: 80210 chars
Afte

Batch 2:  20%|██        | 2/10 [00:09<00:39,  4.88s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 304 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_jcm-12-06953.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_jcm-12-06953.txt
Percentage of words retained after cleaning: 95.71%

Processing file: IJO-70-2316.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 8517
Starting preprocessing. Initial text length: 57010
After reassemble_hyphenated_words: 57010 chars
After remove_figures_tables: 57010 chars
After remove_citations: 53836 chars
After remove_urls: 53697 chars
After remove_emails: 53630 chars
After remove_numerical_references: 53630 chars
After remove_headers: 53630 chars
After remove_references: 53630 chars
After remove_metadata: 53394 chars
After remove_institution_nam

Batch 2:  30%|███       | 3/10 [00:13<00:30,  4.42s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 526 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_IJO-70-2316.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_IJO-70-2316.txt
Percentage of words retained after cleaning: 91.69%

Processing file: TJO-53-44.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 9964
Starting preprocessing. Initial text length: 68253
After reassemble_hyphenated_words: 68229 chars
After remove_figures_tables: 68229 chars
After remove_citations: 65063 chars
After remove_urls: 65063 chars
After remove_emails: 65042 chars
After remove_numerical_references: 65042 chars
After remove_headers: 61548 chars
After remove_references: 61548 chars
After remove_metadata: 61159 chars
After remove_institution_names: 

Batch 2:  40%|████      | 4/10 [00:17<00:26,  4.49s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1538 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_TJO-53-44.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_TJO-53-44.txt
Percentage of words retained after cleaning: 88.18%

Processing file: biomolecules-12-00455-compressed.txt
Detected encoding: MacRoman (confidence: 0.68)
Successfully read file with MacRoman encoding.
Original word count: 15795
Starting preprocessing. Initial text length: 97326
After reassemble_hyphenated_words: 97128 chars
After remove_figures_tables: 97128 chars
After remove_citations: 95447 chars
After remove_urls: 95284 chars
After remove_emails: 95039 chars
After remove_numerical_references: 95039 chars
After remove_headers: 95039 chars
After remove_references: 95039 chars
After remove_metadata: 94991 chars
After

Batch 2:  50%|█████     | 5/10 [00:28<00:33,  6.63s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1819 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_biomolecules-12-00455-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_biomolecules-12-00455-compressed.txt
Percentage of words retained after cleaning: 97.31%

Processing file: 1-s2.0-S1350946223000447-main.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 37234
Starting preprocessing. Initial text length: 242026
After reassemble_hyphenated_words: 241561 chars
After remove_figures_tables: 241561 chars
After remove_citations: 234694 chars
After remove_urls: 234603 chars
After remove_emails: 234549 chars
After remove_numerical_references: 234549 chars
After remove_headers: 234549 chars
After remove_references: 234549 

Batch 2:  60%|██████    | 6/10 [00:48<00:45, 11.29s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1209 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_1-s2.0-S1350946223000447-main.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_1-s2.0-S1350946223000447-main.txt
Percentage of words retained after cleaning: 95.38%

Processing file: jmedgenet-2016-103837.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6831
Starting preprocessing. Initial text length: 43778
After reassemble_hyphenated_words: 43724 chars
After remove_figures_tables: 43629 chars
After remove_citations: 40275 chars
After remove_urls: 40163 chars
After remove_emails: 40134 chars
After remove_numerical_references: 40134 chars
After remove_headers: 40134 chars
After remove_references: 40134 chars
After remove_metad

Batch 2:  70%|███████   | 7/10 [00:51<00:25,  8.64s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1075 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_jmedgenet-2016-103837.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_jmedgenet-2016-103837.txt
Percentage of words retained after cleaning: 90.94%

Processing file: 41433_2022_Article_2262.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 3707
Starting preprocessing. Initial text length: 22967
After reassemble_hyphenated_words: 22945 chars
After remove_figures_tables: 22945 chars
After remove_citations: 21575 chars
After remove_urls: 21461 chars
After remove_emails: 21439 chars
After remove_numerical_references: 21439 chars
After remove_headers: 21439 chars
After remove_references: 21439 chars
After remove_metadata: 21351 cha

Batch 2:  80%|████████  | 8/10 [00:53<00:12,  6.43s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 508 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_41433_2022_Article_2262.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_41433_2022_Article_2262.txt
Percentage of words retained after cleaning: 91.99%

Processing file: emss-80329.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 15268
Starting preprocessing. Initial text length: 100521
After reassemble_hyphenated_words: 100472 chars
After remove_figures_tables: 100472 chars
After remove_citations: 87604 chars
After remove_urls: 87604 chars
After remove_emails: 87585 chars
After remove_numerical_references: 87585 chars
After remove_headers: 87585 chars
After remove_references: 87585 chars
After remove_metadata: 87585 chars
Aft

Batch 2:  90%|█████████ | 9/10 [01:03<00:07,  7.55s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 809 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_emss-80329.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_emss-80329.txt
Percentage of words retained after cleaning: 85.56%

Processing file: nihms-1747988.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 2276
Starting preprocessing. Initial text length: 14644
After reassemble_hyphenated_words: 14642 chars
After remove_figures_tables: 14642 chars
After remove_citations: 13714 chars
After remove_urls: 13679 chars
After remove_emails: 13652 chars
After remove_numerical_references: 13652 chars
After remove_headers: 13652 chars
After remove_references: 13652 chars
After remove_metadata: 13652 chars
After remove_institution_names

Batch 2: 100%|██████████| 10/10 [01:04<00:00,  6.46s/it]


No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1658 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_nihms-1747988.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_nihms-1747988.txt
Percentage of words retained after cleaning: 92.93%
Completed processing batch 2

Processing batch 3 of 5


Batch 3:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: TJP-600-4623.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 7533
Starting preprocessing. Initial text length: 49627
After reassemble_hyphenated_words: 49579 chars
After remove_figures_tables: 49573 chars
After remove_citations: 48439 chars
After remove_urls: 47978 chars
After remove_emails: 47948 chars
After remove_numerical_references: 47948 chars
After remove_headers: 47948 chars
After remove_references: 47948 chars
After remove_metadata: 47948 chars
After remove_institution_names: 47948 chars
After remove_copyright_info: 46115 chars
After remove_doi_and_journal_info: 46115 chars
After remove_artifacts: 46067 chars
After detect_sentence_boundaries: 46082 chars
After removing extra whitespace: 46082 chars
Text preprocessing complete.
Starting preprocessing. Initial text length: 49627
After reassemble_hyphenated_words: 49579 chars
After remove_figures_tables: 49573 chars
After remove_citations: 48439 ch

Batch 3:  10%|█         | 1/10 [00:03<00:30,  3.41s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1239 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_TJP-600-4623.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_TJP-600-4623.txt
Percentage of words retained after cleaning: 91.50%

Processing file: main (1).txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6723
Starting preprocessing. Initial text length: 44004
After reassemble_hyphenated_words: 43486 chars
After remove_figures_tables: 43486 chars
After remove_citations: 41181 chars
After remove_urls: 41097 chars
After remove_emails: 41069 chars
After remove_numerical_references: 41069 chars
After remove_headers: 39457 chars
After remove_references: 39457 chars
After remove_metadata: 39457 chars
After remove_institution_names

Batch 3:  20%|██        | 2/10 [00:06<00:25,  3.20s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 684 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_main (1).txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_main (1).txt
Percentage of words retained after cleaning: 88.12%

Processing file: ijms-22-04534.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 13533
Starting preprocessing. Initial text length: 86416
After reassemble_hyphenated_words: 86296 chars
After remove_figures_tables: 86296 chars
After remove_citations: 84017 chars
After remove_urls: 83900 chars
After remove_emails: 83573 chars
After remove_numerical_references: 83573 chars
After remove_headers: 83573 chars
After remove_references: 83573 chars
After remove_metadata: 83527 chars
After remove_institution_names: 8

Batch 3:  30%|███       | 3/10 [00:12<00:32,  4.59s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 939 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_ijms-22-04534.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_ijms-22-04534.txt
Percentage of words retained after cleaning: 96.35%

Processing file: 41436_2020_Article_759.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6909
Starting preprocessing. Initial text length: 44036
After reassemble_hyphenated_words: 43988 chars
After remove_figures_tables: 43988 chars
After remove_citations: 41017 chars
After remove_urls: 40797 chars
After remove_emails: 40764 chars
After remove_numerical_references: 40764 chars
After remove_headers: 40764 chars
After remove_references: 40764 chars
After remove_metadata: 40705 chars
After remove_in

Batch 3:  40%|████      | 4/10 [00:15<00:24,  4.02s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 851 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_41436_2020_Article_759.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_41436_2020_Article_759.txt
Percentage of words retained after cleaning: 91.61%

Processing file: JCO-34-80.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6033
Starting preprocessing. Initial text length: 35532
After reassemble_hyphenated_words: 35532 chars
After remove_figures_tables: 35527 chars
After remove_citations: 33828 chars
After remove_urls: 33805 chars
After remove_emails: 33741 chars
After remove_numerical_references: 33741 chars
After remove_headers: 33741 chars
After remove_references: 33741 chars
After remove_metadata: 33734 chars
After remo

Batch 3:  50%|█████     | 5/10 [00:18<00:17,  3.52s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1014 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_JCO-34-80.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_JCO-34-80.txt
Percentage of words retained after cleaning: 94.63%

Processing file: opth-16-1127.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6378
Starting preprocessing. Initial text length: 46873
After reassemble_hyphenated_words: 46857 chars
After remove_figures_tables: 46844 chars
After remove_citations: 44304 chars
After remove_urls: 43258 chars
After remove_emails: 43237 chars
After remove_numerical_references: 43237 chars
After remove_headers: 43237 chars
After remove_references: 43237 chars
After remove_metadata: 43188 chars
After remove_institution_names: 

Batch 3:  60%|██████    | 6/10 [00:21<00:13,  3.32s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1265 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_opth-16-1127.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_opth-16-1127.txt
Percentage of words retained after cleaning: 89.78%

Processing file: MGG3-9-e1663.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6325
Starting preprocessing. Initial text length: 39845
After reassemble_hyphenated_words: 39668 chars
After remove_figures_tables: 39668 chars
After remove_citations: 39083 chars
After remove_urls: 38381 chars
After remove_emails: 38355 chars
After remove_numerical_references: 38355 chars
After remove_headers: 38355 chars
After remove_references: 38355 chars
After remove_metadata: 38331 chars
After remove_institution_n

Batch 3:  70%|███████   | 7/10 [00:24<00:09,  3.18s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 480 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_MGG3-9-e1663.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_MGG3-9-e1663.txt
Percentage of words retained after cleaning: 93.60%

Processing file: main copy.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 7299
Starting preprocessing. Initial text length: 45174
After reassemble_hyphenated_words: 45092 chars
After remove_figures_tables: 45092 chars
After remove_citations: 42909 chars
After remove_urls: 42684 chars
After remove_emails: 42656 chars
After remove_numerical_references: 42656 chars
After remove_headers: 42656 chars
After remove_references: 42656 chars
After remove_metadata: 42623 chars
After remove_institution_names

Batch 3:  80%|████████  | 8/10 [00:27<00:06,  3.28s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 577 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_main copy.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_main copy.txt
Percentage of words retained after cleaning: 93.56%

Processing file: 1-s2.0-S1350946221000367-main.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 22884
Starting preprocessing. Initial text length: 151073
After reassemble_hyphenated_words: 150788 chars
After remove_figures_tables: 150788 chars
After remove_citations: 147811 chars
After remove_urls: 142329 chars
After remove_emails: 142306 chars
After remove_numerical_references: 142306 chars
After remove_headers: 142306 chars
After remove_references: 142306 chars
After remove_metadata: 142305 chars
After

Batch 3:  90%|█████████ | 9/10 [00:41<00:06,  6.41s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1161 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_1-s2.0-S1350946221000367-main.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_1-s2.0-S1350946221000367-main.txt
Percentage of words retained after cleaning: 91.60%

Processing file: fphar-12-654445.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 7165
Starting preprocessing. Initial text length: 46833
After reassemble_hyphenated_words: 46807 chars
After remove_figures_tables: 46807 chars
After remove_citations: 45035 chars
After remove_urls: 44883 chars
After remove_emails: 44847 chars
After remove_numerical_references: 44847 chars
After remove_headers: 44847 chars
After remove_references: 44847 chars
After remove_metadata: 4

Batch 3: 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]


No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1598 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_fphar-12-654445.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_fphar-12-654445.txt
Percentage of words retained after cleaning: 93.90%
Completed processing batch 3

Processing batch 4 of 5


Batch 4:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: nihpp-rs3011096v1-compressed.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 15938
Starting preprocessing. Initial text length: 99071
After reassemble_hyphenated_words: 99031 chars
After remove_figures_tables: 99031 chars
After remove_citations: 98527 chars
After remove_urls: 96934 chars
After remove_emails: 96914 chars
After remove_numerical_references: 96914 chars
After remove_headers: 96914 chars
After remove_references: 96914 chars
After remove_metadata: 96908 chars
After remove_institution_names: 96908 chars
After remove_copyright_info: 96908 chars
After remove_doi_and_journal_info: 96908 chars
After remove_artifacts: 96568 chars
After detect_sentence_boundaries: 96572 chars
After removing extra whitespace: 96572 chars
Text preprocessing complete.
Starting preprocessing. Initial text length: 99071
After reassemble_hyphenated_words: 99031 chars
After remove_figures_tables: 99031 chars
After remove_ci

Batch 4:  10%|█         | 1/10 [00:07<01:09,  7.71s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1789 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_nihpp-rs3011096v1-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_nihpp-rs3011096v1-compressed.txt
Percentage of words retained after cleaning: 97.82%

Processing file: nihms-1914935.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 13403
Starting preprocessing. Initial text length: 83954
After reassemble_hyphenated_words: 83928 chars
After remove_figures_tables: 83915 chars
After remove_citations: 81766 chars
After remove_urls: 81469 chars
After remove_emails: 81455 chars
After remove_numerical_references: 81455 chars
After remove_headers: 81455 chars
After remove_references: 81455 chars
After remove_metadata: 8145

Batch 4:  20%|██        | 2/10 [00:14<00:56,  7.10s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1571 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_nihms-1914935.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_nihms-1914935.txt
Percentage of words retained after cleaning: 96.96%

Processing file: genes-14-00074.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 12865
Starting preprocessing. Initial text length: 76443
After reassemble_hyphenated_words: 76353 chars
After remove_figures_tables: 76353 chars
After remove_citations: 74421 chars
After remove_urls: 73999 chars
After remove_emails: 73978 chars
After remove_numerical_references: 73978 chars
After remove_headers: 73978 chars
After remove_references: 73978 chars
After remove_metadata: 73926 chars
After remove_institut

Batch 4:  30%|███       | 3/10 [00:20<00:45,  6.54s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 528 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_genes-14-00074.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_genes-14-00074.txt
Percentage of words retained after cleaning: 96.69%

Processing file: Acta Ophthalmologica - 2019 - Holtan - Inherited retinal disease in Norway   a characterization of current clinical and.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 8040
Starting preprocessing. Initial text length: 52222
After reassemble_hyphenated_words: 52002 chars
After remove_figures_tables: 51997 chars
After remove_citations: 50819 chars
After remove_urls: 49818 chars
After remove_emails: 49787 chars
After remove_numerical_references: 49787 chars
After remove_headers: 

Batch 4:  40%|████      | 4/10 [00:24<00:32,  5.44s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 833 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_Acta Ophthalmologica - 2019 - Holtan - Inherited retinal disease in Norway   a characterization of current clinical and.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_Acta Ophthalmologica - 2019 - Holtan - Inherited retinal disease in Norway   a characterization of current clinical and.txt
Percentage of words retained after cleaning: 93.37%

Processing file: ijms-22-07207.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 9966
Starting preprocessing. Initial text length: 63806
After reassemble_hyphenated_words: 63734 chars
After remove_figures_tables: 63734 chars
After remove_citations: 61915 chars
After remove_urls: 61562 char

Batch 4:  50%|█████     | 5/10 [00:31<00:30,  6.18s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 721 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_ijms-22-07207.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_ijms-22-07207.txt
Percentage of words retained after cleaning: 95.50%

Processing file: cells-12-02579-compressed.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 16218
Starting preprocessing. Initial text length: 103611
After reassemble_hyphenated_words: 103323 chars
After remove_figures_tables: 103323 chars
After remove_citations: 101045 chars
After remove_urls: 100124 chars
After remove_emails: 100088 chars
After remove_numerical_references: 100088 chars
After remove_headers: 100088 chars
After remove_references: 100088 chars
After remove_metadata: 100031 chars
A

Batch 4:  60%|██████    | 6/10 [00:38<00:26,  6.63s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 268 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_cells-12-02579-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_cells-12-02579-compressed.txt
Percentage of words retained after cleaning: 95.57%

Processing file: nihms-1567493.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 5719
Starting preprocessing. Initial text length: 37869
After reassemble_hyphenated_words: 37857 chars
After remove_figures_tables: 37857 chars
After remove_citations: 34796 chars
After remove_urls: 34796 chars
After remove_emails: 34776 chars
After remove_numerical_references: 34776 chars
After remove_headers: 34776 chars
After remove_references: 34776 chars
After remove_metadata: 34776 chars


Batch 4:  70%|███████   | 7/10 [00:41<00:16,  5.35s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 735 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_nihms-1567493.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_nihms-1567493.txt
Percentage of words retained after cleaning: 91.27%

Processing file: NRR-18-701.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 10427
Starting preprocessing. Initial text length: 69425
After reassemble_hyphenated_words: 69374 chars
After remove_figures_tables: 69374 chars
After remove_citations: 66934 chars
After remove_urls: 66630 chars
After remove_emails: 66576 chars
After remove_numerical_references: 66576 chars
After remove_headers: 66576 chars
After remove_references: 66576 chars
After remove_metadata: 66576 chars
After remove_institution_n

Batch 4:  80%|████████  | 8/10 [00:46<00:10,  5.29s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 588 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_NRR-18-701.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_NRR-18-701.txt
Percentage of words retained after cleaning: 94.21%

Processing file: 13023_2023_Article_2798.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 9875
Starting preprocessing. Initial text length: 65921
After reassemble_hyphenated_words: 65855 chars
After remove_figures_tables: 65842 chars
After remove_citations: 60900 chars
After remove_urls: 60793 chars
After remove_emails: 60763 chars
After remove_numerical_references: 60763 chars
After remove_headers: 60228 chars
After remove_references: 60228 chars
After remove_metadata: 60179 chars
After remove_institu

Batch 4:  90%|█████████ | 9/10 [00:51<00:05,  5.10s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 736 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_13023_2023_Article_2798.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_13023_2023_Article_2798.txt
Percentage of words retained after cleaning: 88.61%

Processing file: PIIS0039625723001030.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 16619
Starting preprocessing. Initial text length: 103465
After reassemble_hyphenated_words: 103262 chars
After remove_figures_tables: 103262 chars
After remove_citations: 94979 chars
After remove_urls: 94366 chars
After remove_emails: 94347 chars
After remove_numerical_references: 94347 chars
After remove_headers: 94347 chars
After remove_references: 94347 chars
After remove_metadata: 94347

Batch 4: 100%|██████████| 10/10 [00:58<00:00,  5.89s/it]


No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1301 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_PIIS0039625723001030.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_PIIS0039625723001030.txt
Percentage of words retained after cleaning: 88.45%
Completed processing batch 4

Processing batch 5 of 5


Batch 5:   0%|          | 0/10 [00:00<?, ?it/s]


Processing file: diagnostics-13-00850.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 6901
Starting preprocessing. Initial text length: 41694
After reassemble_hyphenated_words: 41670 chars
After remove_figures_tables: 41670 chars
After remove_citations: 40784 chars
After remove_urls: 40559 chars
After remove_emails: 40525 chars
After remove_numerical_references: 40525 chars
After remove_headers: 40525 chars
After remove_references: 40525 chars
After remove_metadata: 40473 chars
After remove_institution_names: 40473 chars
After remove_copyright_info: 40440 chars
After remove_doi_and_journal_info: 40440 chars
After remove_artifacts: 39867 chars
After detect_sentence_boundaries: 39870 chars
After removing extra whitespace: 39870 chars
Text preprocessing complete.
Starting preprocessing. Initial text length: 41694
After reassemble_hyphenated_words: 41670 chars
After remove_figures_tables: 41670 chars
After remove_citations: 

Batch 5:  10%|█         | 1/10 [00:03<00:29,  3.26s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 434 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_diagnostics-13-00850.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_diagnostics-13-00850.txt
Percentage of words retained after cleaning: 97.07%

Processing file: nihms880229.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 13744
Starting preprocessing. Initial text length: 86083
After reassemble_hyphenated_words: 86059 chars
After remove_figures_tables: 86053 chars
After remove_citations: 81853 chars
After remove_urls: 81796 chars
After remove_emails: 81775 chars
After remove_numerical_references: 81775 chars
After remove_headers: 81775 chars
After remove_references: 81775 chars
After remove_metadata: 81775 chars
After remov

Batch 5:  20%|██        | 2/10 [00:11<00:51,  6.44s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 213 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_nihms880229.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_nihms880229.txt
Percentage of words retained after cleaning: 95.07%

Processing file: 41598_2021_Article_81093.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 10654
Starting preprocessing. Initial text length: 65135
After reassemble_hyphenated_words: 65089 chars
After remove_figures_tables: 65084 chars
After remove_citations: 64254 chars
After remove_urls: 62738 chars
After remove_emails: 62738 chars
After remove_numerical_references: 62738 chars
After remove_headers: 62738 chars
After remove_references: 62738 chars
After remove_metadata: 62737 chars
After remove_ins

Batch 5:  30%|███       | 3/10 [00:16<00:40,  5.80s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1289 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_41598_2021_Article_81093.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_41598_2021_Article_81093.txt
Percentage of words retained after cleaning: 95.97%

Processing file: main.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 11965
Starting preprocessing. Initial text length: 75891
After reassemble_hyphenated_words: 75643 chars
After remove_figures_tables: 75637 chars
After remove_citations: 75287 chars
After remove_urls: 74847 chars
After remove_emails: 74847 chars
After remove_numerical_references: 74847 chars
After remove_headers: 74847 chars
After remove_references: 74847 chars
After remove_metadata: 74776 chars
After rem

Batch 5:  40%|████      | 4/10 [00:22<00:34,  5.75s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1442 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_main.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_main.txt
Percentage of words retained after cleaning: 97.66%

Processing file: nihms-1933615.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 4380
Starting preprocessing. Initial text length: 29775
After reassemble_hyphenated_words: 29769 chars
After remove_figures_tables: 29769 chars
After remove_citations: 27504 chars
After remove_urls: 27384 chars
After remove_emails: 27368 chars
After remove_numerical_references: 27368 chars
After remove_headers: 27341 chars
After remove_references: 27341 chars
After remove_metadata: 27341 chars
After remove_institution_names: 27245 cha

Batch 5:  50%|█████     | 5/10 [00:24<00:21,  4.40s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 1054 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_nihms-1933615.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_nihms-1933615.txt
Percentage of words retained after cleaning: 90.91%

Processing file: nihms-1685213.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 11729
Starting preprocessing. Initial text length: 77452
After reassemble_hyphenated_words: 77419 chars
After remove_figures_tables: 77419 chars
After remove_citations: 65751 chars
After remove_urls: 65675 chars
After remove_emails: 65647 chars
After remove_numerical_references: 65647 chars
After remove_headers: 65647 chars
After remove_references: 65647 chars
After remove_metadata: 65647 chars
After remove_instituti

Batch 5:  60%|██████    | 6/10 [00:29<00:18,  4.65s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 877 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_nihms-1685213.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_nihms-1685213.txt
Percentage of words retained after cleaning: 83.90%

Processing file: diagnostics-13-02413-compressed.txt
Detected encoding: MacRoman (confidence: 0.68)
Successfully read file with MacRoman encoding.
Original word count: 18503
Starting preprocessing. Initial text length: 116448
After reassemble_hyphenated_words: 116288 chars
After remove_figures_tables: 116288 chars
After remove_citations: 113336 chars
After remove_urls: 113191 chars
After remove_emails: 113024 chars
After remove_numerical_references: 113024 chars
After remove_headers: 113024 chars
After remove_references: 113024 chars
After remove_metadata: 11

Batch 5:  70%|███████   | 7/10 [00:38<00:17,  5.93s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 539 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_diagnostics-13-02413-compressed.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_diagnostics-13-02413-compressed.txt
Percentage of words retained after cleaning: 96.63%

Processing file: 41525_2021_Article_180.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 7141
Starting preprocessing. Initial text length: 45896
After reassemble_hyphenated_words: 45844 chars
After remove_figures_tables: 45844 chars
After remove_citations: 45430 chars
After remove_urls: 44853 chars
After remove_emails: 44838 chars
After remove_numerical_references: 44838 chars
After remove_headers: 44838 chars
After remove_references: 44838 chars
After remove_m

Batch 5:  80%|████████  | 8/10 [00:41<00:10,  5.15s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 692 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_41525_2021_Article_180.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_41525_2021_Article_180.txt
Percentage of words retained after cleaning: 96.89%

Processing file: fgene-13-858556.txt
Detected encoding: utf-8 (confidence: 0.99)
Successfully read file with utf-8 encoding.
Original word count: 7409
Starting preprocessing. Initial text length: 47596
After reassemble_hyphenated_words: 47586 chars
After remove_figures_tables: 47586 chars
After remove_citations: 46038 chars
After remove_urls: 45886 chars
After remove_emails: 45844 chars
After remove_numerical_references: 45844 chars
After remove_headers: 45844 chars
After remove_references: 45844 chars
After remove_metadata: 45760 chars
Afte

Batch 5:  90%|█████████ | 9/10 [00:45<00:04,  4.67s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 639 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_fgene-13-858556.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_fgene-13-858556.txt
Percentage of words retained after cleaning: 94.87%

Processing file: biomolecules-13-00271.txt
Detected encoding: MacRoman (confidence: 0.68)
Successfully read file with MacRoman encoding.
Original word count: 34716
Starting preprocessing. Initial text length: 218234
After reassemble_hyphenated_words: 218060 chars
After remove_figures_tables: 218060 chars
After remove_citations: 210407 chars
After remove_urls: 210251 chars
After remove_emails: 210233 chars
After remove_numerical_references: 210233 chars
After remove_headers: 210233 chars
After remove_references: 210233 chars
After remove_metadata: 210182 c

Batch 5: 100%|██████████| 10/10 [01:04<00:00,  6.42s/it]

No start heading detected, assuming paragraph 0 as the start of the abstract.
Taking first 3 paragraphs as abstract since no end target found.
Extracted abstract: 507 characters
Abstract saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/abstract_biomolecules-13-00271.txt
Cleaned text saved to: /mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed/preprocessed_biomolecules-13-00271.txt
Percentage of words retained after cleaning: 96.62%
Completed processing batch 5
All batches processed.
Preprocessing complete.



