In [41]:
# Import libraries
import spacy
from nltk.corpus import stopwords
from collections import Counter
import nltk
from scispacy.linking import EntityLinker
import scispacy
from tqdm import tqdm
import re
import unicodedata
import os
import logging
import chardet
from collections import OrderedDict
import pandas as pd


%matplotlib inline
import matplotlib.pyplot as plt

In [42]:
# nltk.download('punkt')
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_sci_md")
nlp.add_pipe("scispacy_linker", last=True)

print("Setup completed.")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Setup completed.


In [43]:
def extract_entities_with_links(text):
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        # Get the linked concepts for each entity
        for umls_ent in ent._.kb_ents:
            concept_id, score = umls_ent
            entities.append({
                "text": ent.text,
                "label": ent.label_,
                "concept_id": concept_id,
                "score": score
            })
    return entities

## **Text Preprocessing**

## **Text Cleaning**

In [44]:
# reassemble_hyphenated_words
def reassemble_hyphenated_words(text):
    return re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', text)

def remove_urls(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    return text


def remove_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    return re.sub(email_pattern, '', text, flags=re.IGNORECASE)

def remove_figures_tables(text):
    return re.sub(r'\b(figures?|tables?)\b', '', text)

def remove_numerical_references(text):
    return re.sub(r'\[\d+\]', '', text)

def remove_citations(text):
    patterns = [
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+[A-Z][a-z]+\s+et\s+al\.',
        r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\s+et\s+al\.',
        r'\b[A-Z][a-z]+\s+et\s+al\.',
        r'\(.*?et al\..*?\d{4}.*?\)',
        r'\[.*?\]',
        r'\(\d{4}[a-z]?(?:,\s*\d{4}[a-z]?)*\)',
        r'^.*?\d{4};.*?:\s*\d+.*?$',
        r'^.*?\d{4};.*?:\s*\d+.*?$',  # Matches journal info like "2024;258: 119– 129"
    ]

    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    return text

def remove_headers(text):
    # Remove lines that are all uppercase and end with a colon
    text = re.sub(r'^[A-Z\s]+:$', '', text, flags=re.MULTILINE)
    # Remove lines that start with bullet points
    text = re.sub(r'^\s*•.*$', '', text, flags=re.MULTILINE)
    return text

def remove_metadata(text):
    # Remove headers, copyright info, DOI, received/accepted dates
    patterns = [
        r'^.*?©Copyright.*$',
        r'^DOI:.*$',
        r'^Received:.*$',
        r'^Accepted:.*$',
        r'^Address for Correspondence:.*$',
        r'^E-mail:.*$',
        r'^ORCID-ID:.*$',
        r'^\s*\d+\s*$',  # Page numbers
        r'^.*?ORCID:.*$',
        r'^Cite this article as:.*$',
        r'\[\s*[^\w\s]*\s*\]'
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text

# def remove_institution_names(text):
#     # Remove institution names (this is a simplified approach and may need refinement)
#     pattern = r'\*+[A-Z][A-Za-z\s,]+(University|Institute|Hospital|Department|Faculty)[^\n]*'
#     return re.sub(pattern, '', text, flags=re.MULTILINE)

def handle_special_characters(text):
    """Handle special characters and Unicode normalization."""
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    char_map = {
        '´': "'", '‘': "'", '’': "'", '“': '"', '”': '"', '–': '-', '—': '-', '…': '...'
    }
    for char, replacement in char_map.items():
        text = text.replace(char, replacement)
    return text

def remove_copyright_info(text):
    patterns = [
        r'^©.*$',
        r'Copyright.*$',
        r'This is an open access article.*$',
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE | re.IGNORECASE)
    return text

def remove_doi_and_journal_info(text):
    patterns = [
        r'DOI:.*$',
        r'^.*?\d{4};\d+:\d+–\d+',  # Matches journal info like "2024;258: 119– 129"
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text, flags=re.MULTILINE)
    return text

def remove_artifacts(text):
    # Remove license and DOI information
    text = re.sub(r'BY license \(.*?\)\..*?commons\.org/licenses/by/\d\.\d/\s*\)\.', '', text)
    text = re.sub(r'://doi\.org/\d+\.\d+/[^\s]+', '', text )
    text = re.sub(r'://creativecommons\.org/licenses/by/\d\.\d/', '', text)
    text = re.sub(r'://creativecommons\.org/licenses/by-\w+/\d\.\d/', '', text)
    
    
    # Remove unnecessary symbols
    text = re.sub(r'[⁎\]]', '', text)
    
    # Remove empty parentheses and brackets
    text = re.sub(r'\(\s*\)|\[\s*\]', '', text)
    
    # Remove isolated semicolons
    text = re.sub(r'\s*;\s*', ' ', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    return text


def remove_references(text):
    """
    Remove the references section from the text. This function looks for the word 'References'
    followed by '1.' and removes everything from that point onward.
    """
    # Pattern to match 'References' followed by '1.' 
    reference_pattern = r'\bReferences\s+1\.\s'

    # Search for the pattern in the text
    match = re.search(reference_pattern, text, re.IGNORECASE)
    
    if match:
        # If 'References 1.' is found, remove everything from that point onward
        return text[:match.start()].strip()
    
    # If no references section is found, return the original text
    return text


def detect_sentence_boundaries(text):
    """
    Detect sentence boundaries using spaCy.
    """
    doc = nlp(text)
    spacy_sentences = [sent.text for sent in doc.sents]
    return ' '.join(spacy_sentences)

In [45]:
def count_words(text):
    if not isinstance(text, str):
        return 0
    
    # Remove punctuation except for hyphens within words and apostrophes
    text = re.sub(r'[^\w\s\'-]|(?<!\w)[-\']|[-\'](?!\w)', '', text)
    
    # Split on whitespace and count non-empty words (including numbers)
    words = [word for word in text.split() if word]
    
    return len(words)

def preprocess_text(text):
    try:
        # Input validation
        if not isinstance(text, (str, bytes)):
            raise TypeError('Expected a string or bytes-like object')

        print(f"Starting preprocessing...")
        initial_word_count = count_words(text)
        print(f"Initial word count: {initial_word_count}")

        # Step 1: Initial text cleaning
        cleaned_text = reassemble_hyphenated_words(text)
        print(f"After reassemble_hyphenated_words: {count_words(cleaned_text)} words")

        cleaned_text = remove_figures_tables(cleaned_text)
        print(f"After remove_figures_tables: {count_words(cleaned_text)} words")

        cleaned_text = remove_citations(cleaned_text)
        print(f"After remove_citations: {count_words(cleaned_text)} words")

        cleaned_text = remove_urls(cleaned_text)
        print(f"After remove_urls: {count_words(cleaned_text)} words")

        cleaned_text = remove_emails(cleaned_text)
        print(f"After remove_emails: {count_words(cleaned_text)} words")

        cleaned_text = remove_numerical_references(cleaned_text)
        print(f"After remove_numerical_references: {count_words(cleaned_text)} words")

        cleaned_text = remove_headers(cleaned_text)
        print(f"After remove_headers: {count_words(cleaned_text)} words")

        cleaned_text = remove_metadata(cleaned_text)
        print(f"After remove_metadata: {count_words(cleaned_text)} words")

        cleaned_text = remove_copyright_info(cleaned_text)
        print(f"After remove_copyright_info: {count_words(cleaned_text)} words")

        cleaned_text = remove_doi_and_journal_info(cleaned_text)
        print(f"After remove_doi_and_journal_info: {count_words(cleaned_text)} words")

        cleaned_text = remove_artifacts(cleaned_text)
        print(f"After remove_artifacts: {count_words(cleaned_text)} words")

        # Remove extra whitespace
        cleaned_text = ' '.join(cleaned_text.split())
        print(f"After removing extra whitespace: {count_words(cleaned_text)} words")

        cleaned_text = handle_special_characters(cleaned_text)
        print(f"After handle_special_characters: {count_words(cleaned_text)} words")

        cleaned_text = remove_references(cleaned_text)
        print(f"After remove_references: {count_words(cleaned_text)} words")

        cleaned_text = detect_sentence_boundaries(cleaned_text)
        print(f"After detect_sentence_boundaries: {count_words(cleaned_text)} words")

        final_word_count = count_words(cleaned_text)
        print(f"Final preprocessed word count: {final_word_count}")

        if initial_word_count > 0:
            percentage_retained = (final_word_count / initial_word_count) * 100
            print(f"Percentage of words retained after cleaning: {percentage_retained:.2f}%")
        else:
            print("Original text contains no words; cannot calculate percentage.")

        return cleaned_text

    except TypeError as te:
        logging.error(f"TypeError in preprocess_text: {te}")
        return None
    except Exception as e:
        logging.error(f"Error in preprocess_text: {e}")
        return None

In [46]:
def extract_abstract_and_main_text(cleaned_text):
    try:
        abstract_start_patterns = [
            r'\b(Abstract|Synopsis|Summary|Overview)\b:?',
            r'^\s*(Background|Purpose|Objective|Aim)\b:?',
            r'^\s*[A-Z][^.!?]+(?=[.!?])\s'  # Captures the first sentence if it's capitalized
        ]
        abstract_end_patterns = [
            r'\b(Keywords|Key words|Index terms)\b:?',
            r'\n{2,}\s*(Introduction|Main Text|Methods|Methodology|Materials)\b:?'
        ]
        
        abstract_start = None
        abstract_end = None

        # Search for the start of the abstract
        for pattern in abstract_start_patterns:
            match = re.search(pattern, cleaned_text, re.IGNORECASE | re.MULTILINE)
            if match:
                abstract_start = match.start()
                break

        # If we cannot determine the start of the abstract, return empty abstract
        if abstract_start is None:
            return "", cleaned_text

        # Search for the end of the abstract
        for pattern in abstract_end_patterns:
            match = re.search(pattern, cleaned_text[abstract_start:], re.IGNORECASE | re.MULTILINE)
            if match:
                abstract_end = abstract_start + match.start()
                break

        # If no clear end is found, use structural heuristics
        if abstract_end is None:
            paragraphs = cleaned_text[abstract_start:].split('\n\n')
            if len(paragraphs) > 1:
                abstract_end = abstract_start + len(paragraphs[0])
            else:
                # Fallback to a maximum word limit
                words = cleaned_text[abstract_start:].split()
                abstract_end = abstract_start + len(' '.join(words[:300]))

        # Extract the abstract content
        abstract = cleaned_text[abstract_start:abstract_end].strip()

        # Ensure the abstract is between 50 and 500 words
        abstract_words = abstract.split()
        if len(abstract_words) < 50:
            return "", cleaned_text  # Return empty abstract if too short
        elif len(abstract_words) > 500:
            abstract = ' '.join(abstract_words[:500])
            abstract_end = abstract_start + len(abstract)

        # Extract the main text starting from the end of the abstract
        main_text = cleaned_text[abstract_end:].strip()

        return abstract, main_text

    except Exception as e:
        logging.error(f"Error in extract_abstract_and_main_text: {e}")
        return "", cleaned_text  # Return empty abstract and full text as main_text in case of any error

In [47]:
def process_files(input_folder_path: str, output_folder_path: str, entities_path: str, batch_size: int = 10) -> None:
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    if not os.path.exists(entities_path):
        os.makedirs(entities_path)

    files = [f for f in os.listdir(input_folder_path) if f.endswith('.txt')]
    total_files = len(files)

    all_entities = []

    for i in range(0, total_files, batch_size):
        batch = files[i:i+batch_size]
        print(f"\nProcessing batch {i//batch_size + 1} of {(total_files-1)//batch_size + 1}")

        for filename in tqdm(batch, desc=f"Batch {i//batch_size + 1}"):
            input_file_path = os.path.join(input_folder_path, filename)
            output_file_path = os.path.join(output_folder_path, f"processed_{filename}")

            print(f"\nProcessing file: {filename}")

            # Corrected section in process_files
            try:
                with open(input_file_path, 'rb') as file:
                    raw_data = file.read()
                    result = chardet.detect(raw_data)
                    detected_encoding = result['encoding']
                    confidence = result['confidence']

                    print(f"Detected encoding: {detected_encoding} with confidence: {confidence}")

                # Fallback to utf-8 if encoding is not detected or confidence is low
                encoding_to_use = detected_encoding if detected_encoding else 'utf-8'

                with open(input_file_path, 'r', encoding=encoding_to_use) as file:
                    original_text = file.read()

                # Ensure the text is correctly read as a string
                if not isinstance(original_text, str):
                    raise ValueError("File content is not a valid string.")

                # Extract abstract and main text
                abstract, main_text = extract_abstract_and_main_text(original_text)

                # Preprocess the main text
                main_text = preprocess_text(main_text)

                # Extract entities
                entities = extract_entities_with_links(main_text)
                all_entities.extend(entities)

                # Prepare output text
                output_text = f"Abstract:\n{abstract}\n\nMain Text:\n{main_text}"

                # Write the cleaned text and abstract to the output file
                with open(output_file_path, 'w', encoding='utf-8') as file:
                    file.write(output_text)

            except Exception as e:
                logging.error(f"Error processing file {filename}: {str(e)}")

    # Save the extracted entities to a CSV file
    entities_df = pd.DataFrame(all_entities)
    entities_df.to_csv(os.path.join(entities_path, 'extracted_entities.csv'), index=False)

In [48]:
def identify_frequent_terms(text, n=10, min_length=3):

    # Tokenize the text into words
    words = re.findall(r'\b[a-zA-Z]{' + str(min_length) + r',}\b', text.lower())
    
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    
    # Count word frequencies
    word_freq = Counter(words)
    
    # Return the n most common words with their counts
    return word_freq.most_common(n)

In [49]:
if __name__ == "__main__":
    # input_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/txt_data"
    input_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/data_text_sample"
    output_folder = "/mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed"
    entities_path = "/mnt/data/skanda/MSc_IRD_LLM/data/Entities/"
    process_files(input_folder, output_folder, entities_path, batch_size=10)
    print("Preprocessing and entity extraction complete.")


Processing batch 1 of 1


Batch 1:   0%|          | 0/6 [00:00<?, ?it/s]


Processing file: 13287_2023_Article_3526.txt
Detected encoding: utf-8 with confidence: 0.99
Starting preprocessing...
Initial word count: 6850
After reassemble_hyphenated_words: 6826 words
After remove_figures_tables: 6826 words
After remove_citations: 6160 words
After remove_urls: 6153 words
After remove_emails: 6152 words
After remove_numerical_references: 6152 words
After remove_headers: 6152 words
After remove_metadata: 6144 words
After remove_copyright_info: 6118 words
After remove_doi_and_journal_info: 6118 words
After remove_artifacts: 6126 words
After removing extra whitespace: 6126 words
After handle_special_characters: 6126 words
After remove_references: 4171 words


After detect_sentence_boundaries: 4174 words
Final preprocessed word count: 4174
Percentage of words retained after cleaning: 60.93%


Batch 1:  17%|█▋        | 1/6 [00:01<00:09,  1.84s/it]


Processing file: 12886_2023_Article_2772-compressed.txt
Detected encoding: utf-8 with confidence: 0.99
Starting preprocessing...
Initial word count: 4410
After reassemble_hyphenated_words: 4399 words
After remove_figures_tables: 4399 words
After remove_citations: 4202 words
After remove_urls: 4201 words
After remove_emails: 4200 words
After remove_numerical_references: 4200 words
After remove_headers: 4200 words
After remove_metadata: 4192 words
After remove_copyright_info: 4192 words
After remove_doi_and_journal_info: 4192 words
After remove_artifacts: 4193 words
After removing extra whitespace: 4193 words
After handle_special_characters: 4193 words
After remove_references: 3752 words
After detect_sentence_boundaries: 3754 words
Final preprocessed word count: 3754
Percentage of words retained after cleaning: 85.12%


Batch 1:  33%|███▎      | 2/6 [00:03<00:06,  1.65s/it]


Processing file: 13023_2021_Article_2145.txt
Detected encoding: utf-8 with confidence: 0.99
Starting preprocessing...
Initial word count: 6328
After reassemble_hyphenated_words: 6307 words
After remove_figures_tables: 6307 words
After remove_citations: 5920 words
After remove_urls: 5900 words
After remove_emails: 5899 words
After remove_numerical_references: 5899 words
After remove_headers: 5899 words
After remove_metadata: 5891 words
After remove_copyright_info: 5865 words
After remove_doi_and_journal_info: 5865 words
After remove_artifacts: 5866 words
After removing extra whitespace: 5866 words
After handle_special_characters: 5866 words
After remove_references: 4677 words
After detect_sentence_boundaries: 4677 words
Final preprocessed word count: 4677
Percentage of words retained after cleaning: 73.91%


Batch 1:  67%|██████▋   | 4/6 [00:05<00:02,  1.14s/it]


Processing file: 41433_2022_Article_2262.txt
Detected encoding: utf-8 with confidence: 0.99
Starting preprocessing...
Initial word count: 455
After reassemble_hyphenated_words: 453 words
After remove_figures_tables: 453 words
After remove_citations: 371 words
After remove_urls: 370 words
After remove_emails: 370 words
After remove_numerical_references: 370 words
After remove_headers: 370 words
After remove_metadata: 369 words
After remove_copyright_info: 369 words
After remove_doi_and_journal_info: 369 words
After remove_artifacts: 369 words
After removing extra whitespace: 369 words
After handle_special_characters: 369 words
After remove_references: 369 words
After detect_sentence_boundaries: 370 words
Final preprocessed word count: 370
Percentage of words retained after cleaning: 81.32%

Processing file: 41436_2020_Article_759.txt
Detected encoding: utf-8 with confidence: 0.99
Starting preprocessing...
Initial word count: 3511
After reassemble_hyphenated_words: 3499 words
After remo

Batch 1:  83%|████████▎ | 5/6 [00:06<00:01,  1.04s/it]


Processing file: 13023_2023_Article_2798.txt
Detected encoding: utf-8 with confidence: 0.99
Starting preprocessing...
Initial word count: 8723
After reassemble_hyphenated_words: 8695 words
After remove_figures_tables: 8693 words
After remove_citations: 8042 words
After remove_urls: 8034 words
After remove_emails: 8034 words
After remove_numerical_references: 8034 words
After remove_headers: 7963 words
After remove_metadata: 7955 words
After remove_copyright_info: 7955 words
After remove_doi_and_journal_info: 7955 words
After remove_artifacts: 7957 words
After removing extra whitespace: 7957 words
After handle_special_characters: 7957 words
After remove_references: 6251 words
After detect_sentence_boundaries: 6255 words
Final preprocessed word count: 6255
Percentage of words retained after cleaning: 71.71%


Batch 1: 100%|██████████| 6/6 [00:08<00:00,  1.49s/it]

Preprocessing and entity extraction complete.





In [50]:
import os

def separate_abstract_and_main_text(text: str) -> tuple:
    """
    Separates the abstract and main text from the given text.
    
    :param text: The input text containing both abstract and main text.
    :return: A tuple containing the abstract and main text as separate strings.
    """
    abstract_marker = "Abstract:"
    main_text_marker = "Main Text:"
    
    # Find the start of the abstract
    abstract_start = text.find(abstract_marker)
    # Find the start of the main text
    main_text_start = text.find(main_text_marker)
    
    # Extract abstract and main text
    if abstract_start != -1 and main_text_start != -1:
        abstract = text[abstract_start + len(abstract_marker):main_text_start].strip()
        main_text = text[main_text_start + len(main_text_marker):].strip()
    else:
        raise ValueError("The text does not contain the required markers for Abstract and Main Text.")
    
    return abstract, main_text

def process_preprocessed_files(input_folder_path: str, output_folder_path: str) -> None:
    """
    Processes all .txt files in the input folder, separates abstracts and main texts,
    and saves them into the output folder.
    
    :param input_folder_path: The path to the input folder containing .txt files.
    :param output_folder_path: The path to the output folder where separated files will be saved.
    """
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
    
    # Get a list of all .txt files in the input directory
    files = [f for f in os.listdir(input_folder_path) if f.endswith('.txt')]
    
    for filename in files:
        input_file_path = os.path.join(input_folder_path, filename)
        print(f"Processing file: {filename}")
        
        try:
            # Read the content of the file
            with open(input_file_path, 'r', encoding='utf-8') as file:
                file_content = file.read()
            
            # Separate abstract and main text
            abstract, main_text = separate_abstract_and_main_text(file_content)
            
            # Define paths for the abstract and main text files
            abstract_output_path = os.path.join(output_folder_path, f"{filename[:-4]}_abstract.txt")
            main_text_output_path = os.path.join(output_folder_path, f"{filename[:-4]}_main_text.txt")
            
            # Save the abstract
            with open(abstract_output_path, 'w', encoding='utf-8') as abstract_file:
                abstract_file.write(abstract)
            
            # Save the main text
            with open(main_text_output_path, 'w', encoding='utf-8') as main_text_file:
                main_text_file.write(main_text)
            
            print(f"Successfully processed and saved: {filename}")
        
        except Exception as e:
            print(f"Error processing file {filename}: {str(e)}")

# Example usage:
input_folder = '/mnt/data/skanda/MSc_IRD_LLM/data/data_preprocessed'
output_folder = '/mnt/data/skanda/MSc_IRD_LLM/data/data_separated'

process_preprocessed_files(input_folder, output_folder)


Processing file: processed_13287_2023_Article_3526.txt
Successfully processed and saved: processed_13287_2023_Article_3526.txt
Processing file: processed_13023_2023_Article_2798.txt
Successfully processed and saved: processed_13023_2023_Article_2798.txt
Processing file: processed_41436_2020_Article_759.txt
Successfully processed and saved: processed_41436_2020_Article_759.txt
Processing file: processed_13023_2021_Article_2145.txt
Successfully processed and saved: processed_13023_2021_Article_2145.txt
Processing file: processed_41433_2022_Article_2262.txt
Successfully processed and saved: processed_41433_2022_Article_2262.txt
Processing file: processed_12886_2023_Article_2772-compressed.txt
Successfully processed and saved: processed_12886_2023_Article_2772-compressed.txt
