In [7]:
import re
import unicodedata
import logging
from pathlib import Path
import nltk
from nltk.corpus import stopwords
import spacy
from collections import Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Load spaCy model
nlp = spacy.load("en_core_web_sm")
print("Loaded en_core_web_sm model")

# Custom stopwords - keep more words for readability
custom_stopwords = set(stopwords.words('english')) - {'no', 'not', 'nor', 'against', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

def setup_logging(log_file='processing.log'):
    """Set up logging configuration."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )

def clean_text(text):
    """Basic text cleaning."""
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

def extract_important_phrases(text, n=100):
    """Extract important phrases using TF-IDF."""
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]
    important_phrases = sorted(zip(feature_names, tfidf_scores), key=lambda x: x[1], reverse=True)[:n]
    return [phrase for phrase, score in important_phrases]

def identify_medical_entities(text):
    """Identify medical entities using spaCy."""
    doc = nlp(text)
    medical_entities = [ent.text for ent in doc.ents if ent.label_ in ['DISEASE', 'CHEMICAL', 'GENE']]
    return list(set(medical_entities))

def preprocess_text(text):
    """Preprocess the input text by cleaning and extracting important information."""
    try:
        # Basic cleaning
        cleaned_text = clean_text(text)
        
        # Extract important phrases
        important_phrases = extract_important_phrases(cleaned_text)
        
        # Identify medical entities
        medical_entities = identify_medical_entities(text)
        
        # Tokenize sentences
        sentences = nltk.sent_tokenize(text)
        
        # Process sentences
        processed_sentences = []
        for sent in sentences:
            words = nltk.word_tokenize(sent)
            preserved_words = [word for word in words 
                               if word.lower() not in custom_stopwords 
                               or word in important_phrases
                               or word in medical_entities
                               or word.isdigit()]
            processed_sentences.append(' '.join(preserved_words))
        
        # Join processed sentences
        processed_text = ' '.join(processed_sentences)
        
        return processed_text.strip()
    except Exception as e:
        logging.error(f"Error in preprocess_text: {e}")
        return ""

def process_papers(input_dir, output_dir, batch_size=10):
    """Process text files in batches in the input directory and save cleaned versions in the output directory."""
    try:
        input_path = Path(input_dir)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        all_files = list(input_path.glob('*.txt'))
        for i in range(0, len(all_files), batch_size):
            batch = all_files[i:i+batch_size]
            for text_file in tqdm(batch, desc=f"Processing batch {i//batch_size + 1}"):
                try:
                    with open(text_file, 'r', encoding='utf-8') as f:
                        text = f.read()
                    clean_text = preprocess_text(text)
                    if clean_text:  # Ensure there's something to write
                        output_file = output_path / f"{text_file.stem}_cleaned.txt"
                        with open(output_file, 'w', encoding='utf-8') as f:
                            f.write(clean_text)
                        logging.info(f"Processed and saved: {output_file}")
                    else:
                        logging.warning(f"No content to write for file: {text_file}")
                except Exception as e:
                    logging.error(f"Error processing file {text_file}: {e}")

        logging.info("Processing complete.")
    except Exception as e:
        logging.error(f"Error in process_papers: {e}")

def main(input_dir, output_dir, batch_size):
    """Main function to set up logging and start the processing of papers."""
    setup_logging()
    process_papers(input_dir, output_dir, batch_size)

if __name__ == "__main__":
    input_dir = './txt_data'
    output_dir = './preprocessed_data'
    batch_size = 10
    main(input_dir, output_dir, batch_size)

Loaded en_core_web_sm model


Processing batch 1:   0%|          | 0/10 [00:00<?, ?it/s]2024-07-23 00:17:11,078 - INFO - Processed and saved: preprocessed_data/cells-12-02579-compressed_cleaned.txt
Processing batch 1:  10%|█         | 1/10 [00:00<00:04,  2.01it/s]2024-07-23 00:17:11,328 - INFO - Processed and saved: preprocessed_data/ijms-22-07207_cleaned.txt
Processing batch 1:  20%|██        | 2/10 [00:00<00:02,  2.84it/s]2024-07-23 00:17:11,735 - INFO - Processed and saved: preprocessed_data/biomolecules-12-00455-compressed_cleaned.txt
Processing batch 1:  30%|███       | 3/10 [00:01<00:02,  2.65it/s]2024-07-23 00:17:11,982 - INFO - Processed and saved: preprocessed_data/EMMM-14-e15941-compressed_cleaned.txt
Processing batch 1:  40%|████      | 4/10 [00:01<00:01,  3.07it/s]2024-07-23 00:17:12,414 - INFO - Processed and saved: preprocessed_data/jcm-12-06953_cleaned.txt
Processing batch 1:  50%|█████     | 5/10 [00:01<00:01,  2.75it/s]2024-07-23 00:17:12,626 - INFO - Processed and saved: preprocessed_data/41436_20