In [1]:
pip install PyMuPDF nltk

Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import nltk
import fitz
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re  

# Fetch the NLTK resources used by the preprocessing functions below.
# 'punkt_tab' is the tokenizer data that word_tokenize loads in recent NLTK
# releases (3.9+); 'punkt' is kept so older releases keep working.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Any exception raised while opening or reading the document is reported
    with ``print`` and swallowed; the text gathered before the failure
    (possibly the empty string) is returned.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as pdf_document:
            for pdf_page in pdf_document:
                page_texts.append(pdf_page.get_text())
    except Exception as err:
        print(f"Error processing PDF {pdf_path}: {err}")
    return "".join(page_texts)

def remove_urls(text):
    """Strip URLs from ``text``.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character; every match is replaced
    by the empty string.
    """
    return re.compile(r'https?://\S+|www\.\S+').sub('', text)

def preprocess_text(text, preserve_numbers=True):
    """Normalise raw document text into a space-separated string of lemmas.

    Steps: strip URLs, lowercase, tokenize with NLTK, keep only purely
    alphabetic tokens (plus all-digit tokens when ``preserve_numbers`` is
    true), drop English stopwords, and lemmatize what is left.

    Args:
        text: Raw text, e.g. as extracted from a PDF.
        preserve_numbers: When true, tokens made up solely of digits are kept.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    # These quantifier/negation words are deliberately kept in the output.
    my_stopwords = set(stopwords.words('english')) - {'some', 'any', 'no', 'not', 'only'}
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(remove_urls(text).lower())
    kept_tokens = [
        token for token in tokens
        if token.isalpha() or (preserve_numbers and token.isdigit())
    ]

    # Remove stopwords BEFORE lemmatizing: the WordNet noun lemmatizer can
    # rewrite a stopword into a string that is no longer in the stopword list
    # (e.g. 'was' -> 'wa', 'has' -> 'ha'), which would then leak through.
    content_tokens = [token for token in kept_tokens if token not in my_stopwords]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in content_tokens]

    # Second pass: still drop any lemma that is itself a stopword, as the
    # original post-lemmatization filter did.
    filtered_tokens = [token for token in lemmatized_tokens if token not in my_stopwords]

    return ' '.join(filtered_tokens)

# NOTE: machine-specific absolute path; point it at the local checkout of the
# resources repository before running.
root_dir = r'C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources'

# os.walk yields nothing for a missing directory, so report it instead of
# silently doing no work.
if not os.path.isdir(root_dir):
    print(f"Root directory not found: {root_dir}")

# For every directory that contains a 'Raw' sub-folder, convert each PDF in
# 'Raw' into '<name>_processed.txt' inside a sibling 'Processed' folder.
for root, dirs, files in os.walk(root_dir, topdown=True):
    dirs[:] = [d for d in dirs if d not in {'Processed', '.git'}]  # Exclude 'Processed' and '.git' directories
    for dir_name in dirs:
        raw_path = os.path.join(root, dir_name, 'Raw')
        processed_path = os.path.join(root, dir_name, 'Processed')

        if not os.path.isdir(raw_path):
            continue
        os.makedirs(processed_path, exist_ok=True)

        for file_name in os.listdir(raw_path):
            # Only PDFs are ever processed; check this first so other files
            # never trigger a misleading "already processed" message.
            if not file_name.lower().endswith('.pdf'):
                continue

            file_path = os.path.join(raw_path, file_name)
            output_file_name = os.path.splitext(file_name)[0] + '_processed.txt'
            output_file_path = os.path.join(processed_path, output_file_name)

            if os.path.exists(output_file_path):
                print(f"Skipping already processed file: {output_file_path}")
                continue

            text = extract_text_from_pdf(file_path)
            if not text.strip():
                # Extraction failed or the PDF has no text. Writing an empty
                # output would make later runs skip this file for good.
                print(f"No text extracted, not writing output for: {file_path}")
                continue

            preprocessed_text = preprocess_text(text)

            try:
                with open(output_file_path, 'w', encoding='utf-8') as f:
                    f.write(preprocessed_text)
                print(f"Processed file saved to: {output_file_path}")
            except Exception as e:
                print(f"Error saving processed file {output_file_path}: {e}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\steph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steph\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\steph\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Skipping already processed file: C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\Deep Learning\Processed\Alzubaidi_2021_Revew DL concepts_processed.txt
Skipping already processed file: C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\Deep Learning\Processed\Angermueller-2016-DL_processed.txt
Skipping already processed file: C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\Deep Learning\Processed\Cao-2018-DL_processed.txt
Skipping already processed file: C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\Deep Learning\Processed\Chahal_2019_Machine Learning and Deep Learning_processed.txt
Skipping already processed file: C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\Deep Learning\Processed\Chen-2014-DL_processed.txt
Skipping already processed file: C:\Users\steph\OneDrive\Documents\GitHub\ML-Literature-Search-Engine-Resources\Deep Learnin