In [1]:
import re
import fitz  # PyMuPDF

def clean_text(text):
    """
    Remove lines that look like formulas, index entries, or table-of-contents rows.

    The text is split into lines and each line is searched against the patterns
    below. A line is discarded as soon as any one pattern is found anywhere in
    it, so a single standalone number, or a letter/digit directly followed by an
    operator character (``-`` included), is enough to drop the whole line.

    Returns the surviving lines joined with a newline character.
    """
    noise_regex = re.compile("|".join((
        r"\.\.\.+\s*\d+",            # three or more dots, optional spaces, then digits
        r"^\d+\.\d+\s",              # line opens with a dotted number such as "1.1 "
        r"\b\d+\b",                  # any standalone run of digits
        r"[A-Za-z0-9]+[\^\+\-*/=<>]", # letter/digit immediately followed by an operator
        r"[A-Za-z0-9]+\s*[∈∀∃∅⊆∪∩≈∑∏∫θμϵλϕδΩ→≤≥]", # letter/digit followed by a math symbol
        r"[θμϵδ]+",                  # one or more of these Greek letters
        r"[<>≤≥=]{2,}",              # two or more consecutive comparison characters
        r"^\d+$",                    # line made up of digits only
    )))

    kept = []
    for candidate in text.splitlines():
        # Keep the line only when none of the noise patterns occurs in it.
        if noise_regex.search(candidate) is None:
            kept.append(candidate)

    return "\n".join(kept)

def extract_text_from_pdf(pdf_path, output_txt_path):
    """
    Extract the text of every page of a PDF, filter it with ``clean_text``,
    and write the result to a text file.

    Args:
        pdf_path: Path of the PDF to read (passed to ``fitz.open``).
        output_txt_path: Path of the text file to create or overwrite.

    Returns:
        None. The cleaned text is written to ``output_txt_path`` as UTF-8.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the file handle is never left open on error.
    with fitz.open(pdf_path) as pdf_document:
        # Gather the per-page text and join once rather than growing a string
        # page by page.
        page_texts = [page.get_text() for page in pdf_document]
    full_text = "".join(page_texts)

    # Clean the extracted text
    cleaned_text = clean_text(full_text)

    # Write UTF-8 explicitly: the extracted text may contain non-ASCII
    # characters that the platform's default encoding cannot represent.
    with open(output_txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(cleaned_text)

# Run the extraction step. Point the two paths below at the source PDF and at
# the text file that should receive the cleaned text.
pdf_path = "../INIGMETICA/AI_Russell_Norvig.pdf"
output_txt_path = "output_text_russell.txt"
extract_text_from_pdf(pdf_path=pdf_path, output_txt_path=output_txt_path)

print("Text extraction and cleaning completed.")


Text extraction and cleaning completed.


In [2]:
import re

def clean_text_file(input_txt_path, output_txt_path, min_length=20):
    """
    Copy a text file, dropping lines that match specific matrix/summation
    patterns and lines that are too short.

    Args:
        input_txt_path: Path of the UTF-8 text file to read.
        output_txt_path: Path of the text file to create or overwrite.
        min_length: Minimum number of characters a line must have, after
            stripping surrounding whitespace, to be kept. Defaults to 20.

    Returns:
        None. Kept lines are written to ``output_txt_path`` unchanged,
        including their original line endings.
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"=\s*m\s*X\s*xi\s*x⊤\s*i",  # Pattern for "= m X xi x⊤ i"
        r"b\s*=\s*m\s*X\s*yixi",     # Pattern for "b = m X yixi"
        r"A\s*=\s*\(.*\)",           # Matrix-like form pattern
        r"b\s*=\s*\("                # Pattern for matrix b with open parentheses
    ]

    # Combine all patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))

    # Read and write UTF-8 explicitly: the patterns above expect non-ASCII
    # characters such as "⊤", which the platform default encoding may not handle.
    with open(input_txt_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    # Keep lines that are long enough and match none of the patterns. All lines
    # are read before the output is opened, so both paths may name the same file.
    filtered_lines = [
        line for line in lines
        if len(line.strip()) >= min_length and not combined_pattern.search(line)
    ]

    # Write the cleaned lines to the output file
    with open(output_txt_path, "w", encoding="utf-8") as file:
        file.writelines(filtered_lines)

# Run the second cleaning pass. The input is the text file to filter and the
# output is the new file that receives the remaining lines.
input_txt_path = "output_text_russell.txt"
output_txt_path = "cleaned_text_russell.txt"
clean_text_file(input_txt_path=input_txt_path, output_txt_path=output_txt_path)

print("Text cleaning completed.")


Text cleaning completed.


In [None]:
import pandas as pd
import re

def _split_into_paragraphs(text):
    """Group the lines of ``text`` into paragraph strings using simple line heuristics."""
    paragraphs = []
    current_para = []

    for raw_line in text.split('\n'):
        line = raw_line.strip()

        if not line:
            # A blank line ends the paragraph being collected, if any.
            if current_para:
                paragraphs.append(' '.join(current_para))
                current_para = []
            continue

        # A line that opens with an uppercase letter, a digit, or a bullet
        # marker is treated as the start of a new paragraph.
        starts_new_paragraph = (
            line[0].isupper()
            or line[0].isdigit()
            or line.startswith(('•', '-'))
        )
        if starts_new_paragraph and current_para:
            paragraphs.append(' '.join(current_para))
            current_para = []

        # Either the first line of a new paragraph or a continuation line.
        current_para.append(line)

    # Flush whatever was still being collected at end of text.
    if current_para:
        paragraphs.append(' '.join(current_para))

    return paragraphs


def read_and_process_text_file(file_path, min_length=11):
    """
    Read a UTF-8 text file and return its paragraphs as a DataFrame.

    The text is split into paragraphs: a blank line ends a paragraph, and any
    line starting with an uppercase letter, a digit, '•' or '-' begins a new
    one. Runs of whitespace inside each paragraph are collapsed to single
    spaces, and paragraphs shorter than ``min_length`` characters are dropped.

    Args:
        file_path: Path of the text file to read.
        min_length: Minimum number of characters a paragraph must have to be
            kept. The default of 11 keeps paragraphs longer than 10 characters.

    Returns:
        A DataFrame with columns ``paragraph_number`` (1-based position) and
        ``paragraph``, or None when the file is missing or any other error
        occurs (a message is printed in both cases).
    """
    try:
        # Read the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Normalize whitespace and filter out very short segments
        cleaned_paragraphs = []
        for para in _split_into_paragraphs(text):
            cleaned = ' '.join(para.split())
            if cleaned and len(cleaned) >= min_length:
                cleaned_paragraphs.append(cleaned)

        # Build the frame with the number column first
        df = pd.DataFrame({
            'paragraph_number': range(1, len(cleaned_paragraphs) + 1),
            'paragraph': cleaned_paragraphs,
        })

        return df

    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"An error occurred: {str(e)}")
        return None

# Usage example
# The path must match the file written by the clean_text_file cell above,
# so that a fresh Restart & Run All finds its input.
file_path = 'cleaned_text_russell.txt'  # Replace with your text file path
df = read_and_process_text_file(file_path)
