In [38]:
import re
import fitz  # PyMuPDF

def clean_text(text):
    """
    Clean extracted text to exclude lines that resemble mathematical expressions,
    index entries, or table of contents entries (e.g., lines with ellipses or numbers).dd
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"\.\.\.+\s*\d+",            # Lines with ellipses followed by numbers
        r"^\d+\.\d+\s",              # Lines starting with a decimal number (e.g., "1.1 ")
        r"\b\d+\b",                  # Lines containing isolated numbers (page numbers, indexes)
        r"[A-Za-z0-9]+[\^\+\-*/=<>]", # Lines with mathematical operators
        r"[A-Za-z0-9]+\s*[∈∀∃∅⊆∪∩≈∑∏∫θμϵλϕδΩ→≤≥]", # Lines with symbols commonly in math
        r"[θμϵδ]+",                  # Greek symbols or Greek-like variables
        r"[<>≤≥=]{2,}",              # Comparison operators often found in math expressions
        r"^\d+$",                    # Lines with only numbers
    ]
    
    # Combine all patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    # Split the text into lines, filter them, and rejoin
    filtered_lines = [
        line for line in text.splitlines()
        if not combined_pattern.search(line)
    ]
    
    return "\n".join(filtered_lines)

def extract_text_from_pdf(pdf_path, output_txt_path):
    """
    Extract text from a PDF, clean it using clean_text function, and save it to a text file.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    full_text = ""

    # Iterate through pages and extract text
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        full_text += page.get_text()

    # Clean the extracted text
    cleaned_text = clean_text(full_text)
    
    # Save cleaned text to output file
    with open(output_txt_path, "w") as txt_file:
        txt_file.write(cleaned_text)

    # Close the PDF
    pdf_document.close()

# Usage
pdf_path = "../INIGMETICA/AI_Russell_Norvig.pdf"         # Replace with your PDF file path
output_txt_path = "output_text_russell.txt"    # Replace with desired output text file path
extract_text_from_pdf(pdf_path, output_txt_path)

print("Text extraction and cleaning completed.")


Text extraction and cleaning completed.


In [39]:
import re

def clean_text_file(input_txt_path, output_txt_path):
    """
    Reads a text file, removes lines with specific matrix, summation, or symbol patterns,
    and removes lines with fewer than 20 characters. Writes the cleaned content to a new file.
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"=\s*m\s*X\s*xi\s*x⊤\s*i",  # Pattern for "= m X xi x⊤ i"
        r"b\s*=\s*m\s*X\s*yixi",     # Pattern for "b = m X yixi"
        r"A\s*=\s*\(.*\)",           # Matrix-like form pattern
        r"b\s*=\s*\("                # Pattern for matrix b with open parentheses
    ]
    
    # Combine all patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    with open(input_txt_path, "r") as file:
        lines = file.readlines()

    # Filter lines by patterns and length
    filtered_lines = [
        line for line in lines
        if not combined_pattern.search(line) and len(line.strip()) >= 20
    ]

    # Write the cleaned lines to the output file
    with open(output_txt_path, "w") as file:
        file.writelines(filtered_lines)

# Usage
input_txt_path = "output_text_russell.txt"      # Replace with the path to your input text file
output_txt_path = "cleaned_text_russell.txt"   # Replace with the desired output file path
clean_text_file(input_txt_path, output_txt_path)

print("Text cleaning completed.")


Text cleaning completed.


In [34]:
import pandas as pd
import re

def read_and_process_text_file(file_path):
    try:
        # Read the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            
        # Split text into paragraphs using a more specific pattern
        # This pattern looks for:
        # 1. Paragraphs starting with capital letters after line breaks
        # 2. Sections separated by blank lines
        # 3. Numbered or bulleted sections
        paragraphs = []
        current_para = []
        
        # Split into lines first
        lines = text.split('\n')
        
        for line in lines:
            # Remove leading/trailing whitespace
            line = line.strip()
            
            # If line is empty and we have collected some text
            if not line and current_para:
                # Join the collected lines and add to paragraphs
                paragraphs.append(' '.join(current_para))
                current_para = []
            # If line starts with bullet points, numbers, or is a new section
            elif line and (line[0].isupper() or 
                         line[0].isdigit() or 
                         line.startswith('•') or 
                         line.startswith('-')):
                # If we have a previous paragraph, save it
                if current_para:
                    paragraphs.append(' '.join(current_para))
                    current_para = []
                current_para.append(line)
            # If it's a continuation line with content
            elif line:
                current_para.append(line)
        
        # Add the last paragraph if exists
        if current_para:
            paragraphs.append(' '.join(current_para))
        
        # Clean paragraphs
        cleaned_paragraphs = []
        for para in paragraphs:
            # Clean up extra spaces and join lines
            cleaned = ' '.join(para.split())
            if cleaned and len(cleaned) > 10:  # Minimum length to filter out very short segments
                cleaned_paragraphs.append(cleaned)
        
        # Create DataFrame
        df = pd.DataFrame(cleaned_paragraphs, columns=['paragraph'])
        
        # Add paragraph number
        df['paragraph_number'] = range(1, len(df) + 1)
        
        # Reorder columns
        df = df[['paragraph_number', 'paragraph']]
        
        return df
    
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Usage example
file_path = 'cleaned_text.txt'  # Replace with your text file path
df = read_and_process_text_file(file_path)


In [33]:
import pandas as pd
import re

def process_text_to_df(text):
    # Split text into paragraphs
    # Looking for double newlines or combinations of newlines and spaces
    paragraphs = re.split(r'\n\s*\n', text)
    
    # Clean paragraphs
    cleaned_paragraphs = []
    for para in paragraphs:
        # Remove extra whitespace and newlines
        cleaned = ' '.join(para.split())
        if cleaned:  # Only add non-empty paragraphs
            cleaned_paragraphs.append(cleaned)
    
    # Create DataFrame
    df = pd.DataFrame(cleaned_paragraphs, columns=['paragraph'])
    
    # Add paragraph number
    df['paragraph_number'] = range(1, len(df) + 1)
    
    # Reorder columns
    df = df[['paragraph_number', 'paragraph']]
    
    return df

# Extract the document content
text = """[Your document content here]"""  # Replace with actual document content

# Create DataFrame
df = process_text_to_df(text)

# Display first few rows
print(df.head())

   paragraph_number                     paragraph
0                 1  [Your document content here]


In [36]:
df.to_csv("new.csv")



In [14]:
import re
import fitz  # PyMuPDF

def clean_text(text):
    """
    Clean extracted text to exclude lines that resemble mathematical expressions,
    index entries, or table of contents entries (e.g., lines with ellipses or numbers).dd
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"\.\.\.+\s*\d+",            # Lines with ellipses followed by numbers
        r"^\d+\.\d+\s",              # Lines starting with a decimal number (e.g., "1.1 ")
        r"\b\d+\b",                  # Lines containing isolated numbers (page numbers, indexes)
        r"[A-Za-z0-9]+[\^\+\-*/=<>]", # Lines with mathematical operators
        r"[A-Za-z0-9]+\s*[∈∀∃∅⊆∪∩≈∑∏∫θμϵλϕδΩ→≤≥]", # Lines with symbols commonly in math
        r"[θμϵδ]+",                  # Greek symbols or Greek-like variables
        r"[<>≤≥=]{2,}",              # Comparison operators often found in math expressions
        r"^\d+$",                    # Lines with only numbers
    ]
    
    # Combine all patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    # Split the text into lines, filter them, and rejoin
    filtered_lines = [
        line for line in text.splitlines()
        if not combined_pattern.search(line)
    ]
    
    return "\n".join(filtered_lines)

def extract_text_from_pdf(pdf_path, output_txt_path):
    """
    Extract text from a PDF, clean it using clean_text function, and save it to a text file.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    full_text = ""

    # Iterate through pages and extract text
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        full_text += page.get_text()

    # Clean the extracted text
    cleaned_text = clean_text(full_text)
    
    # Save cleaned text to output file
    with open(output_txt_path, "w") as txt_file:
        txt_file.write(cleaned_text)

    # Close the PDF
    pdf_document.close()

# Usage
pdf_path = "Understanding_Machine_Learning_From_Theory_to_Algorithms.pdf"         # Replace with your PDF file path
output_txt_path = "output_text.txt"    # Replace with desired output text file path
extract_text_from_pdf(pdf_path, output_txt_path)

print("Text extraction and cleaning completed.")


Text extraction and cleaning completed.


In [11]:
import re
import fitz  # PyMuPDF
import pandas as pd

def clean_text(text):
    """
    Clean extracted text to exclude lines with only numbers, table of contents-like entries,
    and obvious mathematical symbols. Keep filtering minimal to avoid removing too much text.
    """
    # Minimal patterns to exclude unwanted lines
    unwanted_patterns = [
        r"\.\.\.+\s*\d+",           # Lines with ellipses followed by numbers (e.g., table of contents)
        r"^\d+$",                   # Lines with only numbers (indices or page numbers)
        r"[≥≤∑∫∂∈∀→⇔]+",            # Lines with certain mathematical symbols
    ]
    
    # Combine patterns into a single regex
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    # Split text by paragraphs, filter, and rejoin
    filtered_paragraphs = [
        paragraph.strip() for paragraph in text.split('\n\n')
        if not combined_pattern.search(paragraph.strip()) and len(paragraph.strip()) > 0
    ]
    
    return filtered_paragraphs

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF, clean it using clean_text function, and return it as a list of paragraphs.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    full_text = ""

    # Iterate through pages and extract text
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        full_text += page.get_text()

    # Clean and split the extracted text into paragraphs
    cleaned_paragraphs = clean_text(full_text)
    pdf_document.close()
    return cleaned_paragraphs

def save_paragraphs_to_dataframe(paragraphs):
    """
    Save paragraphs to a DataFrame where each row is a paragraph.
    """
    df = pd.DataFrame(paragraphs, columns=["Paragraph"])
    return df

# Usage
pdf_path = "Understanding_Machine_Learning_From_Theory_to_Algorithms.pdf"   # Replace with your PDF file path
paragraphs = extract_text_from_pdf(pdf_path)

# Check cleaned paragraphs before saving to DataFrame
for i, paragraph in enumerate(paragraphs[:5]):
    print(f"Paragraph {i+1}:", paragraph)

df = save_paragraphs_to_dataframe(paragraphs)
print("DataFrame:")
print(df.head())


DataFrame:
Empty DataFrame
Columns: [Paragraph]
Index: []


In [9]:
cleaned_paragraphs

NameError: name 'cleaned_paragraphs' is not defined

In [7]:
df

Unnamed: 0,Paragraph


In [13]:
import re
import fitz  # PyMuPDF

def clean_text(text):
    """
    Clean extracted text to exclude lines that resemble mathematical expressions,
    index entries, or table of contents entries (e.g., lines with ellipses or numbers).
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"\.\.\.+\s*\d+",       # Lines with ellipses followed by numbers
        r"^\d+\.\d+\s",         # Lines starting with a decimal number (e.g., "1.1 ")
        r"\b\d+\b",             # Lines containing isolated numbers (page numbers, indexes)
        r"\b[0-9]+\b",          # Isolated numbers
        r"^\d+$",               # Lines with only numbers
    ]
    
    # Combine all patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    # Split the text into lines, filter them, and rejoin
    filtered_lines = [
        line for line in text.splitlines()
        if not combined_pattern.search(line)
    ]
    
    return "\n".join(filtered_lines)

def extract_text_from_pdf(pdf_path, output_txt_path):
    """
    Extract text from a PDF, clean it using clean_text function, and save it to a text file.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    full_text = ""

    # Iterate through pages and extract text
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        full_text += page.get_text()

    # Clean the extracted text
    cleaned_text = clean_text(full_text)
    
    # Save cleaned text to output file
    with open(output_txt_path, "w") as txt_file:
        txt_file.write(cleaned_text)

    # Close the PDF
    pdf_document.close()

# Usage
pdf_path = "Understanding_Machine_Learning_From_Theory_to_Algorithms.pdf"         # Replace with your PDF file path
output_txt_path = "output_text.txt"    # Replace with desired output text file path
extract_text_from_pdf(pdf_path, output_txt_path)

print("Text extraction and cleaning completed.")


Text extraction and cleaning completed.


In [1]:
import fitz  # PyMuPDF
def scrap_pdf(pdf_path):
    # Path to save the extracted text file
    output_txt_path = "Extracted_Text.txt"

    # Open the PDF file
    with fitz.open(pdf_path) as pdf_document:
        # Open the output text file in write mode
        with open(output_txt_path, "w", encoding="utf-8") as text_file:
            # Loop through each page
            for page_num in range(pdf_document.page_count):
                # Get the page
                page = pdf_document[page_num]
                # Extract text from the page
                text = page.get_text("text")
                # Write text to the file
                text_file.write(f"Page {page_num + 1}\n")
                text_file.write(text)
                text_file.write("\n" + "="*80 + "\n")  # Separator between pages

    print(f"Text extraction complete. The text has been saved to {output_txt_path}")


In [2]:
scrap_pdf("Understanding_Machine_Learning_From_Theory_to_Algorithms.pdf")

Text extraction complete. The text has been saved to Extracted_Text.txt


In [3]:
scrap_pdf("AI_Russell_Norvig.pdf")

Text extraction complete. The text has been saved to Extracted_Text.txt


In [10]:
import pandas as pd

# Read text from a .txt file
with open('Extracted_Text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text into paragraphs by double newlines
paragraphs = text.split("\n\n")

# Remove paragraphs that are only "=====" or similar patterns
paragraphs = [p for p in paragraphs if not p.strip().startswith('=') and len(p.strip()) > 0]

# Create a DataFrame where each row is a paragraph
df = pd.DataFrame(paragraphs, columns=['Paragraph'])

# Display the DataFrame
print(df)


                                            Paragraph
0   Page 1\nUnderstanding Machine Learning:\nFrom ...
1                                 import pandas as pd
2   # Your complete text, divided by double newlin...
3   # Split the text by double newline to separate...
4   # Create a DataFrame where each row is a parag...
5                  # Display the DataFrame\nprint(df)
6   Page 53\n3.5 Exercises\n53\nof the two trainin...
7                                   m\nX\ni=1\nαiyixi
8   2\n+\nm\nX\ni=1\nαi\n\n1 −yi\n*X\nj\nαjyjxj,...
9                                   \nm\nX\ni=1\nσixi
10                          2\n#\n= E\nσ\n\n\n\n
11                                    m\nX\ni=1\nσixi
12              2\n2\n\n\n1/2\n≤\n\nE\nσ\n\n
13                                    m\nX\ni=1\nσixi
14                  2\n2\n\n\n\n\n1/2\n.\n(26.16)


In [9]:
import pandas as pd
import re

# Read the text file
with open("Extracted_Text.txt", "r") as file:
    text = file.read()

# Split the text by paragraphs (assuming paragraphs are separated by blank lines)
paragraphs = re.split(r'\n\s*\n', text.strip())

# Create a DataFrame with each paragraph as a row
df = pd.DataFrame(paragraphs, columns=["Paragraph"])

# Display or save the DataFrame
print(df)

# Optionally, save to a CSV file
df.to_csv("paragraphs_output.csv", index=False)


                                             Paragraph
0    Page 1\nUnderstanding Machine Learning:\nFrom ...
..                                                 ...

[528 rows x 1 columns]


In [8]:
import pandas as pd

# Read text from a .txt file
with open('Extracted_Text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text by double newline to separate paragraphs
paragraphs = text.split("\n\n")

# Filter out lines that only contain "=====" or similar patterns
paragraphs = [p for p in paragraphs if not p.strip().startswith('=')]

# Create a DataFrame where each row is a paragraph
df = pd.DataFrame(paragraphs, columns=['Paragraph'])

# Display the DataFrame
print(df)


                                            Paragraph
0   Page 1\nUnderstanding Machine Learning:\nFrom ...
1                                 import pandas as pd
2   # Your complete text, divided by double newlin...
3   # Split the text by double newline to separate...
4   # Create a DataFrame where each row is a parag...
5                  # Display the DataFrame\nprint(df)
6   Page 53\n3.5 Exercises\n53\nof the two trainin...
7                                                    
8                                                    
9                                   m\nX\ni=1\nαiyixi
10                                                   
11                                                   
12  2\n+\nm\nX\ni=1\nαi\n\n1 −yi\n*X\nj\nαjyjxj,...
13                                                   
14                                  \nm\nX\ni=1\nσixi
15                                                   
16                                                   
17                          

In [None]:
import pandas as pd

# Read text from a .txt file
with open('Extracted_Text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text by double newline to separate paragraphs
paragraphs = text.split("\n\n")

# Create a DataFrame where each row is a paragraph
df = pd.DataFrame(paragraphs, columns=['Paragraph'])

# Display the DataFrame
print(df)


FileNotFoundError: [Errno 2] No such file or directory: 'your_file.txt'

In [6]:
df.head(5)

Unnamed: 0,Paragraph
0,Page 1\nUnderstanding Machine Learning:\nFrom ...
1,==============================================...
2,==============================================...
3,==============================================...
4,==============================================...
