In [106]:
import re
import fitz  # PyMuPDF

def clean_text(text):
    """
    Clean extracted text to exclude lines that resemble mathematical expressions,
    index entries, or table of contents entries (e.g., lines with ellipses or numbers).dd
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"\.\.\.+\s*\d+",            # Lines with ellipses followed by numbers
        r"^\d+\.\d+\s",              # Lines starting with a decimal number (e.g., "1.1 ")
        r"\b\d+\b",                  # Lines containing isolated numbers (page numbers, indexes)
        # Cleaning mathematical formule and expression..
        r"[A-Za-z0-9]+[\^\+\-*/=<>]", # Lines with mathematical operators
        r"[A-Za-z0-9]+\s*[∈∀∃∅⊆∪∩≈∑∏∫θμϵλϕδΩ→≤≥]", # Lines with symbols commonly in math
        r"[θμϵδ]+",                  # Greek symbols or Greek-like variables
        r"[<>≤≥=]{2,}",              # Comparison operators often found in math expressions
        r"^\d+$",                    # Lines with only numbers
    ]
    
    # Combine all patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    # Split the text into lines, filter them, and rejoin
    filtered_lines = [
        line for line in text.splitlines()
        if not combined_pattern.search(line)
    ]
    
    return "\n".join(filtered_lines)

def extract_text_from_pdf(pdf_path, output_txt_path):
    """
    Extract text from a PDF, clean it using clean_text function, and save it to a text file.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    full_text = ""

    # Iterate through pages and extract text
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        full_text += page.get_text()

    # Clean the extracted text
    cleaned_text = clean_text(full_text)
    
    # Save cleaned text to output file
    with open(output_txt_path, "w") as txt_file:
        txt_file.write(cleaned_text)

    # Close the PDF
    pdf_document.close()

# Usage
pdf_path = "../books/AI_Russell_Norvig.pdf"         # Replace with your PDF file path
output_txt_path = "output_text_russell.txt"    # Replace with desired output text file path
extract_text_from_pdf(pdf_path, output_txt_path)

print("Text extraction and cleaning completed.")


Text extraction and cleaning completed.


# I am checking

In [130]:
import re
import fitz  # PyMuPDF

def clean_text(text):
    """
    Clean extracted text to exclude lines that resemble mathematical expressions,
    index entries, or table of contents entries (e.g., lines with ellipses or numbers).
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"\.\.\.+\s*\d+",            # Lines with ellipses followed by numbers
        r"^\d+\.\d+\s",              # Lines starting with a decimal number (e.g., "1.1 ")
        r"\b\d+\b",                  # Lines containing isolated numbers (page numbers, indexes)
        # Cleaning mathematical formule and expression..
        r"[A-Za-z0-9]+[\^\+\-*/=<>]", # Lines with mathematical operators
        r"[A-Za-z0-9]+\s*[∈∀∃∅⊆∪∩≈∑∏∫θμϵλϕδΩ→≤≥]", # Lines with symbols commonly in math
        r"[θμϵδ]+",                  # Greek symbols or Greek-like variables
        r"[<>≤≥=]{2,}",              # Comparison operators often found in math expressions
        r"^\d+$",                    # Lines with only numbers
    ]
    
    # Combine all patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    # Split the text into lines, filter them, and rejoin
    filtered_lines = [
        line for line in text.splitlines()
        if not combined_pattern.search(line)
    ]
    
    return "\n".join(filtered_lines)

def is_page_unwanted(page_text):
    """
    Determine if a page should be skipped based on its content.
    
    Skips pages starting with "Contents", "Index", or pages with mostly short-form text and numbers.
    
    Parameters:
        page_text (str): The extracted text from the page.
    
    Returns:
        bool: True if the page is unwanted, False otherwise.
    """
    # Check if the page starts with "Contents" or "Index"
    # if re.match(r"^(Contents|Index)|bibliography", page_text, re.IGNORECASE):
    #     return True
    # Split text into lines
    lines = page_text.splitlines()
    
    # Check the first line or second line (if it exists)
    if len(lines) > 0 and re.match(r"^(Contents|Index|Bibliography)", lines[0], re.IGNORECASE):
        return True
    if len(lines) > 1 and re.match(r"^(Contents|Index|Bibliography)", lines[1], re.IGNORECASE):
        return True


    
    # Count the number of lines that are just short-form text (numbers, symbols, etc.)
    
    # short_form_pattern = r"^[A-Za-z0-9\s\+\-\*/=<>≤≥\.,]*$"  # Matches simple short-form lines
    # short_form_lines = [line for line in page_text.splitlines() if re.match(short_form_pattern, line)]
    
    # If more than 70% of the lines are short-form, consider the page unwanted
    # if len(page_text.splitlines()):
    #     if len(short_form_lines) / len(page_text.splitlines()) > 0.8:
    #         return True
    
    return False

def extract_text_from_pdf(pdf_path, output_txt_path):
    """
    Extract text from a PDF, clean it using clean_text function, and save it to a text file.
    Skips pages starting with "Contents", "Index" or pages with high short-form text.
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    full_text = ""

    # Iterate through pages and extract text
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        page_text = page.get_text()

        # Skip page if it starts with "Contents", "Index", or is mostly short-form text
        if is_page_unwanted(page_text):
            print(f"Skipping page {page_num + 1}: Contains unwanted content")
            continue

        # Clean the extracted text
        cleaned_text = clean_text(page_text)
        full_text += cleaned_text
    
    # Save cleaned text to output file
    with open(output_txt_path, "w") as txt_file:
        txt_file.write(full_text)

    # Close the PDF
    pdf_document.close()

# Usage
pdf_path = "../books/AI_Russell_Norvig.pdf"         # Replace with your PDF file path
output_txt_path = "output_text_russell.txt"    # Replace with desired output text file path
extract_text_from_pdf(pdf_path, output_txt_path)

print("Text extraction and cleaning completed.")


Skipping page 14: Contains unwanted content
Skipping page 15: Contains unwanted content
Skipping page 16: Contains unwanted content
Skipping page 17: Contains unwanted content
Skipping page 18: Contains unwanted content
Skipping page 19: Contains unwanted content
Skipping page 1082: Contains unwanted content
Skipping page 1083: Contains unwanted content
Skipping page 1084: Contains unwanted content
Skipping page 1085: Contains unwanted content
Skipping page 1086: Contains unwanted content
Skipping page 1087: Contains unwanted content
Skipping page 1088: Contains unwanted content
Skipping page 1089: Contains unwanted content
Skipping page 1090: Contains unwanted content
Skipping page 1091: Contains unwanted content
Skipping page 1092: Contains unwanted content
Skipping page 1093: Contains unwanted content
Skipping page 1094: Contains unwanted content
Skipping page 1095: Contains unwanted content
Skipping page 1096: Contains unwanted content
Skipping page 1097: Contains unwanted content


In [108]:
import re

def clean_text_file(input_txt_path, output_txt_path):
    """
    Reads a text file, removes lines with specific matrix, summation, or symbol patterns,
    and removes lines with fewer than 20 characters. Writes the cleaned content to a new file.
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"=\s*m\s*X\s*xi\s*x⊤\s*i",  # Pattern for "= m X xi x⊤ i"
        r"b\s*=\s*m\s*X\s*yixi",     # Pattern for "b = m X yixi"
        r"A\s*=\s*\(.*\)",           # Matrix-like form pattern
        r"b\s*=\s*\("                # Pattern for matrix b with open parentheses
    ]
    
    # Combine all patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    with open(input_txt_path, "r") as file:
        lines = file.readlines()

    # Filter lines by patterns and length
    filtered_lines = [
        line for line in lines
        if not combined_pattern.search(line) and len(line.strip()) >= 20
    ]

    # Write the cleaned lines to the output file
    with open(output_txt_path, "w") as file:
        file.writelines(filtered_lines)

# Usage
input_txt_path = "output_text_russell.txt"      # Replace with the path to your input text file
output_txt_path = "cleaned_text_russell.txt"   # Replace with the desired output file path
clean_text_file(input_txt_path, output_txt_path)

print("Text cleaning completed.")


Text cleaning completed.


# checking


In [131]:
import re

def clean_text_file(input_txt_path, output_txt_path):
    """
    Reads a text file, removes lines with specific unwanted patterns, and filters out lines
    with more than 50% short forms or fewer than 20 characters. Writes the cleaned content to a new file.
    """
    # Regular expression pattern to identify unwanted lines
    unwanted_patterns = [
        r"=\s*m\s*X\s*xi\s*x⊤\s*i",  # Pattern for "= m X xi x⊤ i"
        r"b\s*=\s*m\s*X\s*yixi",     # Pattern for "b = m X yixi"
        r"A\s*=\s*\(.*\)",           # Matrix-like form pattern
        r"b\s*=\s*\("                # Pattern for matrix b with open parentheses
    ]
    
    # Regular expression to identify short forms (numbers, symbols, or very short words)
    short_form_pattern = re.compile(r"^[A-Za-z0-9\s\+\-\*/=<>≤≥\.,!?]*$")
    
    # Combine all unwanted patterns into one
    combined_pattern = re.compile("|".join(unwanted_patterns))
    
    with open(input_txt_path, "r") as file:
        lines = file.readlines()

    # Filter lines based on unwanted patterns, length, and short-form proportion
    filtered_lines = []
    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip empty lines
        
        # Skip lines matching unwanted patterns
        if combined_pattern.search(line):
            continue
        
        # Skip lines shorter than 20 characters
        if len(line) < 20:
            continue
        
        # Calculate the proportion of short-form elements
        words = line.split()
        short_form_count = sum(1 for word in words if short_form_pattern.match(word) and len(word) <= 3)
        if short_form_count / len(words) > 0.5:
            continue  # Skip if more than 50% of the line is short forms
        
        filtered_lines.append(line + "\n")

    # Write the cleaned lines to the output file
    with open(output_txt_path, "w") as file:
        file.writelines(filtered_lines)

# Usage
input_txt_path = "output_text_russell.txt"      # Replace with the path to your input text file
output_txt_path = "cleaned_text_russell.txt"   # Replace with the desired output file path
clean_text_file(input_txt_path, output_txt_path)

print("Text cleaning completed.")


Text cleaning completed.


In [132]:
import pandas as pd
import re

def read_and_process_text_file(file_path):
    try:
        # Read the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            
        # Split text into paragraphs using a more specific pattern
        # This pattern looks for:
        # 1. Paragraphs starting with capital letters after line breaks
        # 2. Sections separated by blank lines
        # 3. Numbered or bulleted sections
        paragraphs = []
        current_para = []
        
        # Split into lines first
        lines = text.split('\n')
        
        for line in lines:
            # Remove leading/trailing whitespace
            line = line.strip()
            
            # If line is empty and we have collected some text
            if not line and current_para:
                # Join the collected lines and add to paragraphs
                paragraphs.append(' '.join(current_para))
                current_para = []
            # If line starts with bullet points, numbers, or is a new section
            elif line and (line[0].isupper() or 
                         line[0].isdigit() or 
                         line.startswith('•') or 
                         line.startswith('-')):
                # If we have a previous paragraph, save it
                if current_para:
                    paragraphs.append(' '.join(current_para))
                    current_para = []
                current_para.append(line)
            # If it's a continuation line with content
            elif line:
                current_para.append(line)
        
        # Add the last paragraph if exists
        if current_para:
            paragraphs.append(' '.join(current_para))
        
        # Clean paragraphs
        cleaned_paragraphs = []
        for para in paragraphs:
            # Clean up extra spaces and join lines
            cleaned = ' '.join(para.split())
            if cleaned and len(cleaned) > 10:  # Minimum length to filter out very short segments
                cleaned_paragraphs.append(cleaned)
        
        # Create DataFrame
        df = pd.DataFrame(cleaned_paragraphs, columns=['paragraph'])
        
        # Add paragraph number
        df['paragraph_number'] = range(1, len(df) + 1)
        
        # Reorder columns
        df = df[['paragraph_number', 'paragraph']]
        
        return df
    
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Usage example
file_path = '/home/saurabh/Desktop/Enigmatica /Enigmatica/cleaned_text_russell.txt'  # Replace with your text file path
df = read_and_process_text_file(file_path)


In [133]:
df

Unnamed: 0,paragraph_number,paragraph
0,1,Artiﬁcial Intelligence
1,2,Third EditionPRENTICE HALL SERIES
2,3,IN ARTIFICIAL INTELLIGENCE
3,4,"Stuart Russell and Peter Norvig, Editors"
4,5,Computer Vision: A Modern Approach
...,...,...
5313,5314,Example of a generator function and its invoca...
5314,5315,"• Lists: [x, y, z] denotes a list of three ele..."
5315,5316,"• Sets: {x, y, z} denotes a set of three eleme..."
5316,5317,Most of the algorithms in the book have been i...


In [134]:
import pandas as pd
import re

def read_and_process_text_file(file_path):
    try:
        # Read the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        
        # Use regex to split text by sentences ending with a period
        # This considers sentences that end with '.', '!', or '?'
        sentence_pattern = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+'
        sentences = re.split(sentence_pattern, text)
        
        # Combine sentences into paragraphs with a minimum length of 10 characters
        paragraphs = []
        current_para = []
        
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) > 10:  # Avoid very short fragments
                current_para.append(sentence)
                # If the current paragraph exceeds a reasonable length, create a new paragraph
                if len(' '.join(current_para)) > 300:  # Example threshold
                    paragraphs.append(' '.join(current_para))
                    current_para = []
        
        # Add the last paragraph if it exists
        if current_para:
            paragraphs.append(' '.join(current_para))
        
        # Create a DataFrame
        df = pd.DataFrame(paragraphs, columns=['paragraph'])
        
        # Add paragraph numbers
        df['paragraph_number'] = range(1, len(df) + 1)
        
        # Reorder columns
        df = df[['paragraph_number', 'paragraph']]
        
        return df
    
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Usage example
file_path = '/home/saurabh/Desktop/Enigmatica /Enigmatica/cleaned_text_russell.txt'  # Replace with your text file path
df = read_and_process_text_file(file_path)

# Save DataFrame to a CSV file (optional)
if df is not None:
    output_path = '/home/saurabh/Desktop/paragraphs.csv'
    df.to_csv(output_path, index=False)
    print(f"Paragraphs saved to {output_path}")

# Print a preview
if df is not None:
    print(df.head())


Paragraphs saved to /home/saurabh/Desktop/paragraphs.csv
   paragraph_number                                          paragraph
0                 1  Artiﬁcial Intelligence\nThird EditionPRENTICE ...
1                 2  Russell and Peter Norvig\nContributing writers...
2                 3  Manufactured in the United States of America. ...
3                 4  To obtain permission(s) to use materials from ...
4                 5  The author and publisher shall\nof, the furnis...


In [135]:
from sentence_transformers import SentenceTransformer

def add_sentence_embeddings(df, text_column='paragraph', embedding_column='sent_embd', model_name='all-MiniLM-L6-v2'):
    """
    Adds sentence embeddings to a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame containing text data.
        text_column (str): The column name in the DataFrame containing text data.
        embedding_column (str): The name of the new column to store embeddings.
        model_name (str): Name of the pre-trained sentence transformer model.
    
    Returns:
        pd.DataFrame: Modified DataFrame with an additional column for embeddings.
    """
    try:
        # Load the pre-trained sentence transformer model
        model = SentenceTransformer(model_name)
        
        # Check if the specified text column exists in the DataFrame
        if text_column not in df.columns:
            raise ValueError(f"Column '{text_column}' not found in DataFrame.")
        
        # Generate embeddings and add them to a new column
        df[embedding_column] = df[text_column].apply(lambda text: model.encode(text))
        
        return df
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [136]:
modified_df = add_sentence_embeddings(df, text_column='paragraph', embedding_column='sent_embd', model_name='all-MiniLM-L6-v2')


In [137]:
modified_df

Unnamed: 0,paragraph_number,paragraph,sent_embd
0,1,Artiﬁcial Intelligence\nThird EditionPRENTICE ...,"[-0.053213518, -0.062415596, 0.046281844, -0.0..."
1,2,Russell and Peter Norvig\nContributing writers...,"[-0.023173211, 0.036421303, 0.010537555, 0.074..."
2,3,Manufactured in the United States of America. ...,"[-0.019787418, -0.0119463345, -0.04162405, -0...."
3,4,To obtain permission(s) to use materials from ...,"[-0.07215656, 0.07856877, 0.011184857, 0.00322..."
4,5,"The author and publisher shall\nof, the furnis...","[0.0009798234, -0.06381129, 0.013659894, -0.03..."
...,...,...,...
4060,4061,"Thus,\npersistent variables are like global va...","[0.022720046, -0.0023750374, -0.09216548, 0.07..."
4061,4062,• Functions as values: Functions and procedure...,"[-0.028717004, 0.017587923, -0.03225768, -0.01..."
4062,4063,and Visual Basic (which use end). swap two var...,"[-0.0754576, 0.027873253, -0.06630628, 0.01281..."
4063,4064,"returns a generator, which in turn yields one ...","[-0.051368438, 0.054829177, -0.10170182, -0.01..."


In [138]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_top_matches(sentence, df, text_column='paragraph', embedding_column='sent_embd', top_n=5, model_name='all-MiniLM-L6-v2'):
    """
    Find the most similar paragraphs to a given sentence based on vector embeddings.
    
    Parameters:
        sentence (str): The input sentence to compare.
        df (pd.DataFrame): DataFrame containing text data and embeddings.
        text_column (str): The column name in the DataFrame containing text data.
        embedding_column (str): The column name containing vector embeddings.
        top_n (int): Number of top matches to return.
        model_name (str): Name of the pre-trained sentence transformer model.
    
    Returns:
        list of tuples: List of top matches as (paragraph, similarity_score).
    """
    try:
        # Load the pre-trained model
        model = SentenceTransformer(model_name)
        
        # Encode the input sentence to get its embedding
        sentence_embedding = model.encode(sentence)
        
        # Ensure embeddings in DataFrame are numpy arrays
        df[embedding_column] = df[embedding_column].apply(lambda x: np.array(x) if isinstance(x, list) else x)
        
        # Compute cosine similarity between the input sentence and all embeddings in the DataFrame
        similarities = cosine_similarity([sentence_embedding], list(df[embedding_column]))
        
        # Get the top N matches
        top_indices = np.argsort(similarities[0])[::-1][:top_n]
        top_matches = [(df.iloc[i][text_column], similarities[0][i]) for i in top_indices]
        
        return top_matches
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return []


In [139]:
modern_ai_questions = [
    "What is the definition of Artificial Intelligence according to ?",
    "What are the main approaches to defining Artificial Intelligence?",
    "What is the Turing Test, and how does it evaluate AI systems?",
    "What are the different types of agents in AI?",
    "Can you explain the PEAS (Performance measure, Environment, Actuators, Sensors) framework for designing agents?",
    "What is the difference between rational agents and intelligent agents?",
    "What are the main categories of AI environments?",
    "What are the characteristics of fully observable vs. partially observable environments?",
    "What is the role of problem-solving in AI?",
    "Can you explain uninformed search strategies like BFS and DFS?",
    "What are heuristic search techniques in AI?",
    "What is the A* algorithm, and why is it considered optimal and complete?",
    "What are adversarial search strategies, and where are they used?",
    "Can you explain the minimax algorithm and alpha-beta pruning?",
    "What are constraint satisfaction problems (CSPs) in AI?",
    "How are logical agents different from other types of agents?",
    "What is propositional logic, and how is it used in AI?",
    "Can you explain first-order logic and its role in AI knowledge representation?",
    "What is the difference between forward chaining and backward chaining in logical reasoning?",
    "How are Bayesian networks used in probabilistic reasoning?",
    "What is the Markov Decision Process (MDP), and how does it apply to AI?",
    "Can you explain the concept of reinforcement learning in AI?",
    "What is Q-learning, and how does it differ from traditional reinforcement learning?",
    "What is the role of deep learning in modern AI systems?",
    "Can you explain the structure of neural networks and their use in AI?",
    "What is the difference between supervised, unsupervised, and reinforcement learning?",
    "What is natural language processing, and what are its main applications?",
    "What is the importance of computer vision in AI?",
    "How do AI systems handle uncertainty in decision-making?",
    "What ethical challenges are associated with modern AI systems?"
]


In [140]:
# Example input sentence
input_sentence =      "What is natural language processing, and what are its main applications?"

# Get the top 3 matches
top_matches = get_top_matches(input_sentence, modified_df, text_column='paragraph', embedding_column='sent_embd', top_n=5)

# Print results
print("Top Matches:")
for match, score in top_matches:
    print(f"Paragraph: {match}\nSimilarity Score: {score:.4f}\n")


Top Matches:
Paragraph: Some systems attempt to analyze the source language text all the way
guage from that representation. This is difﬁcult because it involves three unsolved problems:
creating a complete knowledge representation of everything; parsing into that representation;
and generating sentences from that representation.
Similarity Score: 0.5691

Paragraph: free grammar) are useful tools for dealing with some aspects of natural language. TheBibliographical and Historical Notes
such as the CYK algorithm, which requires grammar rules to be in Chomsky Normal
• A treebank can be used to learn a grammar. It is also possible to learn a grammar from
an unparsed corpus of sentences, but this is less successful.
Similarity Score: 0.5620

Paragraph: So in a trigram model (Markov chain of
We can deﬁne the probability of a sequence of characters P(c1:N) under the trigram model
by ﬁrst factoring with the chain rule and then using the Markov assumption:
entries, and can be accurately estima