In [None]:
import pandas as pd
import PyPDF2
import re

# Function to extract the text of a PDF file, starting at a given page
def extract_pdf_text(pdf_path, start_page=6):
    """Return the text of the PDF at *pdf_path* from *start_page* onwards.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 1-based number of the first page to include.

    Returns:
        The extracted text of every included page, each followed by a
        newline. An empty string when *start_page* is past the last page.
    """
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.x no longer supports.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)

        # Clamp so that start_page <= 0 cannot become a negative page index
        first_index = max(start_page - 1, 0)

        for page_num in range(first_index, num_pages):
            # Guard against a missing result so the concatenation cannot fail
            page_text = pdf_reader.pages[page_num].extract_text() or ''
            page_texts.append(page_text + '\n')  # Separate pages by newlines
    return ''.join(page_texts)

# Function to split text into paragraphs
def split_into_paragraphs(text):
    """Split *text* at blank lines and return the pieces as a list.

    A separator is a newline, optional whitespace, then another newline.
    Pieces are returned unchanged: they are not stripped and empty pieces
    are kept.
    """
    paragraph_break = re.compile(r'\n\s*\n')
    return paragraph_break.split(text)

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score each keyword by how many of its words occur in *text*.

    Matching is case-insensitive and substring-based: a word counts as
    matched when it appears anywhere inside the lowercased text.

    Args:
        keywords: Iterable of keyword strings; each is split on whitespace.
        text: Text to search.
        stopwords: Collection of words to ignore; compared against the
            lowercased keyword words.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the fraction of the keyword's words found in the text, and
        the found words joined by single spaces.
    """
    haystack = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        words = keyword.lower().split()
        # The denominator counts every word of the keyword, stopwords
        # included, so a keyword containing stopwords cannot reach 1.0.
        word_count = len(words)

        # Non-stopword words that occur in the text, in keyword order
        found = [
            word for word in words
            if word not in stopwords and word in haystack
        ]

        if word_count > 0:
            match_scores.append(len(found) / word_count)
        else:
            match_scores.append(0.0)
        matched_keywords_list.append(" ".join(found))

    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Specify the columns for keywords and text
keyword_column = 'keywords'  # Column containing keywords in the keywords Excel file

# Extract text from the PDF file, starting from the 6th page
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path)

# Split the extracted text into paragraphs
pdf_paragraphs = split_into_paragraphs(pdf_text)

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through all paragraphs and all keywords (one output row per pair)
for paragraph in pdf_paragraphs:
    for _, keywords_row in keywords_data.iterrows():
        name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
        address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
        raw_keywords = keywords_row[keyword_column]
        # A blank Excel cell is read by pandas as NaN (a float without
        # .split()), so treat it as "no keywords" instead of crashing.
        # Keywords are assumed to be separated by whitespace.
        keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Calculate the match score %; a row without keywords scores 0
        # rather than dividing by zero.
        match_score_percentage = sum(match_scores) / len(keywords) if keywords else 0.0

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)

### Excluding headers and footers

In [None]:
import pandas as pd
import pdfplumber
import re

# Function to extract text from a PDF file while leaving out header and footer bands
def extract_pdf_text(pdf_path, start_page=6, header_height=50, footer_height=50):
    """Return the body text of the PDF at *pdf_path* from *start_page* onwards.

    Each page is cropped to the area between the header and footer bands
    before its text is extracted, so text inside those bands is left out.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 1-based number of the first page to include.
        header_height: Height of the band to drop at the top of each page,
            in the page's own coordinate units.
        footer_height: Height of the band to drop at the bottom of each page.

    Returns:
        The extracted text of every included page, each followed by a
        newline.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        # Clamp so that start_page <= 0 cannot become a negative slice index
        first_index = max(start_page - 1, 0)

        for page in pdf.pages[first_index:]:
            # Use the page's own bounding box rather than assuming a (0, 0) origin
            x0, top, x1, bottom = page.bbox
            body_top = top + header_height
            body_bottom = bottom - footer_height

            if body_bottom <= body_top:
                # The two bands cover the whole page: nothing left to extract
                page_text = ''
            else:
                # extract_text has no option for excluding regions, so crop
                # the page to the body area first.
                body = page.crop((x0, body_top, x1, body_bottom))
                page_text = body.extract_text(x_tolerance=2, y_tolerance=2) or ''

            page_texts.append(page_text + '\n')  # Separate pages by newlines

    return ''.join(page_texts)

# Function to split text into paragraphs
def split_into_paragraphs(text):
    """Split *text* at blank lines and return the pieces as a list.

    A separator is a newline, optional whitespace, then another newline.
    Pieces are returned unchanged: they are not stripped and empty pieces
    are kept.
    """
    paragraph_break = re.compile(r'\n\s*\n')
    return paragraph_break.split(text)

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score each keyword by how many of its words occur in *text*.

    Matching is case-insensitive and substring-based: a word counts as
    matched when it appears anywhere inside the lowercased text.

    Args:
        keywords: Iterable of keyword strings; each is split on whitespace.
        text: Text to search.
        stopwords: Collection of words to ignore; compared against the
            lowercased keyword words.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the fraction of the keyword's words found in the text, and
        the found words joined by single spaces.
    """
    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        keyword_parts = keyword.lower().split()
        # The denominator counts every word of the keyword, stopwords
        # included, so a keyword containing stopwords cannot reach 1.0.
        total_parts = len(keyword_parts)

        # Non-stopword words that occur in the text, in keyword order
        matched_keywords = [
            part for part in keyword_parts
            if part not in stopwords and part in text_lower
        ]

        match_score = len(matched_keywords) / total_parts if total_parts > 0 else 0.0
        match_scores.append(match_score)
        matched_keywords_list.append(" ".join(matched_keywords))

    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Specify the columns for keywords and text
keyword_column = 'keywords'  # Column containing keywords in the keywords Excel file

# Extract text from the PDF file, starting from the 6th page
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path)

# Split the extracted text into paragraphs
pdf_paragraphs = split_into_paragraphs(pdf_text)

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through all paragraphs and all keywords (one output row per pair)
for paragraph in pdf_paragraphs:
    for _, keywords_row in keywords_data.iterrows():
        name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
        address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
        raw_keywords = keywords_row[keyword_column]
        # A blank Excel cell is read by pandas as NaN (a float without
        # .split()), so treat it as "no keywords" instead of crashing.
        # Keywords are assumed to be separated by whitespace.
        keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Calculate the match score %; a row without keywords scores 0
        # rather than dividing by zero.
        match_score_percentage = sum(match_scores) / len(keywords) if keywords else 0.0

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)


In [None]:
import pandas as pd
import fitz  # PyMuPDF

# Function to extract text from a PDF file while excluding the header band
def extract_pdf_text(pdf_path):
    """Return the text of every page of *pdf_path*, leaving out the top band.

    ``clip`` restricts extraction to the given rectangle, so the rectangle
    passed must be the area to KEEP (everything below the header), not the
    header itself.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages, in page order.
    """
    header_height = 50  # Height of the band dropped at the top of each page

    page_texts = []
    # The context manager closes the document once extraction is done
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_rect = page.rect
            # Area below the header band: this is what gets extracted
            body_area = fitz.Rect(
                page_rect.x0,
                page_rect.y0 + header_height,
                page_rect.x1,
                page_rect.y1,
            )
            page_texts.append(page.get_text("text", clip=body_area))

    return ''.join(page_texts)

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score each keyword by how many of its words occur in *text*.

    Matching is case-insensitive and substring-based: a word counts as
    matched when it appears anywhere inside the lowercased text.

    Args:
        keywords: Iterable of keyword strings; each is split on whitespace.
        text: Text to search.
        stopwords: Collection of words to ignore; compared against the
            lowercased keyword words.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the fraction of the keyword's words found in the text, and
        the found words joined by single spaces.
    """
    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        keyword_parts = keyword.lower().split()
        # The denominator counts every word of the keyword, stopwords
        # included, so a keyword containing stopwords cannot reach 1.0.
        total_parts = len(keyword_parts)

        # Non-stopword words that occur in the text, in keyword order
        matched_keywords = [
            part for part in keyword_parts
            if part not in stopwords and part in text_lower
        ]

        match_score = len(matched_keywords) / total_parts if total_parts > 0 else 0.0
        match_scores.append(match_score)
        matched_keywords_list.append(" ".join(matched_keywords))

    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Specify the columns for keywords and text
keyword_column = 'keywords'  # Column containing keywords in the keywords Excel file

# Extract text from the PDF file (extract_pdf_text decides which page regions are skipped)
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path)

# Split the extracted text into stripped lines, one "paragraph" per line.
# This does not depend on the keywords row, so it is done once up front.
paragraphs = [p.strip() for p in pdf_text.split('\n')]

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through both datasets and match keywords with text
for _, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
    address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
    raw_keywords = keywords_row[keyword_column]
    # A blank Excel cell is read by pandas as NaN (a float without .split()),
    # so treat it as "no keywords" instead of crashing.
    # Keywords are assumed to be separated by whitespace.
    keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

    print(f"Matching keywords for {name} - {address}: {keywords}")

    for paragraph in paragraphs:
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Calculate the match score %; a row without keywords scores 0
        # rather than dividing by zero.
        match_score_percentage = sum(match_scores) / len(keywords) if keywords else 0.0

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)

In [None]:
import pandas as pd
import fitz  # PyMuPDF

# Function to extract text from a PDF file while excluding header and footer
def extract_pdf_text(pdf_path):
    """Return the text of every page of *pdf_path* without header and footer.

    Each page is clipped to the rectangle between a band at the top and a
    band at the bottom, so only text inside that rectangle is extracted.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Heights of the bands to leave out (adjust to the document layout)
    top_margin = 50  # Exclude the top band as header
    bottom_margin = 50  # Exclude the bottom band as footer

    page_texts = []
    # The context manager closes the document once extraction is done;
    # the original never closed it.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            page_height = page.rect.height
            page_width = page.rect.width

            # clip is the area to keep: everything between the two bands
            body_area = (0, top_margin, page_width, page_height - bottom_margin)
            page_texts.append(page.get_text("text", clip=body_area))

    return ''.join(page_texts)

# Rest of the code (matching keywords) remains the same.
# NOTE(review): calculate_match_scores_and_matched_keywords is not defined in
# this cell; presumably it comes from an earlier notebook cell — confirm
# that cell has been run.

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Specify the columns for keywords and text
keyword_column = 'keywords'  # Column containing keywords in the keywords Excel file

# Extract text from the PDF file (excluding header and footer)
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path)

# Split the extracted text into stripped lines, one "paragraph" per line.
# This does not depend on the keywords row, so it is done once up front.
paragraphs = [p.strip() for p in pdf_text.split('\n')]

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through both datasets and match keywords with text
for _, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
    address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
    raw_keywords = keywords_row[keyword_column]
    # A blank Excel cell is read by pandas as NaN (a float without .split()),
    # so treat it as "no keywords" instead of crashing.
    # Keywords are assumed to be separated by whitespace.
    keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

    print(f"Matching keywords for {name} - {address}: {keywords}")

    for paragraph in paragraphs:
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Calculate the match score %; a row without keywords scores 0
        # rather than dividing by zero.
        match_score_percentage = sum(match_scores) / len(keywords) if keywords else 0.0

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)

## To skip first line

In [None]:
import pandas as pd
import PyPDF2
import re

# Function to extract text from a PDF file, optionally dropping the first line of each page
def extract_pdf_text(pdf_path, skip_first_line=True):
    """Return the text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.
        skip_first_line: When true, the first line of each page's extracted
            text (everything up to and including the first newline) is
            dropped.

    Returns:
        The text of all pages, each followed by a newline.
    """
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.x no longer supports.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            # Guard against a missing result so the string handling cannot fail
            page_text = page.extract_text() or ''
            if skip_first_line:
                # Split the text into lines and exclude the first line
                page_text = '\n'.join(page_text.split('\n')[1:])
            page_texts.append(page_text + '\n')  # Separate pages by newlines
    return ''.join(page_texts)

# Function to split text into paragraphs
def split_into_paragraphs(text):
    """Split *text* at blank lines and return the pieces as a list.

    A separator is a newline, optional whitespace, then another newline.
    Pieces are returned unchanged: they are not stripped and empty pieces
    are kept.
    """
    paragraph_break = re.compile(r'\n\s*\n')
    return paragraph_break.split(text)

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score each keyword by how many of its words occur in *text*.

    Matching is case-insensitive and substring-based: a word counts as
    matched when it appears anywhere inside the lowercased text.

    Args:
        keywords: Iterable of keyword strings; each is split on whitespace.
        text: Text to search.
        stopwords: Collection of words to ignore; compared against the
            lowercased keyword words.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the fraction of the keyword's words found in the text, and
        the found words joined by single spaces.
    """
    haystack = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        words = keyword.lower().split()
        # The denominator counts every word of the keyword, stopwords
        # included, so a keyword containing stopwords cannot reach 1.0.
        word_count = len(words)

        # Non-stopword words that occur in the text, in keyword order
        found = [
            word for word in words
            if word not in stopwords and word in haystack
        ]

        if word_count > 0:
            match_scores.append(len(found) / word_count)
        else:
            match_scores.append(0.0)
        matched_keywords_list.append(" ".join(found))

    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Specify the columns for keywords and text
keyword_column = 'keywords'  # Column containing keywords in the keywords Excel file

# Extract text from the PDF file, skipping the first line on each page
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path, skip_first_line=True)

# Split the extracted text into paragraphs
pdf_paragraphs = split_into_paragraphs(pdf_text)

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through both datasets and match keywords with text
for _, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
    address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
    raw_keywords = keywords_row[keyword_column]
    # A blank Excel cell is read by pandas as NaN (a float without .split()),
    # so treat it as "no keywords" instead of crashing.
    # Keywords are assumed to be separated by whitespace.
    keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

    print(f"Matching keywords for {name} - {address}: {keywords}")

    for paragraph in pdf_paragraphs:
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Calculate the match score %; a row without keywords scores 0
        # rather than dividing by zero.
        match_score_percentage = sum(match_scores) / len(keywords) if keywords else 0.0

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)


## To skip last line

In [None]:
import pandas as pd
import PyPDF2
import re

# Function to extract text from a PDF file, optionally dropping the last line of each page
def extract_pdf_text(pdf_path, skip_last_line=True):
    """Return the text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.
        skip_last_line: When true, the last line of each page's extracted
            text (everything after the final newline) is dropped.

    Returns:
        The text of all pages, each followed by a newline.
    """
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.x no longer supports.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            # Guard against a missing result so the string handling cannot fail
            page_text = page.extract_text() or ''
            if skip_last_line:
                # str.split always returns at least one element, so the
                # slice is safe without an emptiness check: a page with a
                # single line simply becomes ''.
                page_text = '\n'.join(page_text.split('\n')[:-1])
            page_texts.append(page_text + '\n')  # Separate pages by newlines
    return ''.join(page_texts)

# Function to split text into paragraphs
def split_into_paragraphs(text):
    """Split *text* at blank lines and return the pieces as a list.

    A separator is a newline, optional whitespace, then another newline.
    Pieces are returned unchanged: they are not stripped and empty pieces
    are kept.
    """
    paragraph_break = re.compile(r'\n\s*\n')
    return paragraph_break.split(text)

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score each keyword by how many of its words occur in *text*.

    Matching is case-insensitive and substring-based: a word counts as
    matched when it appears anywhere inside the lowercased text.

    Args:
        keywords: Iterable of keyword strings; each is split on whitespace.
        text: Text to search.
        stopwords: Collection of words to ignore; compared against the
            lowercased keyword words.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the fraction of the keyword's words found in the text, and
        the found words joined by single spaces.
    """
    haystack = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        words = keyword.lower().split()
        # The denominator counts every word of the keyword, stopwords
        # included, so a keyword containing stopwords cannot reach 1.0.
        word_count = len(words)

        # Non-stopword words that occur in the text, in keyword order
        found = [
            word for word in words
            if word not in stopwords and word in haystack
        ]

        if word_count > 0:
            match_scores.append(len(found) / word_count)
        else:
            match_scores.append(0.0)
        matched_keywords_list.append(" ".join(found))

    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Specify the columns for keywords and text
keyword_column = 'keywords'  # Column containing keywords in the keywords Excel file

# Extract text from the PDF file, skipping the last line on each page
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path, skip_last_line=True)

# Split the extracted text into paragraphs
pdf_paragraphs = split_into_paragraphs(pdf_text)

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through both datasets and match keywords with text
for _, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
    address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
    raw_keywords = keywords_row[keyword_column]
    # A blank Excel cell is read by pandas as NaN (a float without .split()),
    # so treat it as "no keywords" instead of crashing.
    # Keywords are assumed to be separated by whitespace.
    keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

    print(f"Matching keywords for {name} - {address}: {keywords}")

    for paragraph in pdf_paragraphs:
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Calculate the match score %; a row without keywords scores 0
        # rather than dividing by zero.
        match_score_percentage = sum(match_scores) / len(keywords) if keywords else 0.0

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)


## To skip first and last line

In [None]:
import pandas as pd
import PyPDF2
import re

# Function to extract text from a PDF file, dropping leading and trailing lines of each page
def extract_pdf_text(pdf_path, skip_first_lines=1, skip_last_lines=1):
    """Return the text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.
        skip_first_lines: Number of lines to drop from the start of each
            page's extracted text.
        skip_last_lines: Number of lines to drop from the end of each
            page's extracted text; 0 keeps all trailing lines.

    Returns:
        The remaining text of all pages, each followed by a newline.
    """
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.x no longer supports.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            # Guard against a missing result so the string handling cannot fail
            page_lines = (page.extract_text() or '').split('\n')

            # Use an explicit end index: page_lines[a:-0] is page_lines[a:0],
            # which would drop the whole page when skip_last_lines is 0.
            end_index = max(len(page_lines) - skip_last_lines, 0)
            page_lines = page_lines[skip_first_lines:end_index]

            page_texts.append('\n'.join(page_lines) + '\n')  # Separate pages by newlines
    return ''.join(page_texts)

# Function to split text into paragraphs
def split_into_paragraphs(text):
    """Split *text* at blank lines and return the pieces as a list.

    A separator is a newline, optional whitespace, then another newline.
    Pieces are returned unchanged: they are not stripped and empty pieces
    are kept.
    """
    paragraph_break = re.compile(r'\n\s*\n')
    return paragraph_break.split(text)

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score each keyword by how many of its words occur in *text*.

    Matching is case-insensitive and substring-based: a word counts as
    matched when it appears anywhere inside the lowercased text.

    Args:
        keywords: Iterable of keyword strings; each is split on whitespace.
        text: Text to search.
        stopwords: Collection of words to ignore; compared against the
            lowercased keyword words.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the fraction of the keyword's words found in the text, and
        the found words joined by single spaces.
    """
    haystack = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        words = keyword.lower().split()
        # The denominator counts every word of the keyword, stopwords
        # included, so a keyword containing stopwords cannot reach 1.0.
        word_count = len(words)

        # Non-stopword words that occur in the text, in keyword order
        found = [
            word for word in words
            if word not in stopwords and word in haystack
        ]

        if word_count > 0:
            match_scores.append(len(found) / word_count)
        else:
            match_scores.append(0.0)
        matched_keywords_list.append(" ".join(found))

    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Specify the columns for keywords and text
keyword_column = 'keywords'  # Column containing keywords in the keywords Excel file

# Extract text from the PDF file, skipping the first and last lines of each page
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path, skip_first_lines=1, skip_last_lines=1)

# Split the extracted text into paragraphs
pdf_paragraphs = split_into_paragraphs(pdf_text)

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through both datasets and match keywords with text
for _, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
    address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
    raw_keywords = keywords_row[keyword_column]
    # A blank Excel cell is read by pandas as NaN (a float without .split()),
    # so treat it as "no keywords" instead of crashing.
    # Keywords are assumed to be separated by whitespace.
    keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

    print(f"Matching keywords for {name} - {address}: {keywords}")

    for paragraph in pdf_paragraphs:
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Calculate the match score %; a row without keywords scores 0
        # rather than dividing by zero.
        match_score_percentage = sum(match_scores) / len(keywords) if keywords else 0.0

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)

## To skip first 5 pages and first and last line

In [None]:
import pandas as pd
import PyPDF2
import re

# Function to extract text from a PDF file, skipping leading pages and the first and last line of each page
def extract_pdf_text(pdf_path, skip_pages=5):
    """Return the text of the PDF at *pdf_path* after the first *skip_pages* pages.

    The first and the last line of each included page's extracted text are
    dropped.

    Args:
        pdf_path: Path of the PDF file to read.
        skip_pages: Number of pages to skip at the start of the document.

    Returns:
        The remaining text of every included page, each followed by a
        newline. An empty string when *skip_pages* covers the whole document.
    """
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.x no longer supports.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)

        # Clamp so that a negative skip_pages cannot become a negative page index
        first_index = max(skip_pages, 0)

        for page_num in range(first_index, num_pages):
            # Guard against a missing result so the string handling cannot fail
            page_text = pdf_reader.pages[page_num].extract_text() or ''

            # Skip the first and last line of each page
            page_lines = page_text.split('\n')[1:-1]

            page_texts.append('\n'.join(page_lines) + '\n')  # Separate pages by newlines
    return ''.join(page_texts)

# Function to split text into paragraphs
def split_into_paragraphs(text):
    """Split *text* at blank lines and return the pieces as a list.

    A separator is a newline, optional whitespace, then another newline.
    Pieces are returned unchanged: they are not stripped and empty pieces
    are kept.
    """
    paragraph_break = re.compile(r'\n\s*\n')
    return paragraph_break.split(text)

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score each keyword by how many of its words occur in *text*.

    Matching is case-insensitive and substring-based: a word counts as
    matched when it appears anywhere inside the lowercased text.

    Args:
        keywords: Iterable of keyword strings; each is split on whitespace.
        text: Text to search.
        stopwords: Collection of words to ignore; compared against the
            lowercased keyword words.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the fraction of the keyword's words found in the text, and
        the found words joined by single spaces.
    """
    haystack = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        words = keyword.lower().split()
        # The denominator counts every word of the keyword, stopwords
        # included, so a keyword containing stopwords cannot reach 1.0.
        word_count = len(words)

        # Non-stopword words that occur in the text, in keyword order
        found = [
            word for word in words
            if word not in stopwords and word in haystack
        ]

        if word_count > 0:
            match_scores.append(len(found) / word_count)
        else:
            match_scores.append(0.0)
        matched_keywords_list.append(" ".join(found))

    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Specify the columns for keywords and text
keyword_column = 'keywords'  # Column containing keywords in the keywords Excel file

# Extract text from the PDF file, skipping the first 5 pages and the first and last line of each page
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path, skip_pages=5)

# Split the extracted text into paragraphs
pdf_paragraphs = split_into_paragraphs(pdf_text)

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through both datasets and match keywords with text
for _, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
    address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
    raw_keywords = keywords_row[keyword_column]
    # A blank Excel cell is read by pandas as NaN (a float without .split()),
    # so treat it as "no keywords" instead of crashing.
    # Keywords are assumed to be separated by whitespace.
    keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

    print(f"Matching keywords for {name} - {address}: {keywords}")

    for paragraph in pdf_paragraphs:
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Calculate the match score %; a row without keywords scores 0
        # rather than dividing by zero.
        match_score_percentage = sum(match_scores) / len(keywords) if keywords else 0.0

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)

In [None]:
# Function to split lines into paragraphs
def split_into_paragraphs(lines):
    """Group consecutive non-blank *lines* into newline-joined paragraphs.

    A line that is empty or whitespace-only ends the current paragraph and
    is itself discarded; runs of such lines produce no empty paragraphs.
    """
    paragraphs = []
    pending = []

    def flush():
        # Emit the collected lines as one paragraph, if there are any
        if pending:
            paragraphs.append('\n'.join(pending))
            pending.clear()

    for line in lines:
        if not line.strip():
            flush()
            continue
        pending.append(line)

    # Emit the final paragraph when the input does not end with a blank line
    flush()
    return paragraphs