In [None]:
import pandas as pd
import PyPDF2

def extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5):
    """Extract text from a PDF, dropping leading pages and per-page edge lines.

    Args:
        pdf_path: Path of the PDF file to read.
        skip_first_lines: Number of lines removed from the top of every page.
        skip_last_lines: Number of lines removed from the bottom of every
            page; 0 keeps the last line.
        skip_first_pages: Number of pages at the start of the document that
            are skipped entirely.

    Returns:
        A ``(pdf_text, page_numbers)`` tuple. ``pdf_text`` is the kept lines
        of every processed page, each page terminated by a newline.
        ``page_numbers`` holds one zero-based page index per
        newline-terminated line of ``pdf_text``, so entry ``i`` is the page
        that line ``i`` of the text came from.
    """
    page_chunks = []
    page_numbers = []

    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which newer PyPDF2 releases no longer provide.
        pages = PyPDF2.PdfReader(pdf_file).pages
        for page_num in range(skip_first_pages, len(pages)):
            # Defensive: fall back to an empty string if nothing was extracted.
            page_text = pages[page_num].extract_text() or ''
            page_lines = page_text.split('\n')

            # Use an explicit end index: a "-0" end would mean "[start:0]"
            # and silently discard the whole page.
            end_index = max(len(page_lines) - skip_last_lines, 0)
            page_lines = page_lines[skip_first_lines:end_index]

            page_chunks.append('\n'.join(page_lines) + '\n')

            # A page with no kept lines still contributes one (blank)
            # separator line to the text, so it gets one entry as well;
            # this keeps page_numbers aligned with the lines of pdf_text.
            page_numbers.extend([page_num] * max(len(page_lines), 1))

    return ''.join(page_chunks), page_numbers

def split_into_paragraphs(lines):
    """Group consecutive non-blank lines into newline-joined paragraphs.

    Lines that are empty or whitespace-only act as paragraph separators and
    are not included in the result.

    Args:
        lines: Iterable of text lines.

    Returns:
        List of paragraph strings, in input order.
    """
    paragraphs = []
    pending = []

    def flush():
        # Emit the lines collected so far as one paragraph, if there are any.
        if pending:
            paragraphs.append('\n'.join(pending))
            pending.clear()

    for text_line in lines:
        if not text_line.strip():
            flush()
            continue
        pending.append(text_line)

    # The input may end without a trailing blank line.
    flush()
    return paragraphs

def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score how much of each keyword occurs in ``text``.

    Matching is case-insensitive and substring-based: a keyword part counts
    as matched when it occurs anywhere in the lower-cased text, including
    inside a longer word.

    Args:
        keywords: Iterable of keyword strings; each may hold several
            whitespace-separated parts.
        text: Text to search. A value that is not a ``str`` (for example a
            NaN read from an empty spreadsheet cell) cannot be matched.
        stopwords: Collection of lower-case words ignored when they appear
            as keyword parts.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple. For string text
        both lists have one entry per keyword: the fraction (0.0 to 1.0) of
        the keyword's non-stopword parts found in the text, and those found
        parts joined by single spaces. For non-string text both lists are
        empty.
    """
    if not isinstance(text, str):
        return [], []

    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        # Stopwords are removed before scoring so they neither match nor
        # count against the keyword.
        keyword_parts = [part for part in keyword.lower().split() if part not in stopwords]
        matched_parts = [part for part in keyword_parts if part in text_lower]

        # A keyword made up only of stopwords has nothing to match.
        match_scores.append(len(matched_parts) / len(keyword_parts) if keyword_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_parts))

    return match_scores, matched_keywords_list

# Words ignored when they appear as parts of a keyword
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Column containing keywords in the keywords Excel file
keyword_column = 'keywords'

# Extract text from the PDF file, skipping the first 5 pages and the first and last lines of each page
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text, page_numbers = extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5)

# Save the extracted text to a file
with open('extracted_text.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(pdf_text)

# Split the extracted text into paragraphs
pdf_lines = pdf_text.split('\n')
pdf_paragraphs = split_into_paragraphs(pdf_lines)

# page_numbers has one entry per extracted line, not per paragraph, so record
# the page of the first line of every paragraph. A paragraph starts at each
# non-blank line that follows a blank line (or the start of the text), which
# mirrors how split_into_paragraphs groups lines.
paragraph_pages = []
previous_line_blank = True
for line_index, line in enumerate(pdf_lines):
    line_blank = not line.strip()
    if previous_line_blank and not line_blank:
        # Clamp defensively in case page_numbers is shorter than the line list.
        page_index = page_numbers[min(line_index, len(page_numbers) - 1)]
        paragraph_pages.append(page_index + 1)  # zero-based page index -> 1-based page number
    previous_line_blank = line_blank

# One output row per (keywords row, paragraph) pair
matched_data = []

for _, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
    address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

    # Empty cells are read as NaN (a float), so only split real strings
    raw_keywords = keywords_row[keyword_column]
    keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # keywords are separated by whitespace
    if not keywords:
        # Nothing to match, and the percentage below would divide by zero
        print(f"Skipping {name} - {address}: no keywords")
        continue

    print(f"Matching keywords for {name} - {address}: {keywords}")

    for paragraph, page_num in zip(pdf_paragraphs, paragraph_pages):
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Average score over all keywords, as a whole-number percentage (truncated)
        match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

        matched_data.append({
            'name': name,
            'address': address,
            'keywords': keywords_row[keyword_column],
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': match_score_percentage,
            'page_number': page_num  # 1-based page on which the paragraph starts
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)


In [None]:
import pandas as pd
import PyPDF2

def extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5):
    """Extract text from a PDF, dropping leading pages and per-page edge lines.

    Args:
        pdf_path: Path of the PDF file to read.
        skip_first_lines: Number of lines removed from the top of every page.
        skip_last_lines: Number of lines removed from the bottom of every
            page; 0 keeps the last line.
        skip_first_pages: Number of pages at the start of the document that
            are skipped entirely.

    Returns:
        A ``(pdf_text, page_numbers)`` tuple. ``pdf_text`` is the kept lines
        of every processed page, each page terminated by a newline.
        ``page_numbers`` holds one zero-based page index per
        newline-terminated line of ``pdf_text``, so entry ``i`` is the page
        that line ``i`` of the text came from.
    """
    page_chunks = []
    page_numbers = []

    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which newer PyPDF2 releases no longer provide.
        pages = PyPDF2.PdfReader(pdf_file).pages
        for page_num in range(skip_first_pages, len(pages)):
            # Defensive: fall back to an empty string if nothing was extracted.
            page_text = pages[page_num].extract_text() or ''
            page_lines = page_text.split('\n')

            # Use an explicit end index: a "-0" end would mean "[start:0]"
            # and silently discard the whole page.
            end_index = max(len(page_lines) - skip_last_lines, 0)
            page_lines = page_lines[skip_first_lines:end_index]

            page_chunks.append('\n'.join(page_lines) + '\n')

            # A page with no kept lines still contributes one (blank)
            # separator line to the text, so it gets one entry as well;
            # this keeps page_numbers aligned with the lines of pdf_text.
            page_numbers.extend([page_num] * max(len(page_lines), 1))

    return ''.join(page_chunks), page_numbers

def split_into_paragraphs(lines):
    """Group consecutive non-blank lines into newline-joined paragraphs.

    Lines that are empty or whitespace-only act as paragraph separators and
    are not included in the result.

    Args:
        lines: Iterable of text lines.

    Returns:
        List of paragraph strings, in input order.
    """
    paragraphs = []
    pending = []

    def flush():
        # Emit the lines collected so far as one paragraph, if there are any.
        if pending:
            paragraphs.append('\n'.join(pending))
            pending.clear()

    for text_line in lines:
        if not text_line.strip():
            flush()
            continue
        pending.append(text_line)

    # The input may end without a trailing blank line.
    flush()
    return paragraphs

# Replace 'path_to_your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = 'path_to_your_pdf_file.pdf'

# Extract text from the PDF file, skipping the first 5 pages and the first and last lines of each page
pdf_text, page_numbers = extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5)

# Split the extracted text into paragraphs
pdf_lines = pdf_text.split('\n')
pdf_paragraphs = split_into_paragraphs(pdf_lines)

# page_numbers has one entry per extracted line while pdf_paragraphs has one
# entry per paragraph; the two cannot share a DataFrame as they are. Record
# the page of the first line of every paragraph instead. A paragraph starts at
# each non-blank line that follows a blank line (or the start of the text),
# which mirrors how split_into_paragraphs groups lines.
paragraph_pages = []
previous_line_blank = True
for line_index, line in enumerate(pdf_lines):
    line_blank = not line.strip()
    if previous_line_blank and not line_blank:
        # Clamp defensively in case page_numbers is shorter than the line list.
        page_index = page_numbers[min(line_index, len(page_numbers) - 1)]
        paragraph_pages.append(page_index + 1)  # zero-based page index -> 1-based page number
    previous_line_blank = line_blank

# Create a DataFrame to store the extracted PDF data (one row per paragraph)
pdf_data = pd.DataFrame({
    'Original Paragraph': pdf_paragraphs,
    'Page Number': paragraph_pages
})

# Save the DataFrame to an Excel file
pdf_data.to_excel('extracted_pdf_data.xlsx', index=False)

# Display a message indicating that the data has been saved
print("Extracted PDF data has been saved to 'extracted_pdf_data.xlsx'")

In [None]:
import pandas as pd
import PyPDF2

def split_into_paragraphs(lines):
    """Group consecutive non-blank lines into newline-joined paragraphs.

    Lines that are empty or whitespace-only act as paragraph separators and
    are not included in the result.

    Args:
        lines: Iterable of text lines.

    Returns:
        List of paragraph strings, in input order.
    """
    paragraphs = []
    pending = []

    def flush():
        # Emit the lines collected so far as one paragraph, if there are any.
        if pending:
            paragraphs.append('\n'.join(pending))
            pending.clear()

    for text_line in lines:
        if not text_line.strip():
            flush()
            continue
        pending.append(text_line)

    # The input may end without a trailing blank line.
    flush()
    return paragraphs

def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score how much of each keyword occurs in ``text``.

    Matching is case-insensitive and substring-based: a keyword part counts
    as matched when it occurs anywhere in the lower-cased text, including
    inside a longer word.

    Args:
        keywords: Iterable of keyword strings; each may hold several
            whitespace-separated parts.
        text: Text to search. A value that is not a ``str`` (for example a
            NaN read from an empty spreadsheet cell) cannot be matched.
        stopwords: Collection of lower-case words ignored when they appear
            as keyword parts.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple. For string text
        both lists have one entry per keyword: the fraction (0.0 to 1.0) of
        the keyword's non-stopword parts found in the text, and those found
        parts joined by single spaces. For non-string text both lists are
        empty.
    """
    if not isinstance(text, str):
        return [], []

    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        # Stopwords are removed before scoring so they neither match nor
        # count against the keyword.
        keyword_parts = [part for part in keyword.lower().split() if part not in stopwords]
        matched_parts = [part for part in keyword_parts if part in text_lower]

        # A keyword made up only of stopwords has nothing to match.
        match_scores.append(len(matched_parts) / len(keyword_parts) if keyword_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_parts))

    return match_scores, matched_keywords_list

# Words ignored when they appear as parts of a keyword
stopwords = ["is", "and", "are"]

# Excel workbook holding the keywords
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'

# Column containing keywords in the keywords Excel file
keyword_column = 'keywords'

# Replace 'path_to_your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = 'path_to_your_pdf_file.pdf'

# The PDF is the same for every sheet, so extract it once. The extractor
# opens the file itself and therefore takes the path, not an open handle.
pdf_text, page_numbers = extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5)

# Split the extracted text into paragraphs
pdf_lines = pdf_text.split('\n')
pdf_paragraphs = split_into_paragraphs(pdf_lines)

# page_numbers has one entry per extracted line, not per paragraph, so record
# the page of the first line of every paragraph. A paragraph starts at each
# non-blank line that follows a blank line (or the start of the text), which
# mirrors how split_into_paragraphs groups lines.
paragraph_pages = []
previous_line_blank = True
for line_index, line in enumerate(pdf_lines):
    line_blank = not line.strip()
    if previous_line_blank and not line_blank:
        # Clamp defensively in case page_numbers is shorter than the line list.
        page_index = page_numbers[min(line_index, len(page_numbers) - 1)]
        paragraph_pages.append(page_index + 1)  # zero-based page index -> 1-based page number
    previous_line_blank = line_blank

# NOTE(review): the sheets are taken from the keywords workbook, the only
# Excel file this cell refers to; every sheet is assumed to carry the
# 'name', 'address' and keyword columns - confirm against the real data.
with pd.ExcelFile(keywords_excel_path) as keywords_workbook:
    for sheet_name in keywords_workbook.sheet_names:

        # Read the keyword rows of the current sheet
        keywords_data = pd.read_excel(keywords_workbook, sheet_name=sheet_name)

        # One output row per (keywords row, paragraph) pair
        matched_data = []

        for _, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']
            address = keywords_row['address']

            # Empty cells are read as NaN (a float), so only split real strings
            raw_keywords = keywords_row[keyword_column]
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # keywords are separated by whitespace
            if not keywords:
                # Nothing to match, and the percentage below would divide by zero
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for paragraph, page_num in zip(pdf_paragraphs, paragraph_pages):
                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

                # Average score over all keywords, as a whole-number percentage (truncated)
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row[keyword_column],
                    'original_paragraph': paragraph,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                    'page_number': page_num  # 1-based page on which the paragraph starts
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel file for the current sheet
        output_file_path = f'matched_output_{sheet_name}.xlsx'
        matched_df.to_excel(output_file_path, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_file_path}'")

In [None]:
import pandas as pd

def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score how much of each keyword occurs in ``text``.

    Matching is case-insensitive and substring-based: a keyword part counts
    as matched when it occurs anywhere in the lower-cased text, including
    inside a longer word.

    Args:
        keywords: Iterable of keyword strings; each may hold several
            whitespace-separated parts.
        text: Text to search. A value that is not a ``str`` (for example a
            NaN read from an empty spreadsheet cell) cannot be matched.
        stopwords: Collection of lower-case words ignored when they appear
            as keyword parts.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple. For string text
        both lists have one entry per keyword: the fraction (0.0 to 1.0) of
        the keyword's non-stopword parts found in the text, and those found
        parts joined by single spaces. For non-string text both lists are
        empty.
    """
    if not isinstance(text, str):
        return [], []

    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        # Stopwords are removed before scoring so they neither match nor
        # count against the keyword.
        keyword_parts = [part for part in keyword.lower().split() if part not in stopwords]
        matched_parts = [part for part in keyword_parts if part in text_lower]

        # A keyword made up only of stopwords has nothing to match.
        match_scores.append(len(matched_parts) / len(keyword_parts) if keyword_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_parts))

    return match_scores, matched_keywords_list

# Words ignored when they appear as parts of a keyword
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the workbook once; the context manager closes it when the loop ends
with pd.ExcelFile(excel_file_path) as workbook:
    for sheet_name in workbook.sheet_names:

        # Read the data from the current sheet
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # One output row per (keywords row, sheet row) pair
        matched_data = []

        for _, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Empty cells are read as NaN (a float), so only split real strings
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # keywords are separated by whitespace
            if not keywords:
                # Nothing to match, and the percentage below would divide by zero
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for _, row in sheet_data.iterrows():
                # A sheet without the text column, or a cell that does not
                # hold a string (e.g. NaN for an empty cell), is matched as
                # empty text so the row still appears in the output.
                text = row.get(text_column, '')
                if not isinstance(text, str):
                    text = ''

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Average score over all keywords, as a whole-number percentage (truncated)
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel file for the current sheet
        output_file_path = f'matched_output_{sheet_name}.xlsx'
        matched_df.to_excel(output_file_path, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_file_path}'")

In [None]:
import pandas as pd

def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score how much of each keyword occurs in ``text``.

    Matching is case-insensitive and substring-based: a keyword part counts
    as matched when it occurs anywhere in the lower-cased text, including
    inside a longer word.

    Args:
        keywords: Iterable of keyword strings; each may hold several
            whitespace-separated parts.
        text: Text to search. A value that is not a ``str`` (for example a
            NaN read from an empty spreadsheet cell) cannot be matched.
        stopwords: Collection of lower-case words ignored when they appear
            as keyword parts.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple. For string text
        both lists have one entry per keyword: the fraction (0.0 to 1.0) of
        the keyword's non-stopword parts found in the text, and those found
        parts joined by single spaces. For non-string text both lists are
        empty.
    """
    if not isinstance(text, str):
        return [], []

    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        # Stopwords are removed before scoring so they neither match nor
        # count against the keyword.
        keyword_parts = [part for part in keyword.lower().split() if part not in stopwords]
        matched_parts = [part for part in keyword_parts if part in text_lower]

        # A keyword made up only of stopwords has nothing to match.
        match_scores.append(len(matched_parts) / len(keyword_parts) if keyword_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_parts))

    return match_scores, matched_keywords_list

# Words ignored when they appear as parts of a keyword
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the workbook once; the context manager closes it when the loop ends
with pd.ExcelFile(excel_file_path) as workbook:
    for sheet_name in workbook.sheet_names:

        # Read the data from the current sheet
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # One output row per (keywords row, sheet row) pair
        matched_data = []

        for _, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Empty cells are read as NaN (a float), so only split real strings
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # keywords are separated by whitespace
            if not keywords:
                # Nothing to match, and the percentage below would divide by zero
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for _, row in sheet_data.iterrows():
                # A sheet without the text column, or a cell that does not
                # hold a string (e.g. NaN for an empty cell), is matched as
                # empty text so the row still appears in the output.
                text = row.get(text_column, '')
                if not isinstance(text, str):
                    text = ''

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Average score over all keywords, as a whole-number percentage (truncated)
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel file for the current sheet
        output_file_path = f'matched_output_{sheet_name}.xlsx'
        matched_df.to_excel(output_file_path, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_file_path}'")


In [None]:
import pandas as pd

def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score how much of each keyword occurs in ``text``.

    Matching is case-insensitive and substring-based: a keyword part counts
    as matched when it occurs anywhere in the lower-cased text, including
    inside a longer word.

    Args:
        keywords: Iterable of keyword strings; each may hold several
            whitespace-separated parts.
        text: Text to search. A value that is not a ``str`` (for example a
            NaN read from an empty spreadsheet cell) cannot be matched.
        stopwords: Collection of lower-case words ignored when they appear
            as keyword parts.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple. For string text
        both lists have one entry per keyword: the fraction (0.0 to 1.0) of
        the keyword's non-stopword parts found in the text, and those found
        parts joined by single spaces. For non-string text both lists are
        empty.
    """
    if not isinstance(text, str):
        return [], []

    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        # Stopwords are removed before scoring so they neither match nor
        # count against the keyword.
        keyword_parts = [part for part in keyword.lower().split() if part not in stopwords]
        matched_parts = [part for part in keyword_parts if part in text_lower]

        # A keyword made up only of stopwords has nothing to match.
        match_scores.append(len(matched_parts) / len(keyword_parts) if keyword_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_parts))

    return match_scores, matched_keywords_list

# Words ignored when they appear as parts of a keyword
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Output workbook: the matches for every input sheet go to a sheet of the same name
output_excel_path = 'matched_output.xlsx'

# The input workbook is opened first, so an unreadable input fails before the
# writer is created; both are closed when the block exits.
with pd.ExcelFile(excel_file_path) as workbook, pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # One output row per (keywords row, sheet row) pair
        matched_data = []

        for _, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Empty cells are read as NaN (a float), so only split real strings
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # keywords are separated by whitespace
            if not keywords:
                # Nothing to match, and the percentage below would divide by zero
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for _, row in sheet_data.iterrows():
                # .get yields None when the sheet has no such column; the
                # matcher returns empty results for any non-string text.
                text = row.get(text_column)

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Average score over all keywords, as a whole-number percentage (truncated)
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")

In [None]:
import pandas as pd

def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score how much of each keyword occurs in ``text``.

    Matching is case-insensitive and substring-based: a keyword part counts
    as matched when it occurs anywhere in the lower-cased text, including
    inside a longer word.

    Args:
        keywords: Iterable of keyword strings; each may hold several
            whitespace-separated parts.
        text: Text to search. A value that is not a ``str`` (for example a
            NaN read from an empty spreadsheet cell) cannot be matched.
        stopwords: Collection of lower-case words ignored when they appear
            as keyword parts.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple. For string text
        both lists have one entry per keyword: the fraction (0.0 to 1.0) of
        the keyword's non-stopword parts found in the text, and those found
        parts joined by single spaces. For non-string text both lists are
        empty.
    """
    if not isinstance(text, str):
        return [], []

    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        # Stopwords are removed before scoring so they neither match nor
        # count against the keyword.
        keyword_parts = [part for part in keyword.lower().split() if part not in stopwords]
        matched_parts = [part for part in keyword_parts if part in text_lower]

        # A keyword made up only of stopwords has nothing to match.
        match_scores.append(len(matched_parts) / len(keyword_parts) if keyword_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_parts))

    return match_scores, matched_keywords_list

# Words ignored when they appear as parts of a keyword
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Output workbook: the matches for every input sheet go to a sheet of the same name
output_excel_path = 'matched_output.xlsx'

# The input workbook is opened first, so an unreadable input fails before the
# writer is created; both are closed when the block exits.
with pd.ExcelFile(excel_file_path) as workbook, pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # One output row per (keywords row, sheet row) pair
        matched_data = []

        for _, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Empty cells are read as NaN (a float), so only split real strings
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # keywords are separated by whitespace
            if not keywords:
                # Nothing to match, and the percentage below would divide by zero
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for _, row in sheet_data.iterrows():
                # .get yields None when the sheet has no such column; the
                # matcher returns empty results for any non-string text.
                text = row.get(text_column)

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Average score over all keywords, as a whole-number percentage (truncated)
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")