In [None]:
import pandas as pd
import PyPDF2

# Function to extract text from a PDF file and record the page index of every retained line
def extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5):
    """Extract text from a PDF, trimming lines from the top and bottom of each page.

    Args:
        pdf_path: Path of the PDF file to read.
        skip_first_lines: Number of lines dropped from the top of every page.
        skip_last_lines: Number of lines dropped from the bottom of every page;
            0 keeps the bottom of the page intact.
        skip_first_pages: Number of leading pages that are skipped entirely.

    Returns:
        A ``(pdf_text, page_numbers)`` tuple. ``pdf_text`` contains the
        retained lines of every processed page, each page terminated by a
        newline. ``page_numbers`` holds one 0-based page index per retained
        line (not per paragraph).
    """
    page_chunks = []
    page_numbers = []

    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.x no longer provides.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(skip_first_pages, len(pdf_reader.pages)):
            page_text = pdf_reader.pages[page_num].extract_text() or ''
            page_lines = page_text.split('\n')

            # Use an explicit end index: a "-skip_last_lines" slice bound
            # would turn into "-0" and drop the whole page when it is 0.
            last_kept = max(len(page_lines) - skip_last_lines, 0)
            page_lines = page_lines[skip_first_lines:last_kept]

            # Every page ends with a newline, even when no line was retained
            page_chunks.append('\n'.join(page_lines) + '\n')

            # One page index per retained line of this page
            page_numbers.extend([page_num] * len(page_lines))

    return ''.join(page_chunks), page_numbers

# Function to split lines into paragraphs
def split_into_paragraphs(lines):
    """Group consecutive non-blank lines into newline-joined paragraphs.

    Lines that are empty or whitespace-only act as separators and are never
    part of a paragraph. Returns the list of paragraph strings in order.
    """
    paragraphs, buffer = [], []
    # A trailing blank sentinel flushes the final paragraph inside the loop
    for line in [*lines, '']:
        if line.strip():
            buffer.append(line)
        elif buffer:
            paragraphs.append('\n'.join(buffer))
            buffer = []
    return paragraphs

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score every keyword by how many of its words occur in ``text``.

    Matching is case-insensitive and substring based. Each keyword is split
    on whitespace; words listed in ``stopwords`` are never counted as
    matches, but they still count towards the keyword's total word count.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the matched fraction (0.0 for a keyword without words) and
        the space-joined matched words.
    """
    haystack = text.lower()
    scores = []
    hits_per_keyword = []

    for raw_keyword in keywords:
        words = raw_keyword.lower().split()
        word_count = len(words)  # taken before stopwords are ignored

        # Non-stopword words that appear somewhere in the text
        hits = [word for word in words if word not in stopwords and word in haystack]

        scores.append(len(hits) / word_count if word_count else 0.0)
        hits_per_keyword.append(" ".join(hits))

    return scores, hits_per_keyword

# Stopwords are ignored when keyword words are matched against the text
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Column containing keywords in the keywords Excel file
keyword_column = 'keywords'

# Extract text from the PDF file, including page numbers, and skip the first 5 pages and the first and last lines of each page
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text, page_numbers = extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5)

# Save the extracted text to a file
with open('extracted_text.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(pdf_text)

# Split the extracted text into paragraphs
pdf_lines = pdf_text.split('\n')
pdf_paragraphs = split_into_paragraphs(pdf_lines)

# Page of each paragraph = page of its first line. page_numbers has one
# 0-based page index per extracted line, so it is looked up by line position.
# The index is clamped because pdf_lines can contain a few more (blank)
# entries than page_numbers; near pages without retained lines the lookup is
# therefore approximate.
paragraph_pages = []
previous_line_blank = True
for line_index, line in enumerate(pdf_lines):
    line_blank = not line.strip()
    if previous_line_blank and not line_blank:
        paragraph_pages.append(page_numbers[min(line_index, len(page_numbers) - 1)])
    previous_line_blank = line_blank

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through both datasets and match keywords with text
for keywords_index, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
    address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

    # Blank cells are read as NaN; skip them instead of failing on .split()
    # or dividing by zero keywords further down.
    raw_keywords = keywords_row[keyword_column]
    keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # Keywords are separated by whitespace
    if not keywords:
        print(f"Skipping {name} - {address}: no keywords")
        continue

    print(f"Matching keywords for {name} - {address}: {keywords}")

    for page_num, paragraph in zip(paragraph_pages, pdf_paragraphs):
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

        # Average score over all keywords, as a whole-number percentage
        match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': match_score_percentage,
            'page_number': page_num  # 0-based page index of the paragraph's first line
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)


In [None]:
import pandas as pd
import PyPDF2

# Function to extract text from a PDF file and record the page index of every retained line
def extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5):
    """Extract text from a PDF, trimming lines from the top and bottom of each page.

    Args:
        pdf_path: Path of the PDF file to read.
        skip_first_lines: Number of lines dropped from the top of every page.
        skip_last_lines: Number of lines dropped from the bottom of every page;
            0 keeps the bottom of the page intact.
        skip_first_pages: Number of leading pages that are skipped entirely.

    Returns:
        A ``(pdf_text, page_numbers)`` tuple. ``pdf_text`` contains the
        retained lines of every processed page, each page terminated by a
        newline. ``page_numbers`` holds one 0-based page index per retained
        line (not per paragraph).
    """
    page_chunks = []
    page_numbers = []

    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.x no longer provides.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(skip_first_pages, len(pdf_reader.pages)):
            page_text = pdf_reader.pages[page_num].extract_text() or ''
            page_lines = page_text.split('\n')

            # Use an explicit end index: a "-skip_last_lines" slice bound
            # would turn into "-0" and drop the whole page when it is 0.
            last_kept = max(len(page_lines) - skip_last_lines, 0)
            page_lines = page_lines[skip_first_lines:last_kept]

            # Every page ends with a newline, even when no line was retained
            page_chunks.append('\n'.join(page_lines) + '\n')

            # One page index per retained line of this page
            page_numbers.extend([page_num] * len(page_lines))

    return ''.join(page_chunks), page_numbers

# Function to split lines into paragraphs
def split_into_paragraphs(lines):
    """Group consecutive non-blank lines into newline-joined paragraphs.

    Lines that are empty or whitespace-only act as separators and are never
    part of a paragraph. Returns the list of paragraph strings in order.
    """
    paragraphs, buffer = [], []
    # A trailing blank sentinel flushes the final paragraph inside the loop
    for line in [*lines, '']:
        if line.strip():
            buffer.append(line)
        elif buffer:
            paragraphs.append('\n'.join(buffer))
            buffer = []
    return paragraphs

# Replace 'path_to_your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = 'path_to_your_pdf_file.pdf'

# Extract text from the PDF file, including page numbers, and skip the first 5 pages and the first and last lines of each page
pdf_text, page_numbers = extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5)

# Split the extracted text into paragraphs
pdf_lines = pdf_text.split('\n')
pdf_paragraphs = split_into_paragraphs(pdf_lines)

# page_numbers has one entry per extracted line, not per paragraph, so it
# cannot be used as a DataFrame column next to pdf_paragraphs (the lengths
# differ). Reduce it to one value per paragraph: the page of the paragraph's
# first line. The index is clamped because pdf_lines can contain a few more
# (blank) entries than page_numbers; near pages without retained lines the
# lookup is therefore approximate.
paragraph_pages = []
previous_line_blank = True
for line_index, line in enumerate(pdf_lines):
    line_blank = not line.strip()
    if previous_line_blank and not line_blank:
        paragraph_pages.append(page_numbers[min(line_index, len(page_numbers) - 1)])
    previous_line_blank = line_blank

# Create a DataFrame to store the extracted PDF data
pdf_data = pd.DataFrame({
    'Original Paragraph': pdf_paragraphs,
    'Page Number': paragraph_pages  # 0-based page index as returned by the extractor
})

# Save the DataFrame to an Excel file
pdf_data.to_excel('extracted_pdf_data.xlsx', index=False)

# Display a message indicating that the data has been saved
print("Extracted PDF data has been saved to 'extracted_pdf_data.xlsx'")

In [None]:
import pandas as pd
import PyPDF2

# Function to split lines into paragraphs
def split_into_paragraphs(lines):
    """Group consecutive non-blank lines into newline-joined paragraphs.

    Lines that are empty or whitespace-only act as separators and are never
    part of a paragraph. Returns the list of paragraph strings in order.
    """
    paragraphs, buffer = [], []
    # A trailing blank sentinel flushes the final paragraph inside the loop
    for line in [*lines, '']:
        if line.strip():
            buffer.append(line)
        elif buffer:
            paragraphs.append('\n'.join(buffer))
            buffer = []
    return paragraphs

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score every keyword by how many of its words occur in ``text``.

    Matching is case-insensitive and substring based. Each keyword is split
    on whitespace; words listed in ``stopwords`` are never counted as
    matches, but they still count towards the keyword's total word count.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the matched fraction (0.0 for a keyword without words) and
        the space-joined matched words.
    """
    haystack = text.lower()
    scores = []
    hits_per_keyword = []

    for raw_keyword in keywords:
        words = raw_keyword.lower().split()
        word_count = len(words)  # taken before stopwords are ignored

        # Non-stopword words that appear somewhere in the text
        hits = [word for word in words if word not in stopwords and word in haystack]

        scores.append(len(hits) / word_count if word_count else 0.0)
        hits_per_keyword.append(" ".join(hits))

    return scores, hits_per_keyword

# Stopwords are ignored when keyword words are matched against the text
stopwords = ["is", "and", "are"]

# Workbook holding the keywords
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'

# Column containing keywords in the keywords Excel file
keyword_column = 'keywords'

# Replace 'path_to_your_pdf_file.pdf' with the actual path to your PDF file
pdf_path = 'path_to_your_pdf_file.pdf'

# The PDF does not change between sheets, so it is extracted once, and by
# path: the extractor opens the file itself.
pdf_text, page_numbers = extract_pdf_text_with_page_numbers(pdf_path, skip_first_lines=1, skip_last_lines=1, skip_first_pages=5)

# Split the extracted text into paragraphs
pdf_lines = pdf_text.split('\n')
pdf_paragraphs = split_into_paragraphs(pdf_lines)

# Page of each paragraph = page of its first line. page_numbers has one
# 0-based page index per extracted line, so it is looked up by line position.
# The index is clamped because pdf_lines can contain a few more (blank)
# entries than page_numbers; near pages without retained lines the lookup is
# therefore approximate.
paragraph_pages = []
previous_line_blank = True
for line_index, line in enumerate(pdf_lines):
    line_blank = not line.strip()
    if previous_line_blank and not line_blank:
        paragraph_pages.append(page_numbers[min(line_index, len(page_numbers) - 1)])
    previous_line_blank = line_blank

# NOTE(review): the sheets were previously listed from the PDF path, which is
# not a workbook. The keywords workbook is the only Excel file used here, so
# each of its sheets is assumed to hold one set of keyword rows - confirm.
with pd.ExcelFile(keywords_excel_path) as keywords_workbook:

    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in keywords_workbook.sheet_names:

        # Read the keyword rows from the current sheet
        sheet_data = keywords_workbook.parse(sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Match the keywords of every row with every PDF paragraph
        for keywords_index, keywords_row in sheet_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Blank cells are read as NaN; skip them instead of failing on
            # .split() or dividing by zero keywords further down.
            raw_keywords = keywords_row[keyword_column]
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # Keywords are separated by whitespace
            if not keywords:
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for page_num, paragraph in zip(paragraph_pages, pdf_paragraphs):
                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, paragraph, stopwords)

                # Average score over all keywords, as a whole-number percentage
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'original_paragraph': paragraph,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                    'page_number': page_num  # 0-based page index of the paragraph's first line
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel file for the current sheet
        output_file_path = f'matched_output_{sheet_name}.xlsx'
        matched_df.to_excel(output_file_path, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_file_path}'")

In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score every keyword by how many of its words occur in ``text``.

    Matching is case-insensitive and substring based. Each keyword is split
    on whitespace; words listed in ``stopwords`` are never counted as
    matches, but they still count towards the keyword's total word count.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) produces empty results.
        stopwords: Collection of lowercase words to ignore.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the matched fraction (0.0 for a keyword without words) and
        the space-joined matched words. Both lists are empty when ``text`` is
        not a string.
    """
    # Cell values read from Excel are not guaranteed to be strings
    if not isinstance(text, str):
        return [], []

    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        keyword_parts = keyword.lower().split()
        total_parts = len(keyword_parts)  # taken before stopwords are ignored

        # Non-stopword parts that appear somewhere in the text
        matched_keywords = [
            part for part in keyword_parts
            if part not in stopwords and part in text_lower
        ]

        match_scores.append(len(matched_keywords) / total_parts if total_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_keywords))

    return match_scores, matched_keywords_list

# Stopwords are ignored when keyword words are matched against the text
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the workbook once and close it when done, instead of re-opening it for every sheet
with pd.ExcelFile(excel_file_path) as workbook:

    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:

        # Read the data from the current sheet
        sheet_data = workbook.parse(sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Blank cells are read as NaN; skip them instead of failing on
            # .split() or dividing by zero keywords further down.
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # Keywords are separated by whitespace
            if not keywords:
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                # Sheets without the text column are matched against empty text
                text = row[text_column] if text_column in row else ''

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Average score over all keywords, as a whole-number percentage
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel file for the current sheet
        output_file_path = f'matched_output_{sheet_name}.xlsx'
        matched_df.to_excel(output_file_path, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_file_path}'")

In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score every keyword by how many of its words occur in ``text``.

    Matching is case-insensitive and substring based. Each keyword is split
    on whitespace; words listed in ``stopwords`` are never counted as
    matches, but they still count towards the keyword's total word count.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) produces empty results.
        stopwords: Collection of lowercase words to ignore.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the matched fraction (0.0 for a keyword without words) and
        the space-joined matched words. Both lists are empty when ``text`` is
        not a string.
    """
    # Cell values read from Excel are not guaranteed to be strings
    if not isinstance(text, str):
        return [], []

    text_lower = text.lower()
    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        keyword_parts = keyword.lower().split()
        total_parts = len(keyword_parts)  # taken before stopwords are ignored

        # Non-stopword parts that appear somewhere in the text
        matched_keywords = [
            part for part in keyword_parts
            if part not in stopwords and part in text_lower
        ]

        match_scores.append(len(matched_keywords) / total_parts if total_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_keywords))

    return match_scores, matched_keywords_list

# Stopwords are ignored when keyword words are matched against the text
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the workbook once and close it when done, instead of re-opening it for every sheet
with pd.ExcelFile(excel_file_path) as workbook:

    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:

        # Read the data from the current sheet
        sheet_data = workbook.parse(sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Blank cells are read as NaN; skip them instead of failing on
            # .split() or dividing by zero keywords further down.
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # Keywords are separated by whitespace
            if not keywords:
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                # Sheets without the text column are matched against empty text
                text = row[text_column] if text_column in row else ''

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Average score over all keywords, as a whole-number percentage
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel file for the current sheet
        output_file_path = f'matched_output_{sheet_name}.xlsx'
        matched_df.to_excel(output_file_path, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_file_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score every keyword by how many of its words occur in ``text``.

    Matching is case-insensitive and substring based. Each keyword is split
    on whitespace; words listed in ``stopwords`` are never counted as
    matches, but they still count towards the keyword's total word count.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the matched fraction (0.0 for a keyword without words) and
        the space-joined matched words. Both lists are empty when ``text`` is
        not a string.
    """
    if isinstance(text, str):
        haystack = text.lower()
    else:
        # Nothing can be matched against a non-string value
        return [], []

    scores, hits_per_keyword = [], []
    for raw_keyword in keywords:
        words = raw_keyword.lower().split()
        word_count = len(words)  # taken before stopwords are ignored

        # Non-stopword words that appear somewhere in the text
        hits = [word for word in words if word not in stopwords and word in haystack]

        scores.append(len(hits) / word_count if word_count else 0.0)
        hits_per_keyword.append(" ".join(hits))

    return scores, hits_per_keyword

# Stopwords are ignored when keyword words are matched against the text
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# The matched data of every input sheet is written to a sheet of the same name
output_excel_path = 'matched_output.xlsx'

# The input workbook is opened first (and only once), so a missing input
# fails before the output writer is created.
with pd.ExcelFile(excel_file_path) as workbook, pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet
        sheet_data = workbook.parse(sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Blank cells are read as NaN; skip them instead of failing on
            # .split() or dividing by zero keywords further down.
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # Keywords are separated by whitespace
            if not keywords:
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Average score over all keywords, as a whole-number percentage
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'original_text': text,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")

In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score every keyword by how many of its words occur in ``text``.

    Matching is case-insensitive and substring based. Each keyword is split
    on whitespace; words listed in ``stopwords`` are never counted as
    matches, but they still count towards the keyword's total word count.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword: the matched fraction (0.0 for a keyword without words) and
        the space-joined matched words. Both lists are empty when ``text`` is
        not a string.
    """
    if isinstance(text, str):
        haystack = text.lower()
    else:
        # Nothing can be matched against a non-string value
        return [], []

    scores, hits_per_keyword = [], []
    for raw_keyword in keywords:
        words = raw_keyword.lower().split()
        word_count = len(words)  # taken before stopwords are ignored

        # Non-stopword words that appear somewhere in the text
        hits = [word for word in words if word not in stopwords and word in haystack]

        scores.append(len(hits) / word_count if word_count else 0.0)
        hits_per_keyword.append(" ".join(hits))

    return scores, hits_per_keyword

# Stopwords are ignored when keyword words are matched against the text
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# The matched data of every input sheet is written to a sheet of the same name
output_excel_path = 'matched_output.xlsx'

# The input workbook is opened first (and only once), so a missing input
# fails before the output writer is created.
with pd.ExcelFile(excel_file_path) as workbook, pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet
        sheet_data = workbook.parse(sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file

            # Blank cells are read as NaN; skip them instead of failing on
            # .split() or dividing by zero keywords further down.
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []  # Keywords are separated by whitespace
            if not keywords:
                print(f"Skipping {name} - {address} in sheet '{sheet_name}': no keywords")
                continue

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Average score over all keywords, as a whole-number percentage
                match_score_percentage = int(sum(match_scores) / len(keywords) * 100)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'original_text': text,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")

In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score each keyword by how many of its words occur in ``text``.

    Matching is case-insensitive and substring-based: a keyword word counts
    as matched when it appears anywhere inside the lowercased text.

    Args:
        keywords: Iterable of keyword strings; each may contain several
            whitespace-separated words.
        text: Text to search. Any value that is not a ``str`` yields
            ``([], [])``.
        stopwords: Words (compared against the lowercased keyword words)
            that never count as matches.

    Returns:
        A ``(match_scores, matched_keywords_list)`` pair with one entry per
        keyword: the fraction of the keyword's words found in the text
        (stopwords still count in the denominator), and the matched words
        joined by single spaces.
    """
    if not isinstance(text, str):
        return [], []

    haystack = text.lower()
    scores, found = [], []
    for keyword in keywords:
        words = keyword.lower().split()
        # A word is a hit when it is not a stopword and occurs in the text
        hits = [word for word in words if word not in stopwords and word in haystack]
        scores.append(len(hits) / len(words) if words else 0.0)
        found.append(" ".join(hits))
    return scores, found

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the input workbook once (it is closed when the block exits) and create an
# Excel writer to save the matched data for each input sheet in a separate sheet
output_excel_path = 'matched_output.xlsx'
with pd.ExcelFile(excel_file_path) as workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet, reusing the already opened workbook
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
            # Keywords are assumed to be separated by spaces; a non-string cell
            # (e.g. NaN for an empty cell) has no keywords instead of crashing on .split()
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Calculate the match score percentage based on the length of the original text.
                # Non-string cells (NaN is truthy but has no .split()) and whitespace-only
                # text contain no words, so they score 0 instead of raising.
                word_count = len(text.split()) if isinstance(text, str) else 0
                match_score_percentage = int(sum(match_scores) / word_count * 100) if word_count else 0

                # Append the matched data to the list
                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,  # Convert to int
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Return per-keyword match scores and the keyword words found in ``text``.

    The comparison lowercases both sides and uses substring containment, so
    a keyword word matches when it occurs anywhere in the text.

    Args:
        keywords: Iterable of keyword strings (possibly multi-word).
        text: Text to search; a non-``str`` value produces ``([], [])``.
        stopwords: Words that are dropped from each keyword before matching.

    Returns:
        Two lists aligned with ``keywords``: the score (matched words divided
        by the keyword's total word count, stopwords included in that count)
        and the matched words joined with spaces.
    """
    if not isinstance(text, str):
        return [], []

    lowered_text = text.lower()

    def _evaluate(keyword):
        # Score a single keyword against the lowercased text
        parts = keyword.lower().split()
        meaningful = [part for part in parts if part not in stopwords]
        present = [part for part in meaningful if part in lowered_text]
        ratio = len(present) / len(parts) if parts else 0.0
        return ratio, " ".join(present)

    evaluated = [_evaluate(keyword) for keyword in keywords]
    match_scores = [ratio for ratio, _ in evaluated]
    matched_keywords_list = [joined for _, joined in evaluated]
    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the input workbook once (it is closed when the block exits) and create an
# Excel writer to save the matched data for each input sheet in a separate sheet
output_excel_path = 'matched_output.xlsx'
with pd.ExcelFile(excel_file_path) as workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet, reusing the already opened workbook
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
            # Keywords are assumed to be separated by spaces; a non-string cell
            # (e.g. NaN for an empty cell) has no keywords instead of crashing on .split()
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Calculate the match score percentage based on the number of unique words in
                # the original text. A non-string cell (NaN is truthy but has no .lower())
                # has no words, so it scores 0 instead of raising.
                unique_words_length = len(set(text.lower().split())) if isinstance(text, str) else 0
                match_score_percentage = int(sum(match_scores) / unique_words_length * 100) if unique_words_length > 0 else 0

                # Append the matched data to the list
                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,  # Convert to int
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Compute, for every keyword, its match score and matched words.

    Keywords and text are lowercased; a keyword word is considered present
    when it is a substring of the lowercased text.

    Args:
        keywords: Iterable of keyword strings, each split on whitespace.
        text: Text to search. Non-``str`` input gives ``([], [])``.
        stopwords: Words excluded from matching.

    Returns:
        ``(match_scores, matched_keywords_list)``: for each keyword, the
        number of matched non-stopword words divided by the keyword's full
        word count (0.0 for a keyword with no words), and those matched
        words joined by spaces.
    """
    if not isinstance(text, str):
        return [], []

    text_lc = text.lower()

    # Tokenise every keyword once, then derive both result lists from the tokens
    tokenised = [keyword.lower().split() for keyword in keywords]
    hits_per_keyword = [
        [token for token in tokens if token not in stopwords and token in text_lc]
        for tokens in tokenised
    ]

    match_scores = [
        len(hits) / len(tokens) if tokens else 0.0
        for hits, tokens in zip(hits_per_keyword, tokenised)
    ]
    matched_keywords_list = [" ".join(hits) for hits in hits_per_keyword]
    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the input workbook once (it is closed when the block exits) and create an
# Excel writer to save the matched data for each input sheet in a separate sheet
output_excel_path = 'matched_output.xlsx'
with pd.ExcelFile(excel_file_path) as workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet, reusing the already opened workbook
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
            # Keywords are assumed to be separated by spaces; a non-string cell
            # (e.g. NaN for an empty cell) has no keywords instead of crashing on .split()
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Count the distinct keywords that actually matched. Unmatched keywords are
                # reported as empty strings and must not be counted as a match.
                unique_words_length = len({matched for matched in matched_keywords_list if matched})

                # Distinct words of the original text. Non-string cells (NaN is truthy but has
                # no .lower()) and whitespace-only text have none, so they score 0.
                text_vocabulary = set(text.lower().split()) if isinstance(text, str) else set()
                match_score_percentage = int(unique_words_length / len(text_vocabulary) * 100) if text_vocabulary else 0

                # Append the matched data to the list
                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,  # Convert to int
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")

In [None]:
# Removing stopwords from the original text

In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score keywords against ``text`` after stripping stopwords from both.

    The text is lowercased, split on whitespace, stripped of stopwords and
    re-joined with single spaces. A keyword word matches when it is a
    substring of that cleaned text.

    Args:
        keywords: Iterable of keyword strings, each split on whitespace.
        text: Text to search. Non-``str`` input gives ``([], [])``.
        stopwords: Words removed from the text and ignored in keywords.

    Returns:
        ``(match_scores, matched_keywords_list)`` with one entry per keyword:
        matched words divided by the keyword's full word count (stopwords
        included in the count), and the matched words joined by spaces.
    """
    if not isinstance(text, str):
        return [], []

    # Stopword-free, lowercased version of the text used for all lookups
    searchable = " ".join(
        token for token in text.lower().split() if token not in stopwords
    )

    scores, found = [], []
    for keyword in keywords:
        words = keyword.lower().split()
        hits = [word for word in words if word not in stopwords and word in searchable]
        scores.append(len(hits) / len(words) if words else 0.0)
        found.append(" ".join(hits))
    return scores, found

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the input workbook once (it is closed when the block exits) and create an
# Excel writer to save the matched data for each input sheet in a separate sheet
output_excel_path = 'matched_output.xlsx'
with pd.ExcelFile(excel_file_path) as workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet, reusing the already opened workbook
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
            # Keywords are assumed to be separated by spaces; a non-string cell
            # (e.g. NaN for an empty cell) has no keywords instead of crashing on .split()
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match scores and matched keywords (excluding stopwords)
                match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Only the scores and matched keywords are unpacked from the helper, so the
                # stopword-free text is rebuilt here; without this the name is undefined at
                # script level. Non-string cells (e.g. NaN) become an empty string.
                if isinstance(text, str):
                    text_without_stopwords = " ".join(
                        part for part in text.lower().split() if part not in stopwords
                    )
                else:
                    text_without_stopwords = ''

                # Calculate the match score percentage based on the length of the stopword-free text
                match_score_percentage = int(sum(match_scores) / len(text_without_stopwords.split()) * 100) if text_without_stopwords else 0

                # Append the matched data to the list
                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text_without_stopwords,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,  # Convert to int
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score keywords against ``text`` and return the stopword-free text.

    The text is lowercased, split on whitespace, stripped of stopwords and
    re-joined with single spaces. A keyword word matches when it is a
    substring of that cleaned text.

    Args:
        keywords: Iterable of keyword strings, each split on whitespace.
        text: Text to search. Non-``str`` input gives ``([], [], "")``.
        stopwords: Words removed from the text and ignored in keywords.

    Returns:
        ``(match_scores, matched_keywords_list, text_without_stopwords)``.
        The first two have one entry per keyword: matched words divided by
        the keyword's full word count (stopwords included in the count), and
        the matched words joined by spaces. The third is the cleaned text.
    """
    if not isinstance(text, str):
        # Same three-value shape as the normal path, so callers can always
        # unpack the result the same way
        return [], [], ""

    # Remove stopwords from the lowercased original text
    text_without_stopwords = " ".join(
        part for part in text.lower().split() if part not in stopwords
    )

    match_scores = []
    matched_keywords_list = []
    for keyword in keywords:
        keyword_parts = keyword.lower().split()

        # Keyword words that are not stopwords and occur in the cleaned text
        matched_keywords = [
            part for part in keyword_parts
            if part not in stopwords and part in text_without_stopwords
        ]

        # The denominator is the full word count, stopwords included
        match_scores.append(len(matched_keywords) / len(keyword_parts) if keyword_parts else 0.0)
        matched_keywords_list.append(" ".join(matched_keywords))

    return match_scores, matched_keywords_list, text_without_stopwords

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the input workbook once (it is closed when the block exits) and create an
# Excel writer to save the matched data for each input sheet in a separate sheet
output_excel_path = 'matched_output.xlsx'
with pd.ExcelFile(excel_file_path) as workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet, reusing the already opened workbook
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
            # Keywords are assumed to be separated by spaces; a non-string cell
            # (e.g. NaN for an empty cell) has no keywords instead of crashing on .split()
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                if isinstance(text, str):
                    # Calculate match scores, matched keywords, and remove stopwords from the original text
                    match_scores, matched_keywords_list, text_without_stopwords = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)
                else:
                    # Non-text cells (e.g. NaN for an empty cell) cannot be matched;
                    # record an empty result without calling the helper
                    match_scores, matched_keywords_list, text_without_stopwords = [], [], ''

                # Calculate the match score percentage based on the length of the stopword-free text
                match_score_percentage = int(sum(match_scores) / len(text_without_stopwords.split()) * 100) if text_without_stopwords else 0

                # Append the matched data to the list
                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text_without_stopwords,
                    'matched_keywords': ','.join(matched_keywords_list),
                    'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
                    'match_score %': match_score_percentage,  # Convert to int
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Return the match score of ``keywords`` against ``text`` and the matched keywords.

    Text and keywords are lowercased and stopwords are removed from both.
    A keyword matches when it is a substring of the stopword-free text.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Non-``str`` input gives ``(0, [])``.
        stopwords: Words removed from the text and skipped as keywords.

    Returns:
        ``(match_score_percentage, unique_matched_keywords)``. The score is
        the number of distinct matched keywords divided by the number of
        distinct words in the stopword-free text, or 0 when that text is
        empty; it is a plain ratio and is not multiplied by 100. The matched
        keywords are de-duplicated and listed in the order they first appear
        in ``keywords``, so the result is the same on every run.
    """
    if not isinstance(text, str):
        return 0, []  # Return 0 if the text is not a string

    # Remove stopwords from the lowercased original text
    text_without_stopwords = " ".join(
        part for part in text.lower().split() if part not in stopwords
    )

    # Lowercase the keywords and drop stopwords before matching
    candidate_keywords = [keyword.lower() for keyword in keywords]

    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the order of the returned list vary between runs
    unique_matched_keywords = list(dict.fromkeys(
        keyword for keyword in candidate_keywords
        if keyword not in stopwords and keyword in text_without_stopwords
    ))

    if not text_without_stopwords:
        return 0, unique_matched_keywords

    distinct_text_words = set(text_without_stopwords.split())
    match_score_percentage = len(unique_matched_keywords) / len(distinct_text_words)
    return match_score_percentage, unique_matched_keywords

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the input workbook once (it is closed when the block exits) and create an
# Excel writer to save the matched data for each input sheet in a separate sheet
output_excel_path = 'matched_output.xlsx'
with pd.ExcelFile(excel_file_path) as workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet, reusing the already opened workbook
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
            # Keywords are assumed to be separated by spaces; a non-string cell
            # (e.g. NaN for an empty cell) has no keywords instead of crashing on .split()
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match score percentage and unique matched keywords
                match_score_percentage, unique_matched_keywords = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Append the matched data to the list
                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text,
                    'matched_keywords': ','.join(unique_matched_keywords),
                    'match_score %': match_score_percentage,  # This is the match score percentage
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Compute the keyword match score for ``text`` and list the matched keywords.

    Both the text and the keywords are lowercased and have stopwords
    removed. A keyword counts as matched when it is a substring of the
    stopword-free text.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Non-``str`` input gives ``(0, [])``.
        stopwords: Words removed from the text and skipped as keywords.

    Returns:
        ``(match_score_percentage, unique_matched_keywords)``. The score is
        the count of distinct matched keywords divided by the count of
        distinct words in the stopword-free text (0 when that text is
        empty); it is a ratio, not scaled by 100. Matched keywords are
        unique and keep the order of their first occurrence in ``keywords``,
        which makes the output reproducible across runs.
    """
    if not isinstance(text, str):
        return 0, []  # Return 0 if the text is not a string

    # Remove stopwords from the lowercased original text
    text_parts = [part for part in text.lower().split() if part not in stopwords]
    text_without_stopwords = " ".join(text_parts)

    # Collect matches in keyword order; the dict keys act as an ordered set,
    # whereas a plain set would return them in a run-dependent order
    ordered_matches = {}
    for keyword in [keyword.lower() for keyword in keywords]:
        if keyword not in stopwords and keyword in text_without_stopwords:
            ordered_matches[keyword] = None
    unique_matched_keywords = list(ordered_matches)

    # Score relative to the number of distinct words left in the text
    if text_without_stopwords:
        match_score_percentage = len(unique_matched_keywords) / len(set(text_parts))
    else:
        match_score_percentage = 0

    return match_score_percentage, unique_matched_keywords

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the input workbook once (it is closed when the block exits) and create an
# Excel writer to save the matched data for each input sheet in a separate sheet
output_excel_path = 'matched_output.xlsx'
with pd.ExcelFile(excel_file_path) as workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    # Iterate through sheets in the Excel file with multiple sheets
    for sheet_name in workbook.sheet_names:
        # Read the data from the current sheet, reusing the already opened workbook
        sheet_data = pd.read_excel(workbook, sheet_name=sheet_name)

        # Initialize an empty list to store the matched data
        matched_data = []

        # Iterate through both datasets and match keywords with text
        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']  # Assuming 'name' is a column in the keywords Excel file
            address = keywords_row['address']  # Assuming 'address' is a column in the keywords Excel file
            # Keywords are assumed to be separated by spaces; a non-string cell
            # (e.g. NaN for an empty cell) has no keywords instead of crashing on .split()
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match score percentage and unique matched keywords
                match_score_percentage, unique_matched_keywords = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Append the matched data to the list
                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': keywords_row['keywords'],
                    'original_text': text,
                    'matched_keywords': ', '.join(unique_matched_keywords),
                    'match_score %': match_score_percentage,  # This is the match score percentage
                })

        # Create a DataFrame from the matched data
        matched_df = pd.DataFrame(matched_data)

        # Save the DataFrame to an Excel sheet for the current sheet
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Find which keywords occur as whole words in ``text`` and score the match.

    Matching is case-insensitive. Stopwords are dropped from both the text
    (token by token, after splitting on whitespace) and the keyword list.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Anything that is not a ``str`` (for example a
            NaN coming from an empty spreadsheet cell) yields no match.
        stopwords: Collection of lower-case words to ignore.

    Returns:
        A ``(score, matched)`` tuple. ``matched`` is the list of unique
        matched keywords (lower-cased) in the order they appear in
        ``keywords``. ``score`` is the number of unique matched keywords
        divided by the number of unique non-stopword tokens in the text; it
        is a fraction, not a value scaled to 100. ``(0, [])`` is returned
        when the text is not a string or contains only stopwords.
    """
    import re  # local import: the surrounding notebook cell only imports pandas

    if not isinstance(text, str):
        return 0, []

    stopword_set = set(stopwords)
    text_parts = [part for part in text.lower().split() if part not in stopword_set]
    if not text_parts:
        return 0, []
    text_without_stopwords = " ".join(text_parts)

    def occurs_as_whole_word(keyword):
        # A plain ``keyword in text`` test is a substring search, so "cat"
        # would match "concatenate". Requiring a non-word character (or the
        # string edge) on both sides keeps "risk" matching "risk," while
        # rejecting hits inside longer words.
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
        return re.search(pattern, text_without_stopwords) is not None

    lowered_keywords = (keyword.lower() for keyword in keywords)
    matched = [
        keyword
        for keyword in lowered_keywords
        if keyword not in stopword_set and occurs_as_whole_word(keyword)
    ]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable across runs (a set would not be).
    unique_matched_keywords = list(dict.fromkeys(matched))

    match_score_percentage = len(unique_matched_keywords) / len(set(text_parts))
    return match_score_percentage, unique_matched_keywords

# Words ignored on both the keyword side and the text side of the match
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file).
# The loop below reads its 'name', 'address' and 'keywords' columns.
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Every input sheet gets a sheet of the same name in this output workbook
output_excel_path = 'matched_output.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the source workbook once and close it deterministically. It is opened
# before the writer so a missing/unreadable input does not leave behind a
# freshly created output file.
with pd.ExcelFile(excel_file_path) as source_workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for sheet_name in source_workbook.sheet_names:
        # Parse the current sheet from the already-open workbook
        sheet_data = pd.read_excel(source_workbook, sheet_name=sheet_name)

        # One output record per (keywords row, sheet row) pair
        matched_data = []

        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']
            address = keywords_row['address']

            # A blank cell is read as NaN (a float); treat it as "no keywords"
            # instead of crashing on .split().
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match score and unique matched keywords
                match_score_percentage, unique_matched_keywords = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'original_text': text,
                    'matched_keywords': ', '.join(unique_matched_keywords),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data and store it under the
        # same sheet name as the input sheet
        matched_df = pd.DataFrame(matched_data)
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Find which keywords occur as whole words in ``text`` and score the match.

    Matching is case-insensitive. Stopwords are dropped from both the text
    (token by token, after splitting on whitespace) and the keyword list.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Anything that is not a ``str`` (for example a
            NaN coming from an empty spreadsheet cell) yields no match.
        stopwords: Collection of lower-case words to ignore.

    Returns:
        A ``(score, matched)`` tuple. ``matched`` lists every matching
        keyword (lower-cased) in input order; a keyword supplied twice
        appears twice. ``score`` is the number of *unique* matched keywords
        divided by the number of unique non-stopword tokens in the text; it
        is a fraction, not a value scaled to 100. ``(0, [])`` is returned
        when the text is not a string or contains only stopwords.
    """
    import re  # local import: the surrounding notebook cell only imports pandas

    if not isinstance(text, str):
        return 0, []

    stopword_set = set(stopwords)
    text_parts = [part for part in text.lower().split() if part not in stopword_set]
    if not text_parts:
        return 0, []
    text_without_stopwords = " ".join(text_parts)

    def occurs_as_whole_word(keyword):
        # A plain ``keyword in text`` test is a substring search, so "cat"
        # would match "concatenate". Requiring a non-word character (or the
        # string edge) on both sides keeps "risk" matching "risk," while
        # rejecting hits inside longer words.
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
        return re.search(pattern, text_without_stopwords) is not None

    lowered_keywords = (keyword.lower() for keyword in keywords)
    matched_keywords = [
        keyword
        for keyword in lowered_keywords
        if keyword not in stopword_set and occurs_as_whole_word(keyword)
    ]

    match_score_percentage = len(set(matched_keywords)) / len(set(text_parts))
    return match_score_percentage, matched_keywords

# Words ignored on both the keyword side and the text side of the match
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file).
# The loop below reads its 'name', 'address' and 'keywords' columns.
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Every input sheet gets a sheet of the same name in this output workbook
output_excel_path = 'matched_output.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the source workbook once and close it deterministically. It is opened
# before the writer so a missing/unreadable input does not leave behind a
# freshly created output file.
with pd.ExcelFile(excel_file_path) as source_workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for sheet_name in source_workbook.sheet_names:
        # Parse the current sheet from the already-open workbook
        sheet_data = pd.read_excel(source_workbook, sheet_name=sheet_name)

        # One output record per (keywords row, sheet row) pair
        matched_data = []

        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']
            address = keywords_row['address']

            # A blank cell is read as NaN (a float); treat it as "no keywords"
            # instead of crashing on .split().
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # Calculate match score and matched keywords
                match_score_percentage, matched_keywords = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'original_text': text,
                    'matched_keywords': ', '.join(matched_keywords),
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data and store it under the
        # same sheet name as the input sheet
        matched_df = pd.DataFrame(matched_data)
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Find which keywords occur as whole words in ``text`` and score the match.

    Matching is case-insensitive. Stopwords are dropped from both the text
    (token by token, after splitting on whitespace) and the keyword list.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Anything that is not a ``str`` (for example a
            NaN coming from an empty spreadsheet cell) yields no match.
        stopwords: Collection of lower-case words to ignore.

    Returns:
        A ``(score, matched)`` tuple. ``matched`` is a single string with
        the matching keywords (lower-cased, input order, duplicates kept)
        joined by ``', '``. ``score`` is the number of unique matched
        keywords divided by the number of unique non-stopword tokens in the
        text; it is a fraction, not a value scaled to 100. ``(0, '')`` is
        returned when the text is not a string or contains only stopwords.
    """
    import re  # local import: this notebook cell has no imports of its own

    if not isinstance(text, str):
        return 0, ''

    stopword_set = set(stopwords)
    text_parts = [part for part in text.lower().split() if part not in stopword_set]
    if not text_parts:
        return 0, ''
    text_without_stopwords = " ".join(text_parts)

    def occurs_as_whole_word(keyword):
        # A plain ``keyword in text`` test is a substring search, so "cat"
        # would match "concatenate". Requiring a non-word character (or the
        # string edge) on both sides keeps "risk" matching "risk," while
        # rejecting hits inside longer words.
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
        return re.search(pattern, text_without_stopwords) is not None

    lowered_keywords = (keyword.lower() for keyword in keywords)
    matched_keywords = [
        keyword
        for keyword in lowered_keywords
        if keyword not in stopword_set and occurs_as_whole_word(keyword)
    ]

    match_score_percentage = len(set(matched_keywords)) / len(set(text_parts))
    return match_score_percentage, ', '.join(matched_keywords)


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Find which keywords occur as whole words in ``text`` and score the match.

    Matching is case-insensitive. Stopwords are dropped from both the text
    (token by token, after splitting on whitespace) and the keyword list.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Anything that is not a ``str`` (for example a
            NaN coming from an empty spreadsheet cell) yields no match.
        stopwords: Collection of lower-case words to ignore.

    Returns:
        A ``(score, matched)`` tuple. ``matched`` is a single string with
        the matching keywords (lower-cased, input order, duplicates kept)
        joined by ``', '``. ``score`` is the number of unique matched
        keywords divided by the number of unique non-stopword tokens in the
        text; it is a fraction, not a value scaled to 100. ``(0, '')`` is
        returned when the text is not a string or contains only stopwords.
    """
    import re  # local import: the surrounding notebook cell only imports pandas

    if not isinstance(text, str):
        return 0, ''

    stopword_set = set(stopwords)
    text_parts = [part for part in text.lower().split() if part not in stopword_set]
    if not text_parts:
        return 0, ''
    text_without_stopwords = " ".join(text_parts)

    def occurs_as_whole_word(keyword):
        # A plain ``keyword in text`` test is a substring search, so "cat"
        # would match "concatenate". Requiring a non-word character (or the
        # string edge) on both sides keeps "risk" matching "risk," while
        # rejecting hits inside longer words.
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
        return re.search(pattern, text_without_stopwords) is not None

    lowered_keywords = (keyword.lower() for keyword in keywords)
    matched_keywords = [
        keyword
        for keyword in lowered_keywords
        if keyword not in stopword_set and occurs_as_whole_word(keyword)
    ]

    match_score_percentage = len(set(matched_keywords)) / len(set(text_parts))
    return match_score_percentage, ', '.join(matched_keywords)

# Words ignored on both the keyword side and the text side of the match
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file).
# The loop below reads its 'name', 'address' and 'keywords' columns.
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Every input sheet gets a sheet of the same name in this output workbook
output_excel_path = 'matched_output.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the source workbook once and close it deterministically. It is opened
# before the writer so a missing/unreadable input does not leave behind a
# freshly created output file.
with pd.ExcelFile(excel_file_path) as source_workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for sheet_name in source_workbook.sheet_names:
        # Parse the current sheet from the already-open workbook
        sheet_data = pd.read_excel(source_workbook, sheet_name=sheet_name)

        # One output record per (keywords row, sheet row) pair
        matched_data = []

        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']
            address = keywords_row['address']

            # A blank cell is read as NaN (a float); treat it as "no keywords"
            # instead of crashing on .split().
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # The helper already returns the matched keywords as one
                # comma-separated string, so it is stored as-is.
                match_score_percentage, matched_keywords = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'original_text': text,
                    'matched_keywords': matched_keywords,
                    'match_score %': match_score_percentage,
                })

        # Create a DataFrame from the matched data and store it under the
        # same sheet name as the input sheet
        matched_df = pd.DataFrame(matched_data)
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Find which keywords occur as whole words in ``text`` and score the match.

    Matching is case-insensitive. Stopwords are dropped from both the text
    (token by token, after splitting on whitespace) and the keyword list.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Anything that is not a ``str`` (for example a
            NaN coming from an empty spreadsheet cell) yields no match.
        stopwords: Collection of lower-case words to ignore.

    Returns:
        A ``(score, matched)`` tuple. ``matched`` is a single string with
        the matching keywords (lower-cased, input order, duplicates kept)
        joined by ``', '``. ``score`` is the number of unique matched
        keywords divided by the number of unique non-stopword tokens in the
        text; it is a fraction, not a value scaled to 100. ``(0, '')`` is
        returned when the text is not a string or contains only stopwords.
    """
    import re  # local import: the surrounding notebook cell only imports pandas

    if not isinstance(text, str):
        return 0, ''

    stopword_set = set(stopwords)
    text_parts = [part for part in text.lower().split() if part not in stopword_set]
    if not text_parts:
        return 0, ''
    text_without_stopwords = " ".join(text_parts)

    def occurs_as_whole_word(keyword):
        # A plain ``keyword in text`` test is a substring search, so "cat"
        # would match "concatenate". Requiring a non-word character (or the
        # string edge) on both sides keeps "risk" matching "risk," while
        # rejecting hits inside longer words.
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
        return re.search(pattern, text_without_stopwords) is not None

    lowered_keywords = (keyword.lower() for keyword in keywords)
    matched_keywords = [
        keyword
        for keyword in lowered_keywords
        if keyword not in stopword_set and occurs_as_whole_word(keyword)
    ]

    match_score_percentage = len(set(matched_keywords)) / len(set(text_parts))
    return match_score_percentage, ', '.join(matched_keywords)

# Words ignored on both the keyword side and the text side of the match
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file).
# The loop below reads its 'name', 'address' and 'keywords' columns.
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Every input sheet gets a sheet of the same name in this output workbook
output_excel_path = 'matched_output.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the source workbook once and close it deterministically. It is opened
# before the writer so a missing/unreadable input does not leave behind a
# freshly created output file.
with pd.ExcelFile(excel_file_path) as source_workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for sheet_name in source_workbook.sheet_names:
        # Parse the current sheet from the already-open workbook
        sheet_data = pd.read_excel(source_workbook, sheet_name=sheet_name)

        # One output record per (keywords row, sheet row) pair
        matched_data = []

        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']
            address = keywords_row['address']

            # A blank cell is read as NaN (a float); treat it as "no keywords"
            # instead of crashing on .split().
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # The helper already returns the matched keywords as one
                # comma-separated string, so it is stored as-is.
                match_score_percentage, matched_keywords = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Carry the sheet row's "horizontal" value through to the output
                # (replace 'horizontal' with the actual column name in your Excel file)
                horizontal_value = row['horizontal']

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'original_text': text,
                    'matched_keywords': matched_keywords,
                    'match_score %': match_score_percentage,
                    'horizontal': horizontal_value,
                })

        # Create a DataFrame from the matched data and store it under the
        # same sheet name as the input sheet
        matched_df = pd.DataFrame(matched_data)
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")

In [None]:
from PIL import Image
import pytesseract
from openpyxl import Workbook
import os

# Directory whose entries are listed below to find screenshot images
# (placeholder path; point it at the real folder before running)
screenshot_folder = 'path/to/screenshots'

def extract_data_from_screenshot(image_path):
    """OCR a screenshot and pull the labelled fields out of the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict with the keys 'Name', 'Username', 'Siterole', 'Domain' and
        'Organization'. Each value is whatever ``extract_field`` returns for
        the label ``"<key>:"`` in the OCR text.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR is done rather than whenever the object is garbage collected.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image)

    # Each output key doubles as the on-screen label (followed by a colon)
    field_labels = ('Name', 'Username', 'Siterole', 'Domain', 'Organization')
    return {label: extract_field(text, f"{label}:") for label in field_labels}

def extract_field(text, field_name):
    """Return the text that follows ``field_name`` on the same line.

    Only the first occurrence of ``field_name`` is considered. The value runs
    from the end of the label to the next newline, or to the end of ``text``
    when the label is on the last line and no newline follows.

    Args:
        text: Text to search.
        field_name: Label to look for, e.g. ``"Name:"``.

    Returns:
        The value with surrounding whitespace stripped (possibly an empty
        string), or ``None`` if ``field_name`` does not occur in ``text``.
    """
    label_index = text.find(field_name)
    if label_index == -1:
        return None

    value_start = label_index + len(field_name)
    # Splitting on the first newline also covers a final line with no trailing
    # newline; previously such a value was silently dropped.
    value_line = text[value_start:].split('\n', 1)[0]
    return value_line.strip()

# os.listdir returns entries in arbitrary order; sort them so the spreadsheet
# rows come out in the same order on every run.
screenshot_files = sorted(os.listdir(screenshot_folder))

# Compared against the lower-cased file name so '.PNG' / '.JPG' are picked up too
image_extensions = ('.png', '.jpg', '.jpeg')

extracted_data = []
for file in screenshot_files:
    if file.lower().endswith(image_extensions):
        image_path = os.path.join(screenshot_folder, file)
        data = extract_data_from_screenshot(image_path)
        data['File'] = file  # Add the file name to the extracted data
        extracted_data.append(data)

# Create Excel workbook
workbook = Workbook()
sheet = workbook.active

# Column order of the output; each header is also the key looked up in the
# per-file dicts, so headers and data cannot drift apart.
headers = ['File', 'Name', 'Username', 'Siterole', 'Domain', 'Organization']

# Write headers (row 1)
for col_idx, header in enumerate(headers, start=1):
    sheet.cell(row=1, column=col_idx, value=header)

# Write data (one row per screenshot, starting at row 2)
for row_idx, data in enumerate(extracted_data, start=2):
    for col_idx, header in enumerate(headers, start=1):
        sheet.cell(row=row_idx, column=col_idx, value=data[header])

# Save the workbook
workbook.save('extracted_data.xlsx')


In [None]:
import pandas as pd

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Find which keywords occur as whole words in ``text`` and score the match.

    Matching is case-insensitive. Stopwords are dropped from both the text
    (token by token, after splitting on whitespace) and the keyword list.

    Args:
        keywords: Iterable of keyword strings.
        text: Text to search. Anything that is not a ``str`` (for example a
            NaN coming from an empty spreadsheet cell) yields no match.
        stopwords: Collection of lower-case words to ignore.

    Returns:
        A ``(score, matched)`` tuple. ``matched`` is a single string with
        the matching keywords (lower-cased, input order, duplicates kept)
        joined by ``', '``. ``score`` is the number of unique matched
        keywords divided by the number of unique non-stopword tokens in the
        text; it is a fraction, not a value scaled to 100. ``(0, '')`` is
        returned when the text is not a string or contains only stopwords.
    """
    import re  # local import: the surrounding notebook cell only imports pandas

    if not isinstance(text, str):
        return 0, ''

    stopword_set = set(stopwords)
    text_parts = [part for part in text.lower().split() if part not in stopword_set]
    if not text_parts:
        return 0, ''
    text_without_stopwords = " ".join(text_parts)

    def occurs_as_whole_word(keyword):
        # A plain ``keyword in text`` test is a substring search, so "cat"
        # would match "concatenate". Requiring a non-word character (or the
        # string edge) on both sides keeps "risk" matching "risk," while
        # rejecting hits inside longer words.
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
        return re.search(pattern, text_without_stopwords) is not None

    lowered_keywords = (keyword.lower() for keyword in keywords)
    matched_keywords = [
        keyword
        for keyword in lowered_keywords
        if keyword not in stopword_set and occurs_as_whole_word(keyword)
    ]

    match_score_percentage = len(set(matched_keywords)) / len(set(text_parts))
    return match_score_percentage, ', '.join(matched_keywords)

# Words ignored on both the keyword side and the text side of the match
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file).
# The loop below reads its 'name', 'address', 'keywords' and 'Controller' columns.
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Replace 'path_to_your_excel_file.xlsx' with the actual path to your Excel file with multiple sheets
excel_file_path = 'path_to_your_excel_file.xlsx'

# Every input sheet gets a sheet of the same name in this output workbook
output_excel_path = 'matched_output.xlsx'

# Replace with the actual column name containing text in your Excel file
text_column = 'text_column'

# Open the source workbook once and close it deterministically. It is opened
# before the writer so a missing/unreadable input does not leave behind a
# freshly created output file.
with pd.ExcelFile(excel_file_path) as source_workbook, \
        pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for sheet_name in source_workbook.sheet_names:
        # Parse the current sheet from the already-open workbook
        sheet_data = pd.read_excel(source_workbook, sheet_name=sheet_name)

        # One output record per (keywords row, sheet row) pair
        matched_data = []

        for keywords_index, keywords_row in keywords_data.iterrows():
            name = keywords_row['name']
            address = keywords_row['address']

            # A blank cell is read as NaN (a float); treat it as "no keywords"
            # instead of crashing on .split().
            raw_keywords = keywords_row['keywords']
            keywords = raw_keywords.split() if isinstance(raw_keywords, str) else []

            # Depends only on the keywords row, so look it up once per
            # keywords row rather than once per sheet row.
            controller_value = keywords_row['Controller']

            print(f"Matching keywords for {name} - {address} in sheet '{sheet_name}': {keywords}")

            for row_index, row in sheet_data.iterrows():
                text = row[text_column]

                # The helper already returns the matched keywords as one
                # comma-separated string, so it is stored as-is.
                match_score_percentage, matched_keywords = calculate_match_scores_and_matched_keywords(keywords, text, stopwords)

                # Carry the sheet row's "horizontal" and "Section" values through
                # to the output (replace with the actual column names in your Excel file)
                horizontal_value = row['horizontal']
                section_value = row['Section']

                matched_data.append({
                    'name': name,
                    'address': address,
                    'keywords': raw_keywords,
                    'original_text': text,
                    'matched_keywords': matched_keywords,
                    'match_score %': match_score_percentage,
                    'horizontal': horizontal_value,
                    'Section': section_value,
                    'Controller': controller_value,
                })

        # Create a DataFrame from the matched data and store it under the
        # same sheet name as the input sheet
        matched_df = pd.DataFrame(matched_data)
        matched_df.to_excel(writer, sheet_name=sheet_name, index=False)

        # Display a message indicating that the data has been saved
        print(f"Matched data for sheet '{sheet_name}' has been saved to '{output_excel_path}'")
