In [None]:
import pandas as pd
import fitz  # PyMuPDF for PDF extraction

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no separator. Empty string if the document has no pages.
    """
    # Open the document as a context manager so it (and its underlying file
    # handle) is closed even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() consumes the generator before the ``with`` block exits, so
        # every page is read while the document is still open.
        return ''.join(page.get_text() for page in doc)

# Function to calculate match scores for each keyword
def calculate_match_scores(keywords, text):
    """Score how much of each keyword phrase appears in *text*.

    Each keyword is lower-cased and split on whitespace into parts. Its
    score is the fraction of those parts that occur as a substring of the
    lower-cased text (so matching is case-insensitive and not restricted
    to whole words). A keyword with no parts scores 0.0.

    Args:
        keywords: Iterable of keyword strings.
        text: The text to search in.

    Returns:
        A list of floats, one per keyword, in the same order as *keywords*.
    """
    haystack = text.lower()

    def _score(phrase):
        # Fraction of the phrase's whitespace-separated tokens found in the text.
        tokens = phrase.lower().split()
        if not tokens:
            return 0.0
        hits = len([token for token in tokens if token in haystack])
        return hits / len(tokens)

    return [_score(phrase) for phrase in keywords]

# Load the dataset with keywords (assumed to live in a separate Excel file).
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Column of the keywords Excel file that holds the space-separated keywords.
keyword_column = 'keywords'

# Every row is matched against the same PDF, so extract its text once up
# front instead of re-opening and re-parsing the file on each iteration.
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_text_from_pdf(pdf_path)

# One result dict per row of the keywords sheet.
matched_data = []

# Match each row's keywords against the PDF text.
for keywords_index, keywords_row in keywords_data.iterrows():
    name = keywords_row['name']  # Assumes a 'name' column exists in the sheet
    address = keywords_row['address']  # Assumes an 'address' column exists in the sheet

    # An empty Excel cell comes back from pandas as NaN (a float), which has
    # no .split(); treat it as "no keywords" rather than crashing mid-run.
    # str() likewise lets a purely numeric cell be handled as one keyword.
    raw_keywords = keywords_row[keyword_column]
    keywords = [] if pd.isna(raw_keywords) else str(raw_keywords).split()

    print(f"Matching keywords for {name} - {address}: {keywords}")

    # One score per keyword, in the same order as `keywords`.
    match_scores = calculate_match_scores(keywords, pdf_text)

    matched_data.append({
        'name': name,
        'address': address,
        'keywords': raw_keywords,
        # Scores are stored as a comma-separated string, two decimals each.
        'match_scores': ','.join(f'{score:.2f}' for score in match_scores),
    })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)
