In [None]:
import pandas as pd
import PyPDF2
import re

# Function to extract the text of a PDF file from a given page onwards
def extract_pdf_text(pdf_path, start_page=6):
    """Return the concatenated text of a PDF from ``start_page`` to the end.

    Args:
        pdf_path: Path of the PDF file to read.
        start_page: 1-based number of the first page to extract. Values
            below 1 are treated as 1.

    Returns:
        The text of every extracted page, each followed by a newline.
        An empty string when ``start_page`` is past the last page.
    """
    # Clamp so that a start_page of 0 or less cannot become a negative
    # page index (which would address pages from the end of the document).
    first_index = max(start_page - 1, 0)

    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which were deprecated and then removed in PyPDF2 3.0.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = pdf_reader.pages

        # Pages must be read while the file is still open.
        for page_num in range(first_index, len(pages)):
            # Defensive: treat a missing extraction result as empty text.
            page_text = pages[page_num].extract_text() or ''
            page_texts.append(page_text + '\n')  # Separate pages by newlines

    return ''.join(page_texts)

# Function to break a block of text into its paragraphs
def split_into_paragraphs(text):
    """Split ``text`` at every blank (or whitespace-only) line.

    A paragraph break is a newline, optional whitespace, then another
    newline; consecutive blank lines count as a single break. The pieces
    are returned as-is, so leading or trailing breaks produce empty strings.
    """
    paragraph_break = re.compile(r'\n\s*\n')
    paragraphs = paragraph_break.split(text)
    return paragraphs

# Function to calculate match scores and matched keywords while excluding stopwords
def calculate_match_scores_and_matched_keywords(keywords, text, stopwords):
    """Score how much of each keyword phrase appears in ``text``.

    Each keyword is lowercased and split on whitespace into parts; parts
    found in ``stopwords`` are ignored entirely. A part counts as matched
    when it occurs anywhere in the lowercased text (substring containment,
    not whole-word matching).

    Args:
        keywords: Iterable of keyword strings (single words or phrases).
        text: Text to search in.
        stopwords: Container of lowercase words to ignore.

    Returns:
        A ``(match_scores, matched_keywords_list)`` tuple with one entry per
        keyword, in input order. Each score is the fraction of non-stopword
        parts found in the text (0.0 when no parts remain after filtering);
        each matched-keywords entry is the found parts joined by spaces.
    """
    # Lowercase once so every containment check is case-insensitive.
    text_lower = text.lower()

    match_scores = []
    matched_keywords_list = []

    for keyword in keywords:
        # Drop stopwords before counting, so they affect neither the
        # numerator nor the denominator of the score.
        significant_parts = [
            part for part in keyword.lower().split() if part not in stopwords
        ]
        matched_parts = [part for part in significant_parts if part in text_lower]

        if significant_parts:
            match_score = len(matched_parts) / len(significant_parts)
        else:
            # Empty keyword, or one made up solely of stopwords.
            match_score = 0.0

        match_scores.append(match_score)
        matched_keywords_list.append(" ".join(matched_parts))

    return match_scores, matched_keywords_list

# Define a list of stopwords to exclude
stopwords = ["is", "and", "are"]

# Load the dataset with keywords (assuming it's in a separate Excel file)
keywords_excel_path = 'path_to_keywords_excel_file.xlsx'
keywords_data = pd.read_excel(keywords_excel_path)

# Column containing the keywords in the keywords Excel file
keyword_column = 'keywords'

# Extract text from the PDF file (extract_pdf_text starts at page 6 by default)
pdf_path = 'path_to_your_pdf_file.pdf'
pdf_text = extract_pdf_text(pdf_path)

# Split the extracted text into paragraphs
pdf_paragraphs = split_into_paragraphs(pdf_text)

# Prepare every keyword row once, up front, instead of re-splitting the same
# cells for each paragraph. Assumes 'name' and 'address' are columns in the
# keywords Excel file.
keyword_rows = []
for _, keywords_row in keywords_data.iterrows():
    raw_keywords = keywords_row[keyword_column]

    # A blank Excel cell is loaded as NaN (a float), which has no .split().
    if pd.isna(raw_keywords):
        continue

    # Keywords are separated by whitespace; str() also covers numeric cells.
    keywords = str(raw_keywords).split()

    # Skip rows with no keywords: nothing to match, and the percentage
    # below would otherwise divide by zero.
    if not keywords:
        continue

    keyword_rows.append(
        (keywords_row['name'], keywords_row['address'], raw_keywords, keywords)
    )

# Initialize an empty list to store the matched data
matched_data = []

# Iterate through all paragraphs and all keyword rows
for paragraph in pdf_paragraphs:
    for name, address, raw_keywords, keywords in keyword_rows:
        # Calculate match scores and matched keywords (excluding stopwords)
        match_scores, matched_keywords_list = calculate_match_scores_and_matched_keywords(
            keywords, paragraph, stopwords
        )

        # Average the per-keyword scores; keywords is non-empty here
        match_score_percentage = sum(match_scores) / len(keywords)

        # Append the matched data to the list
        matched_data.append({
            'name': name,
            'address': address,
            'keywords': raw_keywords,
            'original_paragraph': paragraph,
            'matched_keywords': ','.join(matched_keywords_list),
            'match_scores': ','.join([f'{score:.2f}' for score in match_scores]),
            'match_score %': f'{match_score_percentage * 100:.2f}%'
        })

# Create a DataFrame from the matched data
matched_df = pd.DataFrame(matched_data)

# Display the DataFrame with matched data
print(matched_df)