In [3]:
import pandas as pd
import re
import os

In [4]:
# Input workbook names (edit these if the files are stored elsewhere)
material_file = 'all_files_initial_material.xlsx'
community_file = 'PWS_Community (2)(Sheet1).xlsx'


In [5]:
# Load the community workbook and keep only its first 60 rows.
df_community = pd.read_excel(community_file).head(60)

In [6]:
# Build the lookup set of community IDs from the 'PWSID' column
# (the community file is assumed to contain a column with that name).
# Values are cast to str before being collected; a set keeps membership checks fast.
pswid_set = set(df_community['PWSID'].astype(str).tolist())

In [7]:
 # Load the material workbook into a DataFrame.
df_material = pd.read_excel(io=material_file)

In [8]:
# Define a function to extract a PSWID from a given source file string.
# The regex matches the literal prefix "CT" followed by seven digits
# (e.g. "CT0070021"); other two-letter prefixes are NOT matched.
def extract_pswid(source_str):
    """Return the first ``CT`` + seven-digit identifier found in ``source_str``.

    Parameters
    ----------
    source_str : str
        Text to search, e.g. a source file name.

    Returns
    -------
    str or None
        The matched identifier, or None when the text contains no such
        identifier or when ``source_str`` is not a string (e.g. None or NaN).
    """
    # re.search raises TypeError on non-string input, so treat missing or
    # non-text values as "no identifier" instead of crashing.
    if not isinstance(source_str, str):
        return None
    match = re.search(r'CT\d{7}', source_str)
    return match.group(0) if match else None

In [9]:
# Cast the 'SOURCE FILE' column to str, then run extract_pswid on every value;
# the result is stored in a new 'extracted_pswid' column.
df_material['extracted_pswid'] = (
    df_material['SOURCE FILE']
    .astype(str)
    .map(extract_pswid)
)


In [10]:
# Flag rows whose extracted PSWID is present in the community set.
# Series.isin is vectorised and returns False for missing (None) IDs, so no
# per-row Python lambda is needed and the column is always boolean dtype.
df_material['is_match'] = df_material['extracted_pswid'].isin(pswid_set)


In [11]:

# Keep only the rows flagged as matches, then report the match count
# and show the columns relevant to the match.
matched_rows = df_material.loc[df_material['is_match']]
print(f"Found {len(matched_rows)} matches out of {len(df_material)} records")
print(matched_rows[['SOURCE FILE', 'extracted_pswid', 'is_match']])

Found 857751 matches out of 889636 records
                                              SOURCE FILE extracted_pswid  \
29          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
30          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
31          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
32          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
33          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
...                                                   ...             ...   
889628  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889629  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889630  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889631  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889632  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   

        is_match  
29          T

In [12]:
# Write the matched rows to 'matched_pswids.xlsx' without the index column.
matched_rows.to_excel(excel_writer='matched_pswids.xlsx', index=False)