In [1]:
import pandas as pd
import re
import os

## Finding the PWSID

In [2]:
# Input workbooks (adjust the paths if the files live elsewhere).
# material_file: spreadsheet whose 'SOURCE FILE' column is searched for IDs.
material_file = 'all_files_initial_material.xlsx'
# community_file: spreadsheet whose 'PWSID' column supplies the reference IDs.
community_file = 'PWS_Community.xlsx'


In [3]:
# Load the community spreadsheet into a DataFrame
# (pd.read_excel reads the first sheet unless told otherwise).
df_community = pd.read_excel(community_file)

In [4]:
# Assumption: the community file has a column named 'PWSID'.
# Blank cells are dropped (otherwise they would become the string 'nan'), the
# remaining IDs are converted to strings with surrounding whitespace removed,
# and the result is stored in a set for fast membership checks.
pswid_set = set(df_community['PWSID'].dropna().astype(str).str.strip())

In [5]:
 # Load the material spreadsheet (the table whose 'SOURCE FILE' column is searched for IDs below).
df_material = pd.read_excel(material_file)

In [6]:
def extract_pswid(source_str):
    """Return the first 'CT'-prefixed ID found in ``source_str``.

    The pattern is the literal prefix 'CT' followed by seven digits; other
    two-letter prefixes are not recognised.

    Args:
        source_str: Text to search, e.g. a source file name. Non-string
            values (None, NaN, numbers) are treated as containing no ID.

    Returns:
        The matched ID as a string (e.g. 'CT0920242'), or None when
        ``source_str`` is not a string or contains no match.
    """
    # Guard first: re.search raises TypeError on non-string input.
    if not isinstance(source_str, str):
        return None
    match = re.search(r'CT\d{7}', source_str)
    return match.group(0) if match else None

In [7]:
# Coerce the 'SOURCE FILE' column to text, then run extract_pswid on each value;
# the result (an ID string or None) is stored in a new 'extracted_pswid' column.
source_names = df_material['SOURCE FILE'].astype(str)
df_material['extracted_pswid'] = source_names.apply(extract_pswid)


In [8]:
# Flag rows whose extracted ID is present in the community ID set.
# Series.isin does the membership test in one vectorised call; rows with no
# extracted ID (None) come out False because pswid_set holds only strings.
df_material['is_match'] = df_material['extracted_pswid'].isin(pswid_set)


In [9]:

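# Disabled alternative: the same membership check done with Series.isin and
# written to a separate 'Match' column. Left commented out.
#df_material['Match'] = df_material['extracted_pswid'].isin(pswid_set)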

In [10]:
# Write the full material table, including the 'extracted_pswid' and 'is_match'
# columns, to a new workbook without the row index, then report the file name.
output_file = 'matched_pswids.xlsx'
df_material.to_excel(excel_writer=output_file, index=False)
print(f"Updated DataFrame saved to {output_file}")

Updated DataFrame saved to matched_pswids.xlsx


## Filtering to show only the rows whose ID matches between the two files

In [11]:
# Keep only the rows flagged as matches, report how many there are, and show
# the columns relevant to the match.
match_mask = df_material['is_match']
matched_rows = df_material.loc[match_mask]
print(f"Found {len(matched_rows)} matches out of {len(df_material)} records")
print(matched_rows[['SOURCE FILE', 'extracted_pswid', 'is_match']])

Found 889468 matches out of 889624 records
                                              SOURCE FILE extracted_pswid  \
0        CT0920242_LCRR_Inventory_Initial_10 16 2024.xlsx       CT0920242   
1        CT0920242_LCRR_Inventory_Initial_10 16 2024.xlsx       CT0920242   
2        CT0920242_LCRR_Inventory_Initial_10 16 2024.xlsx       CT0920242   
3          CT0760021_LCRR_Inventory_Initial_10.16.24.xlsx       CT0760021   
4          CT0760021_LCRR_Inventory_Initial_10.16.24.xlsx       CT0760021   
...                                                   ...             ...   
889619  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889620  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889621     CT0550274_LCRR_Inventory_Initial_10-14-24.xlsx       CT0550274   
889622     CT0550274_LCRR_Inventory_Initial_10-14-24.xlsx       CT0550274   
889623     CT0550274_LCRR_Inventory_Initial_10-14-24.xlsx       CT0550274   

        is_match  
0           T

In [12]:
# Export only the matched rows to their own workbook, omitting the row index.
matched_rows.to_excel(excel_writer='matched_pswids_only.xlsx', index=False)