In [7]:
import pandas as pd
import re
import os

## Finding the PWSID

In [8]:
# Input workbooks, given as paths relative to the current working directory.
community_file = "PWS_Community.xlsx"
material_file = "all_files_initial_material.xlsx"


In [9]:
# Load the community workbook and keep only its first 60 data rows.
df_community = pd.read_excel(community_file).head(60)

In [10]:
# Print a heading line followed by the truncated community frame.
print("Community DataFrame:", df_community, sep="\n")

Community DataFrame:
        PWSID                               SYSTEM NAME PRINCIPAL CITY SERVED  \
0   CT0930011                  Regional Water Authority             New Haven   
1   CT0150011       Aquarion Water Co of CT-Main System            Bridgeport   
2   CT0640011          Metropolitan District Commission              Hartford   
3   CT0473011        CTWC - Northern Reg-Western System          East Windsor   
4   CT1510011                Waterbury Water Department             Waterbury   
5   CT1350011          Aquarion Water Co of CT-Stamford              Stamford   
6   CT0800011                    Meriden Water Division               Meriden   
7   CT0890011              New Britain Water Department           New Britain   
8   CT0570011         Aquarion Water Co of CT-Greenwich             Greenwich   
9   CT0770021               Manchester Water Department            Manchester   
10  CT0170011                  Bristol Water Department               Bristol   
11  CT0

In [11]:
# The community sheet stores the identifier in the 'PWSID' column.
# Blank cells are dropped first so that a missing ID does not end up in the
# set as the literal string 'nan'; surrounding whitespace is stripped so that
# padded cells still compare equal to the IDs extracted from file names.
# A set is used for fast membership tests.
pswid_set = set(df_community['PWSID'].dropna().astype(str).str.strip())

In [12]:
 # Load the material inventory workbook into a DataFrame.
df_material = pd.read_excel(io=material_file)

In [13]:
def extract_pswid(source_str):
    """Return the first CT-prefixed PWSID found in ``source_str``.

    The pattern is the literal prefix ``CT`` followed by seven digits
    (e.g. ``CT0070021``); other two-letter prefixes are not matched.

    Parameters
    ----------
    source_str : str
        Text to search, such as a source file name.

    Returns
    -------
    str or None
        The matched ID, or ``None`` when nothing matches or when the input
        is not a string (for example a missing/NaN cell).
    """
    # Guard against non-string cells so that None/NaN yields None instead of
    # raising TypeError inside re.search.
    if not isinstance(source_str, str):
        return None
    match = re.search(r'CT\d{7}', source_str)
    return match.group(0) if match else None

In [14]:
# Coerce the 'SOURCE FILE' column to text, then pull the PWSID out of each
# entry with extract_pswid.
df_material['extracted_pswid'] = (
    df_material['SOURCE FILE']
    .astype(str)
    .map(extract_pswid)
)


In [15]:
# Flag rows whose extracted PWSID appears in the community set.
# Series.isin is vectorized and reports missing values (None) as
# non-members, so no per-row Python lambda is needed.
df_material['is_match'] = df_material['extracted_pswid'].isin(pswid_set)


In [16]:

# Optional vectorized membership check (left disabled); it would write the result to a separate 'Match' column:
#df_material['Match'] = df_material['extracted_pswid'].isin(pswid_set)

In [17]:
# Write the annotated material frame (index omitted) to a new workbook and
# confirm where it went.
output_file = "matched_pswids.xlsx"
df_material.to_excel(excel_writer=output_file, index=False)
print(f"Updated DataFrame saved to {output_file}")

Updated DataFrame saved to matched_pswids.xlsx


## Filtering to show only the rows whose ID matches between the two files

In [18]:
# Keep only the rows flagged as matches, report the counts, and preview the
# identifying columns.
matched_rows = df_material.loc[df_material['is_match']]
print(f"Found {len(matched_rows)} matches out of {len(df_material)} records")
print(matched_rows.loc[:, ['SOURCE FILE', 'extracted_pswid', 'is_match']])

Found 857751 matches out of 889624 records
                                              SOURCE FILE extracted_pswid  \
29          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
30          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
31          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
32          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
33          CT0070021_LCRR_Inventory_Initial_10.7.24.xlsx       CT0070021   
...                                                   ...             ...   
889616  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889617  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889618  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889619  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   
889620  CT0640011_LCRR_Inventory_Initial_10.16.24_Hart...       CT0640011   

        is_match  
29          T

In [19]:
# Persist just the matched rows (index omitted) to their own workbook.
matched_rows.to_excel(excel_writer='matched_pswids_only.xlsx', index=False)

## Matching the IDs and getting the system names

In [20]:

# Load the matched-rows workbook as the base table for the name lookup.
# NOTE(review): this is the file written from matched_rows earlier in the
# notebook; reusing the in-memory frame would skip the Excel round trip.
# Confirm nothing else edits the file before changing this.
df_base = pd.read_excel(io="matched_pswids_only.xlsx")

In [21]:

# Load the community workbook again, without a row limit, as the lookup table.
# No sheet name is given, so the first sheet (index 0) is read.
df_lookup = pd.read_excel("PWS_Community.xlsx", sheet_name=0)

In [22]:
# Attach each system's name by joining the base table's 'extracted_pswid'
# to the lookup table's 'PWSID'.
# The lookup is de-duplicated on 'PWSID' first (keeping the first occurrence):
# in a left join, a key repeated on the right side emits one output row per
# repeat, which would silently inflate the row count of df_merged.
df_merged = df_base.merge(
    df_lookup[['PWSID', 'SYSTEM NAME']].drop_duplicates(subset='PWSID'),
    left_on='extracted_pswid',  # key column in df_base
    right_on='PWSID',           # key column in df_lookup
    how='left',                 # keep every base row, matched or not
)

In [23]:
# Write the name-enriched table (index omitted) to the final workbook.
df_merged.to_excel(excel_writer="merged_output.xlsx", index=False)