## First, cleaning the file that contains the initial material analysis of all service lines

In [10]:
import pandas as pd

def find_duplicates_in_single_file(file_path):
    """
    Reports the rows of an Excel sheet that share the same SITE ID and STREET ADDRESS.

    Both values are compared as trimmed, upper-cased strings joined into a single
    "SITE ID|STREET ADDRESS" key.

    Args:
        file_path (str): Path to the Excel file to analyze

    Returns:
        DataFrame: Every row whose key occurs more than once (sorted by key), including the
        helper columns SITE_ID_CLEAN, STREET_ADDRESS_CLEAN and SITE_ADDRESS_KEY.
        None if the file cannot be read or one of the two columns is missing.
    """
    print(f"Analyzing file: {file_path}")

    # Load the sheet; a failed read is reported and signalled with None
    try:
        frame = pd.read_excel(file_path)
    except Exception as read_error:
        print(f"Error reading file: {str(read_error)}")
        return None
    else:
        print(f"Successfully read file with {len(frame)} rows")

    # Locate the two key columns by header text (case-insensitive); the last match wins
    id_column = None
    address_column = None
    for header in frame.columns:
        if not isinstance(header, str):
            continue
        header_upper = header.upper()
        if header_upper == "SITE ID":
            id_column = header
        elif "STREET ADDRESS" in header_upper:
            address_column = header

    if id_column is None or address_column is None:
        print("Could not find SITE ID and/or STREET ADDRESS columns")
        print("Available columns:", frame.columns.tolist())
        return None

    print(f"Using columns: '{id_column}' and '{address_column}'")

    # Normalize both columns so that case and surrounding whitespace do not matter.
    # NOTE(review): astype(str) also stringifies missing cells, so blank values compare
    # equal to each other - confirm that this is intended.
    for clean_name, source_name in (('SITE_ID_CLEAN', id_column),
                                    ('STREET_ADDRESS_CLEAN', address_column)):
        frame[clean_name] = frame[source_name].astype(str).str.strip().str.upper()

    # One combined key per row
    frame['SITE_ADDRESS_KEY'] = frame['SITE_ID_CLEAN'] + '|' + frame['STREET_ADDRESS_CLEAN']

    # keep=False flags every member of a repeated key, not only the later ones
    repeated_mask = frame.duplicated(subset=['SITE_ADDRESS_KEY'], keep=False)
    duplicates = frame[repeated_mask].copy()

    # Group identical keys next to each other
    if not duplicates.empty:
        duplicates = duplicates.sort_values('SITE_ADDRESS_KEY')

    repeated_key_total = duplicates['SITE_ADDRESS_KEY'].nunique()
    repeated_row_total = len(duplicates)
    distinct_key_total = frame['SITE_ADDRESS_KEY'].nunique()

    print("\nResults:")
    print(f"Total rows in file: {len(frame)}")
    print(f"Unique SITE ID + STREET ADDRESS combinations: {distinct_key_total}")
    print(f"Number of duplicate combinations: {repeated_key_total}")
    print(f"Total rows that are duplicates: {repeated_row_total}")

    return duplicates

def save_results_to_excel(duplicates, original_df, output_file, deduplicated_file):
    """
    Saves the duplicate results to an Excel file and creates a deduplicated version

    The analysis workbook gets a "Summary" sheet and a "Duplicates" sheet in which every
    row carries a 1-based Duplicate_Group number. The deduplicated workbook keeps the rows
    of original_df in their original order, dropping every occurrence after the first of
    each key listed in duplicates. Neither input DataFrame is modified.

    Args:
        duplicates (DataFrame): DataFrame containing duplicates (needs a SITE_ADDRESS_KEY
            column); None or empty means no duplicates were found
        original_df (DataFrame): Original DataFrame with all data and a SITE_ADDRESS_KEY
            column; None skips the deduplicated file
        output_file (str): Path to save the output Excel file
        deduplicated_file (str): Path to save the deduplicated Excel file; None skips it
    """
    key_column = 'SITE_ADDRESS_KEY'
    # Columns that exist only to compare rows; they must not reach the output files
    helper_columns = ['SITE_ID_CLEAN', 'STREET_ADDRESS_CLEAN', key_column]
    can_write_deduplicated = original_df is not None and deduplicated_file is not None

    if duplicates is None or duplicates.empty:
        print("No duplicates to save")
        if can_write_deduplicated:
            # errors='ignore': the frame may or may not carry the helper columns here
            cleaned = original_df.drop(columns=helper_columns, errors='ignore')
            cleaned.to_excel(deduplicated_file, index=False)
            print(f"Original file saved as {deduplicated_file} (no duplicates found)")
        return

    # Prepare summary data
    unique_combinations = duplicates[key_column].nunique()
    summary_rows = [
        ("Total duplicate combinations", unique_combinations),
        ("Total duplicate rows", len(duplicates)),
    ]
    if original_df is not None:
        summary_rows.append(("Original row count", len(original_df)))
        summary_rows.append(("Deduplicated row count",
                             len(original_df) - len(duplicates) + unique_combinations))
    summary_rows.append(("Rows removed", len(duplicates) - unique_combinations))
    summary_df = pd.DataFrame(summary_rows, columns=["Metric", "Value"])

    # Original columns plus a group number; drop()/assign() return new frames, so the
    # caller's DataFrame is left untouched
    group_numbers = duplicates.groupby(key_column).ngroup() + 1
    export_df = duplicates.drop(columns=helper_columns, errors='ignore')
    export_df = export_df.assign(Duplicate_Group=group_numbers)

    # Save to Excel
    with pd.ExcelWriter(output_file) as writer:
        summary_df.to_excel(writer, sheet_name="Summary", index=False)
        export_df.to_excel(writer, sheet_name="Duplicates", index=False)

    print(f"Results saved to {output_file}")

    if not can_write_deduplicated:
        print("Deduplicated file not written: no original data or output path supplied")
        return

    # A row is removed only if its key is a known duplicate AND an earlier row already
    # carries that key. One vectorised mask replaces a full-frame scan per duplicate group.
    in_duplicate_group = original_df[key_column].isin(duplicates[key_column])
    later_occurrence = original_df.duplicated(subset=[key_column], keep='first')
    deduplicated_df = original_df.loc[~(in_duplicate_group & later_occurrence)]
    deduplicated_df = deduplicated_df.drop(columns=helper_columns, errors='ignore')

    # Save the deduplicated file
    deduplicated_df.to_excel(deduplicated_file, index=False)
    print(f"Deduplicated file saved to {deduplicated_file}")
    print(f"Original row count: {len(original_df)}")
    print(f"Deduplicated row count: {len(deduplicated_df)}")
    print(f"Rows removed: {len(original_df) - len(deduplicated_df)}")

if __name__ == "__main__":
    # Input workbook and the two workbooks this cell writes
    file_path = "all_files_initial_material.xlsx"
    output_file = "duplicate_analysis_results.xlsx"
    deduplicated_file = "all_files_initial_material_deduplicated.xlsx"

    # Find duplicates
    duplicates = find_duplicates_in_single_file(file_path)

    if duplicates is None:
        # The analysis has already printed why it stopped (unreadable file or missing
        # key column); reading the same file again would only fail the same way.
        print("Skipping deduplication because the duplicate analysis did not complete")
    else:
        # Read the original file again to create the deduplicated version
        try:
            original_df = pd.read_excel(file_path)

            # Find the SITE ID and STREET ADDRESS columns
            site_id_col = None
            street_address_col = None

            for col in original_df.columns:
                if isinstance(col, str):
                    if col.upper() == "SITE ID":
                        site_id_col = col
                    elif "STREET ADDRESS" in col.upper():
                        street_address_col = col

            if site_id_col is None or street_address_col is None:
                raise ValueError("Could not find SITE ID and/or STREET ADDRESS columns")

            # Clean the data for comparison (must match the key built by the duplicates function)
            original_df['SITE_ID_CLEAN'] = original_df[site_id_col].astype(str).str.strip().str.upper()
            original_df['STREET_ADDRESS_CLEAN'] = original_df[street_address_col].astype(str).str.strip().str.upper()
            original_df['SITE_ADDRESS_KEY'] = original_df['SITE_ID_CLEAN'] + '|' + original_df['STREET_ADDRESS_CLEAN']

            # Save results and create deduplicated file
            save_results_to_excel(duplicates, original_df, output_file, deduplicated_file)
        except Exception as e:
            print(f"Error processing file for deduplication: {str(e)}")
            # Fall back to just saving the duplicates analysis. The fallback is guarded
            # as well so that a second failure is reported instead of ending the cell
            # with a traceback.
            try:
                save_results_to_excel(duplicates, None, output_file, None)
            except Exception as fallback_error:
                print(f"Could not save the duplicates analysis: {str(fallback_error)}")

Analyzing file: all_files_initial_material.xlsx
Successfully read file with 889624 rows
Using columns: 'SITE ID' and 'STREET ADDRESS'

Results:
Total rows in file: 889624
Unique SITE ID + STREET ADDRESS combinations: 815561
Number of duplicate combinations: 72212
Total rows that are duplicates: 146275
Results saved to duplicate_analysis_results.xlsx
Deduplicated file saved to all_files_initial_material_deduplicated.xlsx
Original row count: 889624
Deduplicated row count: 815561
Rows removed: 74063


## Also finding the duplicates, this time for the file that has merged PSWID

In [11]:
def find_duplicates_in_single_file(file_path):
    """
    Reports the rows of an Excel sheet that share the same SITE ID and STREET ADDRESS.

    Both values are compared as trimmed, upper-cased strings joined into a single
    "SITE ID|STREET ADDRESS" key.

    Args:
        file_path (str): Path to the Excel file to analyze

    Returns:
        DataFrame: Every row whose key occurs more than once (sorted by key), including the
        helper columns SITE_ID_CLEAN, STREET_ADDRESS_CLEAN and SITE_ADDRESS_KEY.
        None if the file cannot be read or one of the two columns is missing.
    """
    print(f"Analyzing file: {file_path}")

    # Load the sheet; a failed read is reported and signalled with None
    try:
        frame = pd.read_excel(file_path)
    except Exception as read_error:
        print(f"Error reading file: {str(read_error)}")
        return None
    else:
        print(f"Successfully read file with {len(frame)} rows")

    # Locate the two key columns by header text (case-insensitive); the last match wins
    id_column = None
    address_column = None
    for header in frame.columns:
        if not isinstance(header, str):
            continue
        header_upper = header.upper()
        if header_upper == "SITE ID":
            id_column = header
        elif "STREET ADDRESS" in header_upper:
            address_column = header

    if id_column is None or address_column is None:
        print("Could not find SITE ID and/or STREET ADDRESS columns")
        print("Available columns:", frame.columns.tolist())
        return None

    print(f"Using columns: '{id_column}' and '{address_column}'")

    # Normalize both columns so that case and surrounding whitespace do not matter.
    # NOTE(review): astype(str) also stringifies missing cells, so blank values compare
    # equal to each other - confirm that this is intended.
    for clean_name, source_name in (('SITE_ID_CLEAN', id_column),
                                    ('STREET_ADDRESS_CLEAN', address_column)):
        frame[clean_name] = frame[source_name].astype(str).str.strip().str.upper()

    # One combined key per row
    frame['SITE_ADDRESS_KEY'] = frame['SITE_ID_CLEAN'] + '|' + frame['STREET_ADDRESS_CLEAN']

    # keep=False flags every member of a repeated key, not only the later ones
    repeated_mask = frame.duplicated(subset=['SITE_ADDRESS_KEY'], keep=False)
    duplicates = frame[repeated_mask].copy()

    # Group identical keys next to each other
    if not duplicates.empty:
        duplicates = duplicates.sort_values('SITE_ADDRESS_KEY')

    repeated_key_total = duplicates['SITE_ADDRESS_KEY'].nunique()
    repeated_row_total = len(duplicates)
    distinct_key_total = frame['SITE_ADDRESS_KEY'].nunique()

    print("\nResults:")
    print(f"Total rows in file: {len(frame)}")
    print(f"Unique SITE ID + STREET ADDRESS combinations: {distinct_key_total}")
    print(f"Number of duplicate combinations: {repeated_key_total}")
    print(f"Total rows that are duplicates: {repeated_row_total}")

    return duplicates

def save_results_to_excel(duplicates, original_df, output_file, deduplicated_file):
    """
    Saves the duplicate results to an Excel file and creates a deduplicated version

    The analysis workbook gets a "Summary" sheet and a "Duplicates" sheet in which every
    row carries a 1-based Duplicate_Group number. The deduplicated workbook keeps the rows
    of original_df in their original order, dropping every occurrence after the first of
    each key listed in duplicates. Neither input DataFrame is modified.

    Args:
        duplicates (DataFrame): DataFrame containing duplicates (needs a SITE_ADDRESS_KEY
            column); None or empty means no duplicates were found
        original_df (DataFrame): Original DataFrame with all data and a SITE_ADDRESS_KEY
            column; None skips the deduplicated file
        output_file (str): Path to save the output Excel file
        deduplicated_file (str): Path to save the deduplicated Excel file; None skips it
    """
    key_column = 'SITE_ADDRESS_KEY'
    # Columns that exist only to compare rows; they must not reach the output files
    helper_columns = ['SITE_ID_CLEAN', 'STREET_ADDRESS_CLEAN', key_column]
    can_write_deduplicated = original_df is not None and deduplicated_file is not None

    if duplicates is None or duplicates.empty:
        print("No duplicates to save")
        if can_write_deduplicated:
            # errors='ignore': the frame may or may not carry the helper columns here
            cleaned = original_df.drop(columns=helper_columns, errors='ignore')
            cleaned.to_excel(deduplicated_file, index=False)
            print(f"Original file saved as {deduplicated_file} (no duplicates found)")
        return

    # Prepare summary data
    unique_combinations = duplicates[key_column].nunique()
    summary_rows = [
        ("Total duplicate combinations", unique_combinations),
        ("Total duplicate rows", len(duplicates)),
    ]
    if original_df is not None:
        summary_rows.append(("Original row count", len(original_df)))
        summary_rows.append(("Deduplicated row count",
                             len(original_df) - len(duplicates) + unique_combinations))
    summary_rows.append(("Rows removed", len(duplicates) - unique_combinations))
    summary_df = pd.DataFrame(summary_rows, columns=["Metric", "Value"])

    # Original columns plus a group number; drop()/assign() return new frames, so the
    # caller's DataFrame is left untouched
    group_numbers = duplicates.groupby(key_column).ngroup() + 1
    export_df = duplicates.drop(columns=helper_columns, errors='ignore')
    export_df = export_df.assign(Duplicate_Group=group_numbers)

    # Save to Excel
    with pd.ExcelWriter(output_file) as writer:
        summary_df.to_excel(writer, sheet_name="Summary", index=False)
        export_df.to_excel(writer, sheet_name="Duplicates", index=False)

    print(f"Results saved to {output_file}")

    if not can_write_deduplicated:
        print("Deduplicated file not written: no original data or output path supplied")
        return

    # A row is removed only if its key is a known duplicate AND an earlier row already
    # carries that key. One vectorised mask replaces a full-frame scan per duplicate group.
    in_duplicate_group = original_df[key_column].isin(duplicates[key_column])
    later_occurrence = original_df.duplicated(subset=[key_column], keep='first')
    deduplicated_df = original_df.loc[~(in_duplicate_group & later_occurrence)]
    deduplicated_df = deduplicated_df.drop(columns=helper_columns, errors='ignore')

    # Save the deduplicated file
    deduplicated_df.to_excel(deduplicated_file, index=False)
    print(f"Deduplicated file saved to {deduplicated_file}")
    print(f"Original row count: {len(original_df)}")
    print(f"Deduplicated row count: {len(deduplicated_df)}")
    print(f"Rows removed: {len(original_df) - len(deduplicated_df)}")

if __name__ == "__main__":
    # Input workbook and the two workbooks this cell writes
    file_path = "merged_output.xlsx"
    output_file = "duplicate_analysis_results_merged.xlsx"
    deduplicated_file = "merged_deduplicated.xlsx"

    # Find duplicates
    duplicates = find_duplicates_in_single_file(file_path)

    if duplicates is None:
        # The analysis has already printed why it stopped (unreadable file or missing
        # key column); reading the same file again would only fail the same way.
        print("Skipping deduplication because the duplicate analysis did not complete")
    else:
        # Read the original file again to create the deduplicated version
        try:
            original_df = pd.read_excel(file_path)

            # Find the SITE ID and STREET ADDRESS columns
            site_id_col = None
            street_address_col = None

            for col in original_df.columns:
                if isinstance(col, str):
                    if col.upper() == "SITE ID":
                        site_id_col = col
                    elif "STREET ADDRESS" in col.upper():
                        street_address_col = col

            if site_id_col is None or street_address_col is None:
                raise ValueError("Could not find SITE ID and/or STREET ADDRESS columns")

            # Clean the data for comparison (must match the key built by the duplicates function)
            original_df['SITE_ID_CLEAN'] = original_df[site_id_col].astype(str).str.strip().str.upper()
            original_df['STREET_ADDRESS_CLEAN'] = original_df[street_address_col].astype(str).str.strip().str.upper()
            original_df['SITE_ADDRESS_KEY'] = original_df['SITE_ID_CLEAN'] + '|' + original_df['STREET_ADDRESS_CLEAN']

            # Save results and create deduplicated file
            save_results_to_excel(duplicates, original_df, output_file, deduplicated_file)
        except Exception as e:
            print(f"Error processing file for deduplication: {str(e)}")
            # Fall back to just saving the duplicates analysis. The fallback is guarded
            # as well so that a second failure is reported instead of ending the cell
            # with a traceback.
            try:
                save_results_to_excel(duplicates, None, output_file, None)
            except Exception as fallback_error:
                print(f"Could not save the duplicates analysis: {str(fallback_error)}")

Analyzing file: merged_output.xlsx
Successfully read file with 857751 rows
Using columns: 'SITE ID' and 'STREET ADDRESS'

Results:
Total rows in file: 857751
Unique SITE ID + STREET ADDRESS combinations: 784959
Number of duplicate combinations: 71730
Total rows that are duplicates: 144522
Results saved to duplicate_analysis_results_merged.xlsx
Deduplicated file saved to merged_deduplicated.xlsx
Original row count: 857751
Deduplicated row count: 784959
Rows removed: 72792
