In [7]:
import pandas as pd
import numpy as np

In [8]:
def find_duplicates(df: pd.DataFrame, subset_cols=None) -> pd.DataFrame:
    """
    Return every row of df that belongs to a group of duplicated rows.

    Args:
        df: Frame to inspect.
        subset_cols: Column labels used to compare rows. When None, all
            columns of df are used.

    Returns:
        The rows of df flagged as duplicates. Every member of a duplicate
        group is included, the first occurrence as well.
    """
    # No explicit key columns given: compare rows on every column.
    key_cols = df.columns.tolist() if subset_cols is None else subset_cols
    # keep=False marks all occurrences, not only the repeats after the first.
    is_duplicate = df.duplicated(subset=key_cols, keep=False)
    return df[is_duplicate]


In [9]:
def drop_duplicates(df: pd.DataFrame, subset_cols=None) -> pd.DataFrame:
    """
    Remove repeated rows from df, retaining the first row of each group.

    Args:
        df: Frame to de-duplicate.
        subset_cols: Column labels used to compare rows; passed straight to
            pandas as the ``subset`` argument (None means all columns).

    Returns:
        The de-duplicated frame produced by ``DataFrame.drop_duplicates``.
    """
    unique_rows = df.drop_duplicates(subset=subset_cols, keep='first')
    return unique_rows


In [10]:

def save_excel(df: pd.DataFrame, path: str, sheet_name: str = None):
    """
    Write df to an Excel workbook at path, without the index column.

    Args:
        df: Frame to write.
        path: Destination file path.
        sheet_name: Optional worksheet name. When empty or None the frame is
            written with pandas' default sheet name.

    Prints a "Saved <path>" line once the write has finished.
    """
    if not sheet_name:
        # No custom sheet name: let pandas open and close the workbook itself.
        df.to_excel(path, index=False)
    else:
        # Named sheet: go through an explicit writer, closed on leaving the block.
        with pd.ExcelWriter(path) as workbook:
            df.to_excel(workbook, index=False, sheet_name=sheet_name)
    print(f"Saved {path}")

In [11]:
def pipeline_initial(input_path: str, duplicates_path: str, dedup_path: str, subset_cols=None) -> pd.DataFrame:
    """
    Run duplicate detection and removal on the initial workbook.

    Args:
        input_path: Excel file to read.
        duplicates_path: Excel file that receives the duplicate rows, written
            to a sheet named 'duplicates'.
        dedup_path: Excel file that receives the de-duplicated rows.
        subset_cols: Column labels used to compare rows; forwarded to
            find_duplicates and drop_duplicates (None means all columns).

    Returns:
        The de-duplicated frame that was written to dedup_path.
    """
    # 1) Load initial data
    df = pd.read_excel(input_path)
    # 2) Find duplicates and write them out for inspection
    dup = find_duplicates(df, subset_cols)
    save_excel(dup, duplicates_path, sheet_name='duplicates')
    # 3) Drop duplicates (first occurrence of each group is retained) and save
    dedup = drop_duplicates(df, subset_cols)
    save_excel(dedup, dedup_path)
    return dedup

In [12]:
def pipeline_merged(input_path: str, duplicates_path: str, dedup_path: str, subset_cols=None) -> pd.DataFrame:
    """
    Run duplicate detection and removal on the merged workbook.

    Args:
        input_path: Excel file to read.
        duplicates_path: Excel file that receives the duplicate rows, written
            to a sheet named 'duplicates_merged'.
        dedup_path: Excel file that receives the de-duplicated rows.
        subset_cols: Column labels used to compare rows; forwarded to
            find_duplicates and drop_duplicates (None means all columns).

    Returns:
        The de-duplicated frame that was written to dedup_path.
    """
    merged = pd.read_excel(input_path)

    # Report every row that is part of a duplicate group.
    repeated_rows = find_duplicates(merged, subset_cols)
    save_excel(repeated_rows, duplicates_path, sheet_name='duplicates_merged')

    # Persist the frame with duplicates removed and hand it back to the caller.
    unique_rows = drop_duplicates(merged, subset_cols)
    save_excel(unique_rows, dedup_path)
    return unique_rows

In [13]:
def simplify_blanks(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Replace blanks or NaNs in a column with 'N/A'.

    Values equal to None, NaN or the empty string '' are replaced; every
    other value in the column, and every other column, is left as is.

    Args:
        df: Source frame. It is not modified.
        column: Label of the column to clean.

    Returns:
        A copy of df in which the matching values of column are 'N/A'.

    Raises:
        KeyError: If column is not present in df.
    """
    # Work on a copy: assigning into df directly would alter the caller's
    # frame and, when df is a filtered result (e.g. the output of
    # drop_duplicates), can trigger pandas' SettingWithCopyWarning.
    result = df.copy()
    result[column] = result[column].replace([None, np.nan, ''], 'N/A')
    return result

In [14]:
def main():
    """
    Run the full workflow: de-duplicate the initial workbook, de-duplicate
    the merged workbook, then fill blanks in the merged result and save it.

    All input and output file names are fixed inside this function.
    """
    # Column whose blank cells are replaced in the final output.
    blank_column = 'Entire Service Line Material Classification'
    # Workbook that receives the blank-filled, de-duplicated merged data.
    final_simplified = "merged_deduplicated_simplified.xlsx"

    # Pass 1: initial workbook. The returned frame is not needed afterwards.
    print("Running initial duplicate detection...")
    pipeline_initial(
        input_path="all_files_initial_material.xlsx",
        duplicates_path="duplicates.xlsx",
        dedup_path="deduplicated.xlsx",
    )

    # Pass 2: merged workbook. Keep the de-duplicated frame for the last step.
    print("Running merged file duplicate detection...")
    merged_unique = pipeline_merged(
        input_path="merged_output.xlsx",
        duplicates_path="duplicate_analysis_results_merged.xlsx",
        dedup_path="merged_deduplicated.xlsx",
    )

    # Final step: fill blanks in the classification column and save.
    print("Filling blanks in classification column...")
    merged_filled = simplify_blanks(merged_unique, blank_column)
    save_excel(merged_filled, final_simplified)

    print("Pipeline complete. Final file:", final_simplified)


In [15]:
# Entry point: call main() when this code runs as the top-level module
# (`__name__` is '__main__'); importing it elsewhere does not run the pipeline.
if __name__ == '__main__':
    main()

Running initial duplicate detection...
Saved duplicates.xlsx
Saved deduplicated.xlsx
Running merged file duplicate detection...
Saved duplicate_analysis_results_merged.xlsx


KeyboardInterrupt: 