In [12]:
# --- Import the Salary Data ---

# Import essential libraries
import pandas as pd              # for working with tabular data
from pathlib import Path          # for handling file paths safely across OS

# Dataset location, expressed relative to the notebook's working directory
DATA_PATH = Path('../data')                     # folder holding the raw data
CSV_FILE = DATA_PATH.joinpath('Salaries.csv')   # the salary CSV inside that folder

# Echo the resolved path so a wrong working directory is easy to spot
print('Setup complete! Looking for file at:', CSV_FILE)


Setup complete! Looking for file at: ..\data\Salaries.csv


In [16]:
# --- Load and clean the salary dataset (robust version) ---

class DataLoadError(Exception):
    """Signals that the salary data file could not be located or read."""


def load_salary_data(csv_path: Path) -> pd.DataFrame:
    """
    Read the salary CSV and return a cleaned pandas DataFrame.

    Cleaning steps (each applied only to columns that are present):
    - Placeholder strings ("Not Provided", "None", "", "-", "N/A") are read
      as missing values.
    - EmployeeName / JobTitle / Agency are stripped of surrounding
      whitespace; missing values stay missing instead of becoming the
      literal text "nan".
    - Pay columns have "$" and "," removed and are converted with
      errors="coerce", so entries that cannot be parsed become NaN.
    - Id / Year are converted to the nullable Int64 dtype.

    A one-line summary (row/column counts, plus the number of missing
    BasePay values when that column exists) is printed before returning.

    Parameters
    ----------
    csv_path : Path
        Location of the CSV file. A plain string path is accepted as well.

    Returns
    -------
    pd.DataFrame
        The cleaned salary table.

    Raises
    ------
    DataLoadError
        If ``csv_path`` is not an existing file, or pandas fails to read it.
    """
    # Accept str as well as Path; .is_file() and .name below need a Path.
    csv_path = Path(csv_path)

    # 1) Sanity check: the path must point at a real file (a directory is not enough)
    if not csv_path.is_file():
        raise DataLoadError(f"⚠️ Data file not found at: {csv_path}")

    # 2) Read CSV with sensible defaults
    try:
        df = pd.read_csv(
            csv_path,
            low_memory=False,                         # better dtype inference
            na_values=["Not Provided", "None", "", "-", "N/A"],  # treat these as NaN
        )
    except Exception as e:
        # Chain explicitly so the underlying pandas/OS error stays in the traceback.
        raise DataLoadError(f"❌ Could not read CSV file. Error: {e}") from e

    # 3) Tidy up a few text fields (nice-to-have)
    for col in ("EmployeeName", "JobTitle", "Agency"):
        if col in df.columns:
            raw = df[col]
            # astype(str) would turn NaN into the string "nan"; .where() puts the
            # missing markers back so isna()/dropna() keep working on these columns.
            df[col] = raw.astype(str).str.strip().where(raw.notna())

    # 4) Clean + convert numeric columns
    numeric_cols = [
        "BasePay", "OvertimePay", "OtherPay", "Benefits",
        "TotalPay", "TotalPayBenefits"
    ]
    for col in numeric_cols:
        if col in df.columns:
            # remove currency symbols and commas, then coerce to numeric
            # (the "nan" text produced by astype(str) is parsed back to NaN here)
            cleaned = (
                df[col]
                .astype(str)
                .str.replace(r"[\$,]", "", regex=True)
                .str.strip()
            )
            df[col] = pd.to_numeric(cleaned, errors="coerce")

    # 5) Cast integer-like columns to nullable integer (keeps NaN)
    for col in ("Id", "Year"):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    # 6) Friendly confirmation
    loaded_msg = f"✅ Loaded {len(df):,} rows and {df.shape[1]} columns from '{csv_path.name}'"
    if "BasePay" in df.columns:
        loaded_msg += f" | BasePay missing: {int(df['BasePay'].isna().sum()):,}"
    print(loaded_msg)

    return df


In [17]:
# --- Load the data and preview it ---

# Read and clean the salary file configured in the setup cell
df = load_salary_data(csv_path=CSV_FILE)

# Preview the top of the frame as a quick sanity check on the load
print('Showing the first 5 rows:')
df.head(5)

✅ Loaded 312,882 rows and 9 columns from 'Salaries.csv' | BasePay missing: 606
Showing the first 5 rows:


Unnamed: 0,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year
0,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011
1,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011
2,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011
3,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011
4,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011


In [None]:
D