In [7]:
# -*- coding: utf-8 -*-
"""
Notebook 02: Building Unified Entity Profiles

Purpose:
  1. Load the cleaned individual source datasets produced by Notebook 01.
  2. Define the central entity ('Taxpayer') and establish a base list of all
     unique entities across sources.
  3. Join/Merge the different source datasets based on the common identifier
     ('Taxpayer ID').
  4. Handle one-to-many relationships (e.g., multiple properties or directorships
     per taxpayer) through aggregation (calculating counts, sums, means, etc.).
  5. Produce a single, unified DataFrame where each row represents a unique
     taxpayer profile with combined information from all available sources.

Prerequisites:
  - Successful completion of Notebook 01.
  - Existence of cleaned data files in './data/cleaned':
    - 'cleaned_tax_filings.csv'
    - 'cleaned_property_ownership.csv'
    - (Optional) 'cleaned_company_directorships.csv'

Outputs:
  - A Pandas DataFrame (`unified_profile_df`) containing the unified profiles.
  - This DataFrame saved to a CSV file (e.g., 'unified_taxpayer_profiles.csv').

Next Step:
  Notebook 03 will perform feature engineering on this unified profile dataset.
"""

import pandas as pd
import numpy as np
import os

# --- Configuration ---
# Notebook 01 is assumed to have saved its cleaned outputs in the 'cleaned'
# subdirectory; the unified profile produced here goes into 'processed'.
CLEANED_DATA_DIR = './data/cleaned'
OUTPUT_DIR = './data/processed'

# Input file paths, all located under CLEANED_DATA_DIR. The company
# directorships file is optional.
TAX_FILE, PROP_FILE, COMP_FILE = (
    os.path.join(CLEANED_DATA_DIR, file_name)
    for file_name in (
        'cleaned_tax_filings.csv',
        'cleaned_property_ownership.csv',
        'cleaned_company_directorships.csv',
    )
)

# Make sure the output location exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Startup banner showing where data is read from and written to.
print(
    "Notebook 02: Building Unified Entity Profiles",
    "-" * 50,
    f"Looking for cleaned data in: {CLEANED_DATA_DIR}",
    f"Output directory for unified profile: {OUTPUT_DIR}",
    "-" * 50,
    sep="\n",
)

# =============================================================================
# 1. Load Cleaned Data Sources
# =============================================================================
print("\n[1. Loading Cleaned Data Sources]")

# Read the join key as text directly from the CSV instead of letting pandas
# infer a numeric type and converting afterwards. Inference is lossy for
# identifiers: leading zeros are dropped when a column is parsed as integers,
# and a column parsed as float renders 123 as '123.0', so the same taxpayer
# could end up with different keys in different sources and fail to join.
_ID_AS_TEXT = {'Taxpayer ID': str}

# Tax filings are required: report the missing file, then re-raise.
try:
    tax_df_cleaned = pd.read_csv(TAX_FILE, dtype=_ID_AS_TEXT)
    print(f"Successfully loaded cleaned Tax Filings data: {tax_df_cleaned.shape}")
except FileNotFoundError:
    print(f"ERROR: Cleaned Tax Filings file not found at {TAX_FILE}.")
    print("Please ensure Notebook 01 was run successfully and saved the cleaned files.")
    raise

# Property ownership is required too; 'Ownership Date' is parsed as a date
# on load so that min/max aggregation later works on real timestamps.
try:
    property_df_cleaned = pd.read_csv(
        PROP_FILE,
        dtype=_ID_AS_TEXT,
        parse_dates=['Ownership Date'],
    )
    print(f"Successfully loaded cleaned Property Ownership data: {property_df_cleaned.shape}")
except FileNotFoundError:
    print(f"ERROR: Cleaned Property Ownership file not found at {PROP_FILE}.")
    print("Please ensure Notebook 01 was run successfully and saved the cleaned files.")
    raise

# Company directorships are optional: a missing or unreadable file only
# produces a message and leaves company_df_cleaned as None, which the later
# sections check before using it.
company_df_cleaned = None
if os.path.exists(COMP_FILE):
    try:
        company_df_cleaned = pd.read_csv(COMP_FILE, dtype=_ID_AS_TEXT)
        print(f"Successfully loaded cleaned Company Directorships data: {company_df_cleaned.shape}")
    except Exception as e:
        print(f"Warning: Could not load cleaned Company Directorships file at {COMP_FILE}. Error: {e}")
else:
    print(f"Info: Optional cleaned Company Directorships file not found at {COMP_FILE}. Skipping.")

# Ensure Taxpayer ID is string in all loaded dataframes (a no-op for values
# that were already read as text above).
tax_df_cleaned['Taxpayer ID'] = tax_df_cleaned['Taxpayer ID'].astype(str)
property_df_cleaned['Taxpayer ID'] = property_df_cleaned['Taxpayer ID'].astype(str)
if company_df_cleaned is not None:
    company_df_cleaned['Taxpayer ID'] = company_df_cleaned['Taxpayer ID'].astype(str)

# =============================================================================
# 2. Define Central Entity and Create Base DataFrame
# =============================================================================
print("\n[2. Define Central Entity and Create Base DataFrame]")
print("Central Entity: Taxpayer")

# Gather the 'Taxpayer ID' column of every loaded source (the company data
# only when it was loaded) and stack them into one Series.
id_columns = [
    pd.Series(dtype=str),
    tax_df_cleaned['Taxpayer ID'],
    property_df_cleaned['Taxpayer ID'],
]
if company_df_cleaned is not None:
    id_columns.append(company_df_cleaned['Taxpayer ID'])
all_ids = pd.concat(id_columns, ignore_index=True)

# De-duplicate, keeping IDs in order of first appearance.
unique_taxpayer_ids = all_ids.unique()
print(f"Found {len(unique_taxpayer_ids)} unique Taxpayer IDs across all sources.")

# One row per taxpayer; every later merge is a left join onto this frame.
base_df = pd.DataFrame({'Taxpayer ID': unique_taxpayer_ids})
print("Created base DataFrame with all unique Taxpayer IDs.")

# =============================================================================
# 3. Process & Merge Tax Filing Data (One-to-One)
# =============================================================================
print("\n[3. Process & Merge Tax Filing Data]")

# Taxpayer ID is assumed to be unique in the cleaned tax data (handled in
# Notebook 01). Only the columns needed for the profile are carried over.
tax_data_to_merge = tax_df_cleaned[['Taxpayer ID', 'Declared Income', 'Deductions', 'Sector']].copy()

# Left merge keeps every taxpayer from base_df. validate='one_to_one' turns
# the uniqueness assumption into a check: if the tax data contains duplicate
# Taxpayer IDs, pandas raises MergeError instead of silently duplicating
# profile rows and breaking the one-row-per-taxpayer guarantee.
unified_profile_df = pd.merge(
    base_df,
    tax_data_to_merge,
    on='Taxpayer ID',
    how='left',
    validate='one_to_one',
)

print(f"Merged Tax Filing data. Shape after merge: {unified_profile_df.shape}")
print(f"Columns added: {list(tax_data_to_merge.columns[1:])}")

# =============================================================================
# 4. Process, Aggregate & Merge Property Data (One-to-Many)
# =============================================================================
print("\n[4. Process, Aggregate & Merge Property Data]")

print("Aggregating property data per Taxpayer ID...")
# Output column -> (source column, aggregation). 'count' only counts non-NA
# property values, so it is a proxy for the number of properties.
property_aggregations = {
    'prop_count': ('Property Value', 'count'),
    'prop_value_total': ('Property Value', 'sum'),
    'prop_value_avg': ('Property Value', 'mean'),
    'prop_value_max': ('Property Value', 'max'),
    'prop_value_min': ('Property Value', 'min'),
    'prop_loc_distinct_count': ('Location', pd.Series.nunique),
    'prop_ownership_earliest': ('Ownership Date', 'min'),
    'prop_ownership_latest': ('Ownership Date', 'max'),
}
# Collapse the one-to-many property rows to one row per taxpayer, then turn
# the group key back into an ordinary 'Taxpayer ID' column.
property_agg = (
    property_df_cleaned
    .groupby('Taxpayer ID')
    .agg(**property_aggregations)
    .reset_index()
)

print("Aggregation complete. Sample aggregated property data:")
print(property_agg.head())

# Left join: taxpayers without any property keep their row and get NaN in
# the new columns.
unified_profile_df = unified_profile_df.merge(property_agg, on='Taxpayer ID', how='left')

print(f"Merged aggregated Property Ownership data. Shape after merge: {unified_profile_df.shape}")
print(f"Columns added: {property_agg.columns[1:].tolist()}")


# =============================================================================
# 5. Process, Aggregate & Merge Company Data (Optional, One-to-Many)
# =============================================================================
print("\n[5. Process, Aggregate & Merge Company Data (Optional)]")

if company_df_cleaned is None:
    # The optional source was not loaded; the profile simply has no
    # directorship columns.
    print("Skipping Company Directorship merge as the data was not loaded.")
else:
    print("Aggregating company directorship data per Taxpayer ID...")
    # One row per taxpayer: number of non-NA roles and number of distinct
    # companies.
    company_agg = (
        company_df_cleaned
        .groupby('Taxpayer ID')
        .agg(
            directorship_count=('Director Role', 'count'),
            comp_distinct_count=('Company ID', pd.Series.nunique),
        )
        .reset_index()
    )

    print("Aggregation complete. Sample aggregated company data:")
    print(company_agg.head())

    # Left join so taxpayers without directorships keep their row.
    unified_profile_df = unified_profile_df.merge(company_agg, on='Taxpayer ID', how='left')

    print(f"Merged aggregated Company Directorship data. Shape after merge: {unified_profile_df.shape}")
    print(f"Columns added: {company_agg.columns[1:].tolist()}")


# =============================================================================
# 6. Handle Missing Values Resulting from Joins
# =============================================================================
print("\n[6. Handle Missing Values Resulting from Joins]")
print("Filling NaNs created by left joins (e.g., taxpayers not present in a source).")

# Count/total columns produced by aggregation: a NaN here means the taxpayer
# did not appear in that source, so the value is 0. The directorship columns
# are only present when the optional company data was merged.
count_total_cols = [
    'prop_count',
    'prop_value_total',
    'prop_loc_distinct_count',
    'directorship_count',
    'comp_distinct_count',
]

for column_name in count_total_cols:
    if column_name not in unified_profile_df.columns:
        continue  # column absent (e.g. company data was skipped)
    # Columns with 'count' in their name become integers; totals stay float.
    target_dtype = int if 'count' in column_name else float
    zero_filled = unified_profile_df[column_name].fillna(0)
    unified_profile_df[column_name] = zero_filled.astype(target_dtype)
    print(f"Filled NaNs in '{column_name}' with 0.")

# The remaining columns (avg, min, max, dates, sector, income, deductions)
# may be NaN either because the taxpayer was absent from the source or
# because the source value itself was missing. They are left untouched here
# and are to be handled during Feature Engineering (Notebook 03); for now
# the remaining gaps are only reported.
print("\nRemaining missing values after filling 0 for counts/totals:")
print(unified_profile_df.isna().sum())

# =============================================================================
# 7. Inspect the Unified Profile DataFrame
# =============================================================================
print("\n[7. Inspecting the Unified Profile DataFrame]")

# Quick look at the first rows.
print("First 5 rows of the unified profile:")
print(unified_profile_df.head(5))

# Column dtypes and non-null counts (info() prints directly).
print("\nUnified profile DataFrame Info:")
unified_profile_df.info()

# Summary statistics restricted to the numeric columns.
print("\nUnified profile DataFrame Description (Numerical Summary):")
numeric_cols = list(unified_profile_df.select_dtypes(include=np.number).columns)
print(unified_profile_df[numeric_cols].describe())


# =============================================================================
# 8. Save the Unified Profile DataFrame
# =============================================================================
print("\n[8. Saving the Unified Profile DataFrame]")

output_file_path = os.path.join(OUTPUT_DIR, 'unified_taxpayer_profiles.csv')
try:
    unified_profile_df.to_csv(output_file_path, index=False)
except Exception as e:
    # Report the failure and re-raise, the same way the required-file load
    # handlers do. Swallowing the error would let the rest of the notebook
    # announce that the dataset was saved when no file was written.
    print(f"ERROR saving unified profile file: {e}")
    raise
else:
    print(f"Successfully saved unified profiles to: {output_file_path}")

# =============================================================================
# 9. Conclusion
# =============================================================================
# Closing separator followed by a recap of the steps performed above; the
# final shape is read from unified_profile_df at print time.
closing_lines = (
    "-" * 50,
    "\n[9. Conclusion]",
    "Notebook 02 finished.",
    "Successfully built the unified taxpayer profile DataFrame by:",
    "  - Loading cleaned data sources.",
    "  - Establishing a base list of all unique taxpayers.",
    "  - Merging tax data (one-to-one).",
    "  - Aggregating and merging property data (one-to-many).",
    "  - Aggregating and merging company data (optional, one-to-many).",
    "  - Performing initial handling of NaNs resulting from joins (filling counts/sums with 0).",
    f"\nFinal unified profile shape: {unified_profile_df.shape}",
    "The unified profile dataset is saved and ready for feature engineering.",
    "\nProceed to Notebook 03: Feature Engineering on Unified Profiles.",
)
print(*closing_lines, sep="\n")

Notebook 02: Building Unified Entity Profiles
--------------------------------------------------
Looking for cleaned data in: ./data/cleaned
Output directory for unified profile: ./data/processed
--------------------------------------------------

[1. Loading Cleaned Data Sources]
Successfully loaded cleaned Tax Filings data: (4750, 4)
Successfully loaded cleaned Property Ownership data: (4599, 4)
Successfully loaded cleaned Company Directorships data: (825, 3)

[2. Define Central Entity and Create Base DataFrame]
Central Entity: Taxpayer
Found 4906 unique Taxpayer IDs across all sources.
Created base DataFrame with all unique Taxpayer IDs.

[3. Process & Merge Tax Filing Data]
Merged Tax Filing data. Shape after merge: (4906, 4)
Columns added: ['Declared Income', 'Deductions', 'Sector']

[4. Process, Aggregate & Merge Property Data]
Aggregating property data per Taxpayer ID...
Aggregation complete. Sample aggregated property data:
      Taxpayer ID  prop_count  prop_value_total  prop_