In [1]:
# -*- coding: utf-8 -*-
"""
Notebook 00: Environment Setup & Multi-Source Synthetic Data Generation

Purpose:
  1. Guide the user on setting up the required Python environment.
  2. Generate synthetic datasets representing different sources (Tax Filings,
     Property Ownership, optionally Company Directorships).
  3. Ensure these datasets share common identifiers ('Taxpayer ID') for linking.
  4. Embed plausible cross-source patterns indicative of potential fraud
     (e.g., low declared income + high-value property ownership) for later detection.

Outputs:
  - A configured Python environment with necessary libraries.
  - CSV files saved to the specified data directory:
    - 'synthetic_tax_filings.csv'
    - 'synthetic_property_ownership.csv'
    - (Optional) 'synthetic_company_directorships.csv'

Next Step:
  Notebook 01 will load and explore these generated datasets.
"""

# =============================================================================
# 1. Environment Setup Instructions
# =============================================================================
print("[1. Environment Setup Instructions]")

# The instructions are kept in a *raw* string literal. The Windows activation
# path below contains "\S", which is not a valid escape sequence; in a regular
# string literal Python emits an invalid-escape warning for it. The string is
# never used at runtime -- it is documentation for the reader only.
r"""
Instructions:

It is highly recommended to use a dedicated virtual environment (e.g., venv or conda)
to manage project dependencies and avoid conflicts.

1. Create a virtual environment (choose one):
   - Using venv:
     python -m venv .venv
     source .venv/bin/activate  # On Linux/macOS
     .venv\Scripts\activate    # On Windows

   - Using conda:
     conda create --name fraud_mvp python=3.10  # Or your preferred Python version
     conda activate fraud_mvp

2. Install required libraries:
   Run the following pip command in your activated environment:

   pip install pandas numpy scikit-learn matplotlib seaborn jupyterlab <your_chosen_vector_db_client>

   Replace <your_chosen_vector_db_client> with the actual client library, e.g.:
   - chromadb
   - pymilvus
   - pinecone-client

   Example:
   pip install pandas numpy scikit-learn matplotlib seaborn jupyterlab chromadb

3. Launch Jupyter Lab:
   jupyter lab

You can then run this notebook (00) and subsequent notebooks.
"""

print("Please ensure you have followed the environment setup steps above.")
print("Make sure the required libraries (pandas, numpy, etc.) are installed.")
print("-" * 50)

# =============================================================================
# 2. Import Libraries
# =============================================================================
print("[2. Importing Libraries]")

# Standard library
import os
import random
import uuid  # For generating unique IDs if needed
from datetime import datetime, timedelta

# Third-party
import numpy as np
import pandas as pd

print("Libraries imported successfully.")
print("-" * 50)

# =============================================================================
# 3. Configuration for Data Generation
# =============================================================================
print("[3. Configuration for Data Generation]")

# --- General Settings ---
NUM_TAXPAYERS = 5000        # Total number of unique taxpayer profiles to simulate
SEED = 42                   # For reproducible random results
OUTPUT_DIR = './data'       # Directory to save the generated CSV files

# --- Tax Filing Data Settings ---
TAX_FILING_RATE = 0.95      # Proportion of taxpayers who file taxes
MIN_INCOME = 15000
MAX_INCOME = 250000
INCOME_LOG_MEAN = np.log(60000) # Center the log-normal distribution around $60k
INCOME_LOG_STD = 0.8
DEDUCTION_RATE_MEAN = 0.15  # Average deduction rate as a fraction of income
DEDUCTION_RATE_STD = 0.05
SECTORS = ['Retail', 'Technology', 'Healthcare', 'Finance', 'Construction', 'Services', 'Manufacturing', 'Education', 'Other']
SECTOR_PROBS = [0.15, 0.15, 0.12, 0.10, 0.10, 0.18, 0.08, 0.07, 0.05] # Must sum to 1

# --- Property Ownership Data Settings ---
PROPERTY_OWNERSHIP_RATE = 0.70 # Proportion of taxpayers who own property
AVG_PROPERTIES_PER_OWNER = 1.3
MIN_PROPERTY_VALUE = 50000
MAX_PROPERTY_VALUE = 2000000
PROP_VALUE_LOG_MEAN = np.log(250000) # Center around $250k
PROP_VALUE_LOG_STD = 0.9
LOCATIONS = ['City Center', 'Suburban North', 'Suburban West', 'Rural East', 'Industrial South', 'Coastal Area']
LOCATION_PROBS = [0.25, 0.20, 0.20, 0.15, 0.10, 0.10] # Must sum to 1
MAX_OWNERSHIP_YEARS_AGO = 20

# --- Company Directorship Data Settings (Optional) ---
GENERATE_DIRECTORSHIPS = True # Set to False to skip this dataset
DIRECTORSHIP_RATE = 0.15    # Proportion of taxpayers who hold directorships
NUM_COMPANIES = int(NUM_TAXPAYERS * 0.2) # Number of unique companies
AVG_DIRECTORSHIPS_PER_DIRECTOR = 1.1
ROLES = ['Director', 'Non-Executive Director', 'Secretary', 'CEO', 'CFO']
ROLE_PROBS = [0.50, 0.20, 0.10, 0.10, 0.10] # Must sum to 1

# --- Fraud Pattern Settings ---
FRAUD_PATTERN_RATIO = 0.03  # Fraction (0-1) of taxpayers to embed the cross-source pattern in
FRAUD_LOW_INCOME_MAX = 20000
FRAUD_HIGH_PROP_VALUE_MIN = 800000

# --- Noise Settings ---
MISSING_VALUE_RATE_TAX = 0.03      # Rate of missing values in tax data (excluding ID)
MISSING_VALUE_RATE_PROP = 0.04     # Rate of missing values in property data (excluding ID)
MISSING_VALUE_RATE_COMP = 0.02     # Rate of missing values in company data (excluding ID)


# --- Fail-fast validation ---
# The invariants below were previously only stated in comments. Checking them
# here surfaces a bad edit immediately instead of as an opaque NumPy error in
# the middle of data generation.
def _require_fraction(name, value):
    """Raise ValueError unless `value` lies in the closed interval [0, 1]."""
    if not 0.0 <= value <= 1.0:
        raise ValueError(f"{name} must be between 0 and 1, got {value}")


def _require_distribution(name, labels, probs):
    """Raise ValueError unless `probs` has one entry per label and sums to 1."""
    if len(labels) != len(probs):
        raise ValueError(f"{name}: {len(labels)} labels but {len(probs)} probabilities")
    if not np.isclose(sum(probs), 1.0):
        raise ValueError(f"{name}: probabilities sum to {sum(probs)}, expected 1")


_require_fraction('TAX_FILING_RATE', TAX_FILING_RATE)
_require_fraction('PROPERTY_OWNERSHIP_RATE', PROPERTY_OWNERSHIP_RATE)
_require_fraction('DIRECTORSHIP_RATE', DIRECTORSHIP_RATE)
_require_fraction('FRAUD_PATTERN_RATIO', FRAUD_PATTERN_RATIO)
_require_fraction('MISSING_VALUE_RATE_TAX', MISSING_VALUE_RATE_TAX)
_require_fraction('MISSING_VALUE_RATE_PROP', MISSING_VALUE_RATE_PROP)
_require_fraction('MISSING_VALUE_RATE_COMP', MISSING_VALUE_RATE_COMP)
_require_distribution('SECTOR_PROBS', SECTORS, SECTOR_PROBS)
_require_distribution('LOCATION_PROBS', LOCATIONS, LOCATION_PROBS)
_require_distribution('ROLE_PROBS', ROLES, ROLE_PROBS)

# Set random seed for reproducibility (both generators are used below)
np.random.seed(SEED)
random.seed(SEED)

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Configuration set:")
print(f"  NUM_TAXPAYERS: {NUM_TAXPAYERS}")
print(f"  OUTPUT_DIR: {OUTPUT_DIR}")
print(f"  SEED: {SEED}")
print(f"  GENERATE_DIRECTORSHIPS: {GENERATE_DIRECTORSHIPS}")
print(f"  FRAUD_PATTERN_RATIO: {FRAUD_PATTERN_RATIO}")
print("-" * 50)

# =============================================================================
# 4. Generate Base Taxpayer IDs
# =============================================================================
print("[4. Generating Base Taxpayer IDs]")

# IDs are drawn from the seeded `random` generator rather than uuid4():
# uuid4() reads OS entropy and ignores SEED, which made every run produce a
# different ID set (and therefore different CSVs) despite the fixed seed.
# The format is unchanged: "TXP_" followed by 10 uppercase hex digits.
# Collisions are rejected explicitly so the IDs are guaranteed unique.
taxpayer_ids = []
_seen_taxpayer_ids = set()
while len(taxpayer_ids) < NUM_TAXPAYERS:
    _candidate_id = f"TXP_{random.getrandbits(40):010X}"
    if _candidate_id not in _seen_taxpayer_ids:
        _seen_taxpayer_ids.add(_candidate_id)
        taxpayer_ids.append(_candidate_id)
print(f"Generated {len(taxpayer_ids)} unique Taxpayer IDs.")
# print("Sample IDs:", taxpayer_ids[:5])
print("-" * 50)

# =============================================================================
# 5. Generate Synthetic Tax Filings Data
# =============================================================================
print("[5. Generating Synthetic Tax Filings Data]")

# Select subset of taxpayers who file taxes
num_filers = int(NUM_TAXPAYERS * TAX_FILING_RATE)
filer_ids = np.random.choice(taxpayer_ids, size=num_filers, replace=False)

tax_data = []
for tax_id in filer_ids:
    income = np.random.lognormal(mean=INCOME_LOG_MEAN, sigma=INCOME_LOG_STD)
    income = max(MIN_INCOME, min(income, MAX_INCOME)) # Clamp income within bounds

    deduction_rate = np.random.normal(loc=DEDUCTION_RATE_MEAN, scale=DEDUCTION_RATE_STD)
    deduction_rate = max(0.01, min(deduction_rate, 0.5)) # Clamp deduction rate
    deductions = income * deduction_rate

    sector = np.random.choice(SECTORS, p=SECTOR_PROBS)

    tax_data.append({
        'Taxpayer ID': tax_id,
        'Declared Income': round(income, 2),
        'Deductions': round(deductions, 2),
        'Sector': sector
    })

# Explicit columns keep the frame well-formed even if no filers were generated.
tax_df = pd.DataFrame(tax_data, columns=['Taxpayer ID', 'Declared Income', 'Deductions', 'Sector'])
print(f"Generated initial tax filings data for {len(tax_df)} taxpayers.")

# --- Embed Fraud Pattern (Low Income for specific IDs) ---
num_fraud_pattern = int(NUM_TAXPAYERS * FRAUD_PATTERN_RATIO)
# Ensure we select from IDs present in the tax filings
fraud_candidate_ids = tax_df['Taxpayer ID'].unique()
fraud_ids_for_pattern = np.random.choice(fraud_candidate_ids, size=min(num_fraud_pattern, len(fraud_candidate_ids)), replace=False)

print(f"Selected {len(fraud_ids_for_pattern)} Taxpayer IDs to embed cross-source fraud pattern.")

# Override income for these selected IDs. The row mask is computed once and
# every random draw is sized from it, so values and rows always line up.
fraud_mask = tax_df['Taxpayer ID'].isin(fraud_ids_for_pattern)
num_fraud_rows = int(fraud_mask.sum())
fraud_low_incomes = np.round(
    np.random.uniform(MIN_INCOME / 2, FRAUD_LOW_INCOME_MAX, size=num_fraud_rows), 2
)
tax_df.loc[fraud_mask, 'Declared Income'] = fraud_low_incomes

# Recompute deductions from the new low income using one rate PER ROW.
# Previously a single scalar rate was drawn and applied to every fraud row,
# which gave all of them an identical Deductions/Income ratio. Rates use half
# the normal spread and the same [0.01, 0.5] clamp applied to regular filers.
fraud_deduction_rates = np.clip(
    np.random.normal(loc=DEDUCTION_RATE_MEAN, scale=DEDUCTION_RATE_STD * 0.5, size=num_fraud_rows),
    0.01,
    0.5,
)
tax_df.loc[fraud_mask, 'Deductions'] = np.round(fraud_low_incomes * fraud_deduction_rates, 2)

print(f"Modified 'Declared Income' for {len(fraud_ids_for_pattern)} IDs to be below ${FRAUD_LOW_INCOME_MAX:.2f}.")

# --- Add Missing Values ---
# Each non-ID column gets its own independent mask. Fraud rows are not
# exempt, so a few of them may lose their income or deduction values.
numeric_cols_tax = ['Declared Income', 'Deductions']
categorical_cols_tax = ['Sector']

for col in numeric_cols_tax + categorical_cols_tax:
    mask = np.random.rand(len(tax_df)) < MISSING_VALUE_RATE_TAX
    tax_df.loc[mask, col] = np.nan

print(f"Introduced ~{MISSING_VALUE_RATE_TAX*100:.1f}% missing values into tax data (excluding ID).")

print("Sample Tax Filings Data (with potential fraud pattern):")
print(tax_df.head())
# Check one fraud ID
if len(fraud_ids_for_pattern) > 0:
  print("\nExample Tax Record for a 'Fraud Pattern' ID:")
  print(tax_df[tax_df['Taxpayer ID'] == fraud_ids_for_pattern[0]])
print("-" * 50)

# =============================================================================
# 6. Generate Synthetic Property Ownership Data
# =============================================================================
print("[6. Generating Synthetic Property Ownership Data]")

# Select subset of taxpayers who own property
num_owners = int(NUM_TAXPAYERS * PROPERTY_OWNERSHIP_RATE)
owner_ids_base = np.random.choice(taxpayer_ids, size=num_owners, replace=False)

# Ensure the fraud pattern IDs are included among property owners
owner_ids_final = np.unique(np.concatenate((owner_ids_base, fraud_ids_for_pattern)))
print(f"Total unique property owners (including fraud pattern IDs): {len(owner_ids_final)}")

property_data = []
# NOTE: ownership dates are anchored to the wall clock, so the date column
# shifts with the day the notebook is run even though SEED is fixed.
current_date = datetime.now()

# Generate properties, allowing multiple per owner.
# Every owner receives exactly one guaranteed property; only the surplus
# (total - number of owners) is handed out at random with replacement.
# Sampling *all* properties with replacement, as before, left a sizeable share
# of owners -- including fraud-pattern IDs -- with no property record at all,
# so their low-income / high-property pattern never reached the output.
num_properties_total = int(len(owner_ids_final) * AVG_PROPERTIES_PER_OWNER)
num_extra_properties = max(0, num_properties_total - len(owner_ids_final))
extra_owner_ids = np.random.choice(owner_ids_final, size=num_extra_properties, replace=True)
# Shuffle so the output is not ordered as "sorted owners first, extras last".
owner_ids_for_props = np.random.permutation(np.concatenate((owner_ids_final, extra_owner_ids)))

# Set membership is O(1) per lookup, versus scanning the ID array for every record.
fraud_id_set = set(fraud_ids_for_pattern)

for owner_id in owner_ids_for_props:
    # Determine property value based on whether it's a fraud pattern ID
    if owner_id in fraud_id_set:
        # Assign high property value
        prop_value = np.random.uniform(FRAUD_HIGH_PROP_VALUE_MIN, MAX_PROPERTY_VALUE * 1.2) # Allow slightly above max for fraud cases
    else:
        # Assign "normal" property value
        prop_value = np.random.lognormal(mean=PROP_VALUE_LOG_MEAN, sigma=PROP_VALUE_LOG_STD)
        prop_value = max(MIN_PROPERTY_VALUE, min(prop_value, MAX_PROPERTY_VALUE)) # Clamp

    location = np.random.choice(LOCATIONS, p=LOCATION_PROBS)

    # Generate ownership date within the last N years
    days_ago = random.randint(1, MAX_OWNERSHIP_YEARS_AGO * 365)
    ownership_date = current_date - timedelta(days=days_ago)

    property_data.append({
        'Taxpayer ID': owner_id,
        'Property Value': round(prop_value, 2),
        'Location': location,
        'Ownership Date': ownership_date.strftime('%Y-%m-%d') # Format as string YYYY-MM-DD
    })

# Explicit columns keep the frame well-formed even if no records were generated.
property_df = pd.DataFrame(property_data, columns=['Taxpayer ID', 'Property Value', 'Location', 'Ownership Date'])
print(f"Generated {len(property_df)} property records for {property_df['Taxpayer ID'].nunique()} unique owners.")


# --- Add Missing Values ---
# Each non-ID column gets its own independent mask; fraud rows are not exempt.
numeric_cols_prop = ['Property Value']
categorical_cols_prop = ['Location']
date_cols_prop = ['Ownership Date'] # Even though it's string, treat conceptually as date

for col in numeric_cols_prop + categorical_cols_prop + date_cols_prop:
    mask = np.random.rand(len(property_df)) < MISSING_VALUE_RATE_PROP
    property_df.loc[mask, col] = np.nan

print(f"Introduced ~{MISSING_VALUE_RATE_PROP*100:.1f}% missing values into property data (excluding ID).")

print("\nSample Property Ownership Data:")
print(property_df.head())
# Check one fraud ID
if len(fraud_ids_for_pattern) > 0:
    print("\nExample Property Record(s) for the same 'Fraud Pattern' ID:")
    print(property_df[property_df['Taxpayer ID'] == fraud_ids_for_pattern[0]])
print("-" * 50)


# =============================================================================
# 7. Generate Synthetic Company Directorships Data (Optional)
# =============================================================================
print("[7. Generating Synthetic Company Directorships Data (Optional)]")

if GENERATE_DIRECTORSHIPS:
    # Generate unique Company IDs from the seeded `random` generator.
    # uuid4() reads OS entropy and ignores SEED, so the IDs used to change on
    # every run. The format is unchanged: "COMP_" + 8 uppercase hex digits.
    company_ids = []
    _seen_company_ids = set()
    while len(company_ids) < NUM_COMPANIES:
        _candidate_id = f"COMP_{random.getrandbits(32):08X}"
        if _candidate_id not in _seen_company_ids:
            _seen_company_ids.add(_candidate_id)
            company_ids.append(_candidate_id)

    # Select subset of taxpayers who are directors
    num_directors = int(NUM_TAXPAYERS * DIRECTORSHIP_RATE)
    director_ids_base = np.random.choice(taxpayer_ids, size=num_directors, replace=False)

    # Optionally, ensure some fraud pattern IDs are directors (or make them less likely?)
    # For this example, we keep it random relative to the base population.
    # director_ids_final = np.unique(np.concatenate((director_ids_base, subset_of_fraud_ids)))
    director_ids_final = director_ids_base
    print(f"Selected {len(director_ids_final)} unique taxpayers as potential directors.")


    company_data = []
    # Every selected director gets one guaranteed role; only the surplus is
    # assigned at random with replacement. Sampling all roles with replacement
    # dropped a large share of the selected directors, so the realised
    # directorship rate fell well below DIRECTORSHIP_RATE.
    num_directorships_total = int(len(director_ids_final) * AVG_DIRECTORSHIPS_PER_DIRECTOR)
    num_extra_roles = max(0, num_directorships_total - len(director_ids_final))
    extra_director_ids = np.random.choice(director_ids_final, size=num_extra_roles, replace=True)
    director_ids_for_roles = np.concatenate((director_ids_final, extra_director_ids))

    for director_id in director_ids_for_roles:
        company_id = random.choice(company_ids)
        role = np.random.choice(ROLES, p=ROLE_PROBS)

        company_data.append({
            'Taxpayer ID': director_id,
            'Company ID': company_id,
            'Director Role': role
        })

    # Explicit columns keep the frame well-formed even if no roles were generated.
    company_df = pd.DataFrame(company_data, columns=['Taxpayer ID', 'Company ID', 'Director Role'])
    # Remove exact duplicates (same person, same company, same role)
    company_df = company_df.drop_duplicates()
    print(f"Generated {len(company_df)} directorship records for {company_df['Taxpayer ID'].nunique()} unique directors across {company_df['Company ID'].nunique()} companies.")

    # --- Add Missing Values ---
    categorical_cols_comp = ['Director Role'] # Company ID assumed mandatory

    for col in categorical_cols_comp:
      mask = np.random.rand(len(company_df)) < MISSING_VALUE_RATE_COMP
      company_df.loc[mask, col] = np.nan

    print(f"Introduced ~{MISSING_VALUE_RATE_COMP*100:.1f}% missing values into company data (excluding ID).")

    print("\nSample Company Directorship Data:")
    print(company_df.head())
    # Check one fraud ID if they happen to be a director
    if len(fraud_ids_for_pattern) > 0:
        print("\nExample Directorship Record(s) for the same 'Fraud Pattern' ID (if they exist):")
        print(company_df[company_df['Taxpayer ID'] == fraud_ids_for_pattern[0]])

else:
    print("Skipping generation of Company Directorships data as GENERATE_DIRECTORSHIPS is False.")
    # Later sections test `company_df is not None` to decide whether to save/verify it.
    company_df = None

print("-" * 50)


# =============================================================================
# 8. Save Generated Data to CSV
# =============================================================================
print("[8. Saving Generated Data to CSV]")

# Resolve all output paths before writing so they stay defined for the later
# verification step even if one of the writes fails part-way through.
tax_file_path = os.path.join(OUTPUT_DIR, 'synthetic_tax_filings.csv')
property_file_path = os.path.join(OUTPUT_DIR, 'synthetic_property_ownership.csv')
company_file_path = os.path.join(OUTPUT_DIR, 'synthetic_company_directorships.csv')

try:
    tax_df.to_csv(tax_file_path, index=False)
    print(f"Successfully saved tax data to: {tax_file_path}")

    property_df.to_csv(property_file_path, index=False)
    print(f"Successfully saved property data to: {property_file_path}")

    # The directorship dataset is optional; company_df is None when it was skipped.
    if company_df is not None:
        company_df.to_csv(company_file_path, index=False)
        print(f"Successfully saved company directorship data to: {company_file_path}")

except Exception as e:
    # Log, then re-raise: swallowing the error let the notebook carry on and
    # report the files as saved even though they were missing or incomplete.
    print(f"ERROR saving data files: {e}")
    raise

print("-" * 50)

# =============================================================================
# 9. Verification Step (Optional)
# =============================================================================
print("[9. Verification Step (Optional)]")

print("Quick check of generated files:")
try:
    # --- Reload each CSV from disk and show its schema and first rows ---
    print(f"\nTax File ({tax_file_path}):")
    df_check = pd.read_csv(tax_file_path)
    print(df_check.info())
    print(df_check.head())

    print(f"\nProperty File ({property_file_path}):")
    df_check = pd.read_csv(property_file_path)
    print(df_check.info())
    print(df_check.head())

    if company_df is not None:
        print(f"\nCompany File ({company_file_path}):")
        df_check = pd.read_csv(company_file_path)
        print(df_check.info())
        print(df_check.head())

    # --- Spot-check the embedded pattern on the first fraud-pattern ID ---
    if len(fraud_ids_for_pattern) > 0:
        verify_id = fraud_ids_for_pattern[0]
        print(f"\nVerifying cross-source pattern for ID: {verify_id}")

        # Rows of the saved tax file that belong to the chosen ID.
        tax_record = pd.read_csv(tax_file_path).loc[
            lambda frame: frame['Taxpayer ID'] == verify_id
        ]
        print("Tax Record:")
        print(tax_record[['Taxpayer ID', 'Declared Income']])

        # Rows of the saved property file that belong to the chosen ID.
        prop_record = pd.read_csv(property_file_path).loc[
            lambda frame: frame['Taxpayer ID'] == verify_id
        ]
        print("\nProperty Record(s):")
        print(prop_record[['Taxpayer ID', 'Property Value']])

        # Expected pattern: low declared income together with a high-value property.
        # Either flag stays False when the ID has no rows in the respective file.
        is_low_income = False
        if not tax_record.empty:
            is_low_income = tax_record['Declared Income'].iloc[0] <= FRAUD_LOW_INCOME_MAX

        has_high_prop = False
        if not prop_record.empty:
            has_high_prop = prop_record['Property Value'].max() >= FRAUD_HIGH_PROP_VALUE_MIN

        print(f"\nPattern Check -> Low Income: {is_low_income}, High Property Value: {has_high_prop}")


except Exception as e:
    print(f"Error during verification read: {e}")

print("-" * 50)


# =============================================================================
# 10. Conclusion
# =============================================================================
print("[10. Conclusion]")
print("Notebook 00 finished.")
print("Environment setup guidance provided.")
print("Synthetic data files generated and saved:")

# List the output files; the directorship CSV only exists when that
# optional dataset was generated (company_df is None otherwise).
saved_file_names = ['synthetic_tax_filings.csv', 'synthetic_property_ownership.csv']
if company_df is not None:
    saved_file_names.append('synthetic_company_directorships.csv')
for saved_file_name in saved_file_names:
    print(f"  - {os.path.join(OUTPUT_DIR, saved_file_name)}")

print("Data includes embedded cross-source patterns for selected 'fraudulent' profiles.")
print("\nReady to proceed to Notebook 01: Source Data Exploration & Initial Cleaning.")

[1. Environment Setup Instructions]
Please ensure you have followed the environment setup steps above.
Make sure the required libraries (pandas, numpy, etc.) are installed.
--------------------------------------------------
[2. Importing Libraries]
Libraries imported successfully.
--------------------------------------------------
[3. Configuration for Data Generation]
Configuration set:
  NUM_TAXPAYERS: 5000
  OUTPUT_DIR: ./data
  SEED: 42
  GENERATE_DIRECTORSHIPS: True
  FRAUD_PATTERN_RATIO: 0.03
--------------------------------------------------
[4. Generating Base Taxpayer IDs]
Generated 5000 unique Taxpayer IDs.
--------------------------------------------------
[5. Generating Synthetic Tax Filings Data]
Generated initial tax filings data for 4750 taxpayers.
Selected 150 Taxpayer IDs to embed cross-source fraud pattern.
Modified 'Declared Income' for 150 IDs to be below $20000.00.
Introduced ~3.0% missing values into tax data (excluding ID).
Sample Tax Filings Data (with potential