# 📈 Private Equity Synthetic Data Generator v1.0
## Leveraging Snowflake Native Synthetic Data Generation

### Overview
This notebook demonstrates how to generate synthetic private equity datasets using Snowflake's native `GENERATE_SYNTHETIC_DATA` stored procedure. It models typical PE entities and workflows:

- Firms, funds, LPs, portfolio companies
- Capital calls, distributions, valuations
- Deals and investment performance

### Data Architecture
```
📊 SEED DATA (Manual) → 🤖 SYNTHETIC DATA (Snowflake AI)
├── PE_FIRMS (10) → (10)
├── PE_FUNDS (25) → (250+)
├── LIMITED_PARTNERS (200) → (5,000+)
├── PORTFOLIO_COMPANIES (150) → (10,000+)
├── INVESTMENTS (250) → (25,000+)
├── CAPITAL_CALLS (500) → (500,000+)
└── DISTRIBUTIONS (500) → (500,000+)
```

### Requirements
- Snowflake Enterprise Edition or higher
- Medium Snowpark-optimized warehouse
- Anaconda terms accepted


In [None]:
# 📦 SETUP AND CONFIGURATION (Private Equity)

import pandas as pd
import numpy as np
import random
import string
import json
import datetime as dt
from datetime import timedelta
from typing import List, Dict, Any, Optional

from snowflake.snowpark import Session, functions as F
from snowflake.snowpark.types import *

# `get_active_session` lives in snowflake.snowpark.context and is not pulled in
# by the imports above, so it is imported here before first use.
from snowflake.snowpark.context import get_active_session

# Reuse the session the notebook runtime has already authenticated.
session = get_active_session()

# Fixed seeds keep the generated seed rows reproducible: `random` drives the
# row values, NumPy's global state drives pandas `DataFrame.sample`.
random.seed(42)
np.random.seed(42)

# Central notebook settings, read by the functions in the cells below.
CONFIG = {
    'database': 'PE_SYNTH_DB',
    'schema': 'SEED_DATA',              # schema holding the hand-built seed tables
    'synth_schema': 'SYNTHETIC_DATA',   # schema receiving the *_SYNTHETIC outputs
    'warehouse': None,                  # filled in by setup_database_environment()

    # Seed row counts.
    # NOTE(review): 'seed_calls' and 'seed_distributions' are not read by any
    # code visible in this notebook.
    'seed_firms': 10,
    'seed_funds': 25,
    'seed_lps': 200,
    'seed_companies': 150,
    'seed_investments': 250,
    'seed_calls': 500,
    'seed_distributions': 500,

    # Scaling targets.
    # NOTE(review): only 'target_companies' is read (by scale_companies) in the
    # code visible here; 'target_lps' appears unused.
    'target_lps': 5000,
    'target_companies': 10000,

    # Passed to GENERATE_SYNTHETIC_DATA as 'similarity_filter' and
    # 'replace_output_tables' respectively.
    'enable_privacy_filter': True,
    'replace_output_tables': True
}

print("🚀 Private Equity Synthetic Data Generator v1.0")
print(f"📊 Database: {CONFIG['database']}")
print(f"🌱 Seed: {CONFIG['schema']}  🤖 Synth: {CONFIG['synth_schema']}")
print("✅ Using Snowflake native GENERATE_SYNTHETIC_DATA")


In [None]:
# 🏗️ ENVIRONMENT SETUP (PE)

def setup_database_environment():
    """Select a warehouse and create/activate the database and schemas.

    Uses the session's current warehouse when one is set; otherwise switches
    to the first warehouse returned by SHOW WAREHOUSES. The chosen warehouse
    is stored in CONFIG['warehouse']. The database and both schemas from
    CONFIG are then created if missing, and the seed schema is made current.

    Returns:
        bool: True when every statement succeeded; False after printing the
        error if anything raised.
    """
    def run(statement):
        # Execute one SQL statement and return its collected rows.
        return session.sql(statement).collect()

    def scalar(statement):
        # First column of the first row of a single-value query.
        return run(statement)[0][0]

    print("🏗️ Setting up Snowflake environment...")
    try:
        active_wh = scalar("SELECT CURRENT_WAREHOUSE()")
        if not active_wh:
            # No warehouse on the session: fall back to the first one listed.
            available = run("SHOW WAREHOUSES")
            if not available:
                raise Exception("No warehouses available")
            fallback_wh = available[0]['name']
            run(f"USE WAREHOUSE {fallback_wh}")
            CONFIG['warehouse'] = fallback_wh
            print(f"   🔄 Switched to warehouse: {fallback_wh}")
        else:
            print(f"   ✅ Using warehouse: {active_wh}")
            CONFIG['warehouse'] = active_wh

        database = CONFIG['database']
        print(f"   🏗️ Creating database: {database}")
        run(f"CREATE DATABASE IF NOT EXISTS {database}")
        run(f"USE DATABASE {database}")

        print("   📁 Creating schemas...")
        for schema_key in ('schema', 'synth_schema'):
            run(f"CREATE SCHEMA IF NOT EXISTS {CONFIG[schema_key]}")
        run(f"USE SCHEMA {CONFIG['schema']}")

        # Read the context back from Snowflake rather than trusting CONFIG.
        db_now = scalar("SELECT CURRENT_DATABASE()")
        schema_now = scalar("SELECT CURRENT_SCHEMA()")
        wh_now = scalar("SELECT CURRENT_WAREHOUSE()")
        print("✅ Environment ready:")
        print(f"   📋 Database: {db_now}")
        print(f"   📋 Active Schema: {schema_now}")
        print(f"   📋 Warehouse: {wh_now}")
        return True
    except Exception as err:
        print(f"❌ Environment setup failed: {err}")
        return False

# Run the environment setup as soon as this cell executes and report the outcome.
# NOTE(review): on failure this only prints a message; nothing here prevents
# the following cells from being run.
if setup_database_environment():
    print("🎯 Ready to create seed data!")
else:
    print("💥 Cannot proceed without proper environment setup")


In [None]:
# 🗃️ CREATE SEED DATA TABLES (Private Equity)

def create_seed_tables():
    """Drop and recreate the eight PE seed tables in the current schema.

    All tables are dropped first (IF EXISTS), then created in the order
    listed below, so each table named in a FOREIGN KEY clause is created
    before the table that references it. Table names are unqualified, so
    the statements act on whatever database/schema is current for the session.
    """
    print("🗃️ Creating PE seed data table schemas...")

    # Table name -> CREATE statement, in creation order.
    seed_ddl = {
        'PE_FIRMS': """
        CREATE TABLE PE_FIRMS (
            FIRM_ID STRING PRIMARY KEY,
            FIRM_NAME STRING NOT NULL,
            HQ_CITY STRING,
            HQ_COUNTRY STRING,
            FOUNDED_YEAR INTEGER,
            STRATEGY STRING, -- Buyout, Growth, Venture, Secondaries
            AUM_BN DECIMAL(10,2),
            CREATED_DATE DATE DEFAULT CURRENT_DATE()
        )
    """,
        'PE_FUNDS': """
        CREATE TABLE PE_FUNDS (
            FUND_ID STRING PRIMARY KEY,
            FIRM_ID STRING NOT NULL,
            FUND_NAME STRING NOT NULL,
            VINTAGE_YEAR INTEGER,
            FUND_SIZE_BN DECIMAL(10,2),
            STRATEGY STRING,
            STATUS STRING,
            CREATED_DATE DATE DEFAULT CURRENT_DATE(),
            FOREIGN KEY (FIRM_ID) REFERENCES PE_FIRMS(FIRM_ID)
        )
    """,
        'LIMITED_PARTNERS': """
        CREATE TABLE LIMITED_PARTNERS (
            LP_ID STRING PRIMARY KEY,
            LP_NAME STRING NOT NULL,
            LP_TYPE STRING, -- Pension, Endowment, Family Office, Sovereign Wealth, Insurance
            COUNTRY STRING,
            COMMITMENT_CAPACITY_MN DECIMAL(12,2),
            CONTACT_EMAIL STRING,
            CREATED_DATE DATE DEFAULT CURRENT_DATE()
        )
    """,
        'PORTFOLIO_COMPANIES': """
        CREATE TABLE PORTFOLIO_COMPANIES (
            COMPANY_ID STRING PRIMARY KEY,
            COMPANY_NAME STRING NOT NULL,
            INDUSTRY STRING,
            COUNTRY STRING,
            REVENUE_MN DECIMAL(12,2),
            EBITDA_MN DECIMAL(12,2),
            EMPLOYEES INTEGER,
            CREATED_DATE DATE DEFAULT CURRENT_DATE()
        )
    """,
        'INVESTMENTS': """
        CREATE TABLE INVESTMENTS (
            INVESTMENT_ID STRING PRIMARY KEY,
            FUND_ID STRING NOT NULL,
            COMPANY_ID STRING NOT NULL,
            DEAL_DATE DATE,
            DEAL_TYPE STRING, -- Majority, Minority, Venture, Secondary
            EQUITY_INVESTED_MN DECIMAL(12,2),
            OWNERSHIP_PCT DECIMAL(5,2),
            CREATED_DATE DATE DEFAULT CURRENT_DATE(),
            FOREIGN KEY (FUND_ID) REFERENCES PE_FUNDS(FUND_ID),
            FOREIGN KEY (COMPANY_ID) REFERENCES PORTFOLIO_COMPANIES(COMPANY_ID)
        )
    """,
        'CAPITAL_CALLS': """
        CREATE TABLE CAPITAL_CALLS (
            CALL_ID STRING PRIMARY KEY,
            FUND_ID STRING NOT NULL,
            LP_ID STRING NOT NULL,
            CALL_DATE DATE NOT NULL,
            AMOUNT_MN DECIMAL(12,2) NOT NULL,
            CREATED_DATE DATE DEFAULT CURRENT_DATE(),
            FOREIGN KEY (FUND_ID) REFERENCES PE_FUNDS(FUND_ID),
            FOREIGN KEY (LP_ID) REFERENCES LIMITED_PARTNERS(LP_ID)
        )
    """,
        'DISTRIBUTIONS': """
        CREATE TABLE DISTRIBUTIONS (
            DIST_ID STRING PRIMARY KEY,
            FUND_ID STRING NOT NULL,
            LP_ID STRING NOT NULL,
            DIST_DATE DATE NOT NULL,
            AMOUNT_MN DECIMAL(12,2) NOT NULL,
            CREATED_DATE DATE DEFAULT CURRENT_DATE(),
            FOREIGN KEY (FUND_ID) REFERENCES PE_FUNDS(FUND_ID),
            FOREIGN KEY (LP_ID) REFERENCES LIMITED_PARTNERS(LP_ID)
        )
    """,
        'VALUATIONS': """
        CREATE TABLE VALUATIONS (
            VALUATION_ID STRING PRIMARY KEY,
            INVESTMENT_ID STRING NOT NULL,
            AS_OF_DATE DATE NOT NULL,
            EV_MN DECIMAL(12,2),
            NET_DEBT_MN DECIMAL(12,2),
            CREATED_DATE DATE DEFAULT CURRENT_DATE(),
            FOREIGN KEY (INVESTMENT_ID) REFERENCES INVESTMENTS(INVESTMENT_ID)
        )
    """,
    }

    # Clear out every existing table before any CREATE runs.
    for table_name in seed_ddl:
        session.sql(f"DROP TABLE IF EXISTS {table_name}").collect()

    for create_stmt in seed_ddl.values():
        session.sql(create_stmt).collect()

    print("✅ PE seed table schemas created")
    print(f"   📊 Created {len(seed_ddl)} tables")

# Rebuild the (empty) seed tables as soon as this cell runs.
create_seed_tables()


In [None]:
# 🌱 POPULATE SEED DATA (Private Equity)

class SeedDataGenerator:
    """Value pools and ID formatting shared by the seed-row builders."""

    def __init__(self):
        # Categorical pools that the seed functions sample from.
        self.firm_strategies = [
            'Buyout', 'Growth', 'Venture', 'Secondaries',
        ]
        self.industries = [
            'Software', 'Healthcare', 'Industrial', 'Consumer',
            'Financials', 'Energy', 'TMT',
        ]
        self.lp_types = [
            'Pension', 'Endowment', 'Family Office',
            'Sovereign Wealth', 'Insurance',
        ]
        self.countries = [
            'USA', 'UK', 'Germany', 'France', 'Canada', 'Japan', 'Singapore',
        ]
        self.cities = [
            'New York', 'London', 'San Francisco', 'Boston', 'Chicago',
            'Paris', 'Berlin', 'Toronto', 'Tokyo', 'Singapore',
        ]
        # Person-name pools.
        self.first_names = [
            'James', 'Mary', 'John', 'Patricia', 'Robert',
            'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth',
        ]
        self.last_names = [
            'Smith', 'Johnson', 'Williams', 'Brown', 'Jones',
            'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Martinez',
        ]

    def generate_id(self, prefix: str, counter: int) -> str:
        """Return `prefix` followed by `counter` zero-padded to at least six digits."""
        return "{}{:06d}".format(prefix, counter)

# Shared generator instance used by the create_*_seed functions below.
seed_gen = SeedDataGenerator()
print("✅ PE seed data generator initialized")


def create_firms_seed():
    """Build CONFIG['seed_firms'] random firm rows and write them to PE_FIRMS.

    Returns:
        pandas.DataFrame: the rows that were written.
    """
    name_stems = ['Summit','Harbor','Pinnacle','Crest','Aurora','Atlas']
    # Key order is kept as-is: it fixes the order of random draws per row.
    rows = [
        {
            'FIRM_ID': seed_gen.generate_id('FIRM', n),
            'FIRM_NAME': f"{random.choice(name_stems)} Capital {n}",
            'HQ_CITY': random.choice(seed_gen.cities),
            'HQ_COUNTRY': random.choice(seed_gen.countries),
            'FOUNDED_YEAR': random.randint(1980, 2020),
            'STRATEGY': random.choice(seed_gen.firm_strategies),
            'AUM_BN': round(random.uniform(1.0, 50.0), 2),
        }
        for n in range(1, CONFIG['seed_firms'] + 1)
    ]
    firms = pd.DataFrame(rows)
    session.write_pandas(firms, 'PE_FIRMS', auto_create_table=False, overwrite=True)
    print(f"   ✅ Created {len(firms)} firms")
    return firms


def create_funds_seed():
    """Build fund rows for every firm in PE_FIRMS and write them to PE_FUNDS.

    CONFIG['seed_funds'] is spread across the firms as evenly as possible:
    each firm gets the integer share, and the first `remainder` firms get one
    extra so the total matches the configured count (plain floor division
    would silently drop the remainder, e.g. 25 funds over 10 firms -> 20).
    Every firm still receives at least one fund, so the total exceeds the
    configured count when there are more firms than requested funds.

    Returns:
        pandas.DataFrame: the rows that were written.
    """
    firms_df = session.table('PE_FIRMS').to_pandas()
    firm_count = len(firms_df)
    # Guard the division; with no firms the loop below simply does not run.
    base, extra = divmod(CONFIG['seed_funds'], firm_count) if firm_count else (0, 0)

    data = []
    counter = 1
    for position, (_, firm) in enumerate(firms_df.iterrows()):
        funds_for_firm = max(1, base + (1 if position < extra else 0))
        for _ in range(funds_for_firm):
            data.append({
                'FUND_ID': seed_gen.generate_id('FUND', counter),
                'FIRM_ID': firm['FIRM_ID'],
                'FUND_NAME': f"{firm['FIRM_NAME']} Fund {random.randint(1,5)}",
                'VINTAGE_YEAR': random.randint(2005, 2024),
                'FUND_SIZE_BN': round(random.uniform(0.5, 10.0), 2),
                # Funds inherit the strategy of their parent firm.
                'STRATEGY': firm['STRATEGY'],
                'STATUS': random.choice(['Investing','Harvesting','Closed'])
            })
            counter += 1
    df = pd.DataFrame(data)
    session.write_pandas(df, 'PE_FUNDS', auto_create_table=False, overwrite=True)
    print(f"   ✅ Created {len(df)} funds")
    return df


def create_lps_seed():
    """Build CONFIG['seed_lps'] random limited-partner rows and write them to LIMITED_PARTNERS.

    Returns:
        pandas.DataFrame: the rows that were written.
    """
    name_heads = ['Alpha','Omega','Liberty','Heritage','Frontier']
    name_tails = ['Pension','Endowment','Capital','Holdings']
    # Key order is kept as-is: it fixes the order of random draws per row.
    rows = [
        {
            'LP_ID': seed_gen.generate_id('LP', n),
            'LP_NAME': f"{random.choice(name_heads)} {random.choice(name_tails)} {n}",
            'LP_TYPE': random.choice(seed_gen.lp_types),
            'COUNTRY': random.choice(seed_gen.countries),
            'COMMITMENT_CAPACITY_MN': round(random.uniform(50.0, 2000.0), 2),
            'CONTACT_EMAIL': f"lp{n}@example.com",
        }
        for n in range(1, CONFIG['seed_lps'] + 1)
    ]
    partners = pd.DataFrame(rows)
    session.write_pandas(partners, 'LIMITED_PARTNERS', auto_create_table=False, overwrite=True)
    print(f"   ✅ Created {len(partners)} LPs")
    return partners


def create_companies_seed():
    """Build CONFIG['seed_companies'] random company rows and write them to PORTFOLIO_COMPANIES.

    EBITDA is derived from revenue using a random margin between 5% and 35%.

    Returns:
        pandas.DataFrame: the rows that were written.
    """
    name_heads = ['Blue','Green','Silver','Quantum','Vertex']
    name_tails = ['Systems','Health','Industries','Foods','Tech']

    def build_company(n):
        # Revenue and margin are drawn before the other fields so the
        # sequence of random draws stays the same for every row.
        revenue = round(random.uniform(10.0, 2000.0), 2)
        margin = random.uniform(0.05, 0.35)
        return {
            'COMPANY_ID': seed_gen.generate_id('CO', n),
            'COMPANY_NAME': f"{random.choice(name_heads)} {random.choice(name_tails)} {n}",
            'INDUSTRY': random.choice(seed_gen.industries),
            'COUNTRY': random.choice(seed_gen.countries),
            'REVENUE_MN': revenue,
            'EBITDA_MN': round(revenue * margin, 2),
            'EMPLOYEES': random.randint(50, 20000),
        }

    companies = pd.DataFrame(
        [build_company(n) for n in range(1, CONFIG['seed_companies'] + 1)]
    )
    session.write_pandas(companies, 'PORTFOLIO_COMPANIES', auto_create_table=False, overwrite=True)
    print(f"   ✅ Created {len(companies)} portfolio companies")
    return companies


def create_investments_seed():
    """Build investment rows linking funds to companies and write them to INVESTMENTS.

    CONFIG['seed_investments'] is spread across the funds in PE_FUNDS as
    evenly as possible: each fund gets the integer share, and the first
    `remainder` funds get one extra so the total matches the configured count
    (plain floor division would silently drop the remainder). Every fund
    still receives at least one investment. The company for each deal is
    sampled at random from PORTFOLIO_COMPANIES.

    Returns:
        pandas.DataFrame: the rows that were written.
    """
    funds_df = session.table('PE_FUNDS').to_pandas()
    companies_df = session.table('PORTFOLIO_COMPANIES').to_pandas()
    fund_count = len(funds_df)
    # Guard the division; with no funds the loop below simply does not run.
    base, extra = divmod(CONFIG['seed_investments'], fund_count) if fund_count else (0, 0)

    data = []
    counter = 1
    for position, (_, fund) in enumerate(funds_df.iterrows()):
        deals_for_fund = max(1, base + (1 if position < extra else 0))
        for _ in range(deals_for_fund):
            co = companies_df.sample(1).iloc[0]
            equity = round(random.uniform(10.0, 1000.0), 2)
            data.append({
                'INVESTMENT_ID': seed_gen.generate_id('INV', counter),
                'FUND_ID': fund['FUND_ID'],
                'COMPANY_ID': co['COMPANY_ID'],
                # Day is capped at 28 so the date is valid in every month.
                'DEAL_DATE': dt.date(random.randint(2005, 2024), random.randint(1, 12), random.randint(1, 28)),
                'DEAL_TYPE': random.choice(['Majority','Minority','Venture','Secondary']),
                'EQUITY_INVESTED_MN': equity,
                'OWNERSHIP_PCT': round(random.uniform(5.0, 90.0), 2)
            })
            counter += 1
    df = pd.DataFrame(data)
    session.write_pandas(df, 'INVESTMENTS', auto_create_table=False, overwrite=True)
    print(f"   ✅ Created {len(df)} investments")
    return df


# Create PE seeds.
# The call order matters: create_funds_seed reads PE_FIRMS, and
# create_investments_seed reads PE_FUNDS and PORTFOLIO_COMPANIES, so those
# tables must be populated first. The order also determines how the seeded
# random stream is consumed, and therefore the generated values.
firms_df = create_firms_seed()
funds_df = create_funds_seed()
lps_df = create_lps_seed()
companies_df = create_companies_seed()
investments_df = create_investments_seed()

# One-line summary of the row counts just written.
print(f"🎯 PE seed complete: {len(firms_df)} firms, {len(funds_df)} funds, {len(lps_df)} LPs, {len(companies_df)} cos, {len(investments_df)} deals")


In [None]:
# 🤖 SNOWFLAKE SYNTHETIC DATA GENERATION (PE)

def _synthesize_seed_table(table_name, join_keys):
    """Call GENERATE_SYNTHETIC_DATA for one seed table, writing `<table_name>_SYNTHETIC`."""
    database = CONFIG['database']
    source_table = f"{database}.{CONFIG['schema']}.{table_name}"
    output_table = f"{database}.{CONFIG['synth_schema']}.{table_name}_SYNTHETIC"
    # Renders e.g. 'LP_ID': {'join_key': true}, one entry per join key.
    column_spec = ', '.join(f"'{key}': {{'join_key': true}}" for key in join_keys)
    replace_flag = str(CONFIG['replace_output_tables']).lower()
    filter_flag = str(CONFIG['enable_privacy_filter']).lower()
    session.sql(f"""
        CALL SNOWFLAKE.DATA_PRIVACY.GENERATE_SYNTHETIC_DATA({{
            'datasets': [{{
                'input_table': '{source_table}',
                'output_table': '{output_table}',
                'columns': {{ {column_spec} }}
            }}],
            'consistency_secret': SYSTEM$REFERENCE('SECRET', 'PE_CONSISTENCY_SECRET', 'SESSION', 'READ')::STRING,
            'replace_output_tables': {replace_flag},
            'similarity_filter': {filter_flag}
        }});
    """).collect()


def _count_synthetic_rows(table_name):
    """Return the row count of `<table_name>_SYNTHETIC` in the synthetic schema."""
    qualified = f"{CONFIG['database']}.{CONFIG['synth_schema']}.{table_name}_SYNTHETIC"
    return session.sql(f"SELECT COUNT(*) FROM {qualified}").collect()[0][0]


def generate_synthetic_data():
    """Generate synthetic versions of the LP, fund and company seed tables.

    (Re)creates the PE_CONSISTENCY_SECRET symmetric key in the current schema,
    then runs GENERATE_SYNTHETIC_DATA once per table, in the order
    LIMITED_PARTNERS, PE_FUNDS, PORTFOLIO_COMPANIES, marking each table's ID
    columns as join keys. Finally prints the resulting row counts.

    Returns:
        bool: True when all calls succeeded; False after printing the error
        if anything raised.
    """
    print("🤖 Generating synthetic data for PE core entities...")
    # (seed table, join-key columns), in the order the calls are issued.
    targets = [
        ('LIMITED_PARTNERS', ['LP_ID']),
        ('PE_FUNDS', ['FUND_ID', 'FIRM_ID']),
        ('PORTFOLIO_COMPANIES', ['COMPANY_ID']),
    ]
    try:
        session.sql("""
            CREATE OR REPLACE SECRET PE_CONSISTENCY_SECRET
            TYPE = SYMMETRIC_KEY
            ALGORITHM = GENERIC
        """).collect()

        for table_name, join_keys in targets:
            _synthesize_seed_table(table_name, join_keys)

        lps = _count_synthetic_rows('LIMITED_PARTNERS')
        funds = _count_synthetic_rows('PE_FUNDS')
        cos = _count_synthetic_rows('PORTFOLIO_COMPANIES')
        print(f"✅ Synthetic LPs: {lps:,}  Funds: {funds:,}  Companies: {cos:,}")
        return True
    except Exception as e:
        print(f"❌ Synthetic data generation failed: {e}")
        return False

# Kick off generation when the cell runs; the boolean result is not checked here.
print("🌱 Ready to generate synthetic PE data...")
generate_synthetic_data()


In [None]:
# 🚀 MULTI-RUN SCALING (PE)

def scale_companies():
    """Rebuild PORTFOLIO_COMPANIES_SYNTHETIC from several generation runs.

    Compares the current synthetic row count with CONFIG['target_companies'].
    If more rows are needed, runs GENERATE_SYNTHETIC_DATA up to 10 times
    against the seed table, each into its own `_BATCH_<i>` table, then
    replaces the synthetic table with the union of all batches. COMPANY_ID is
    renumbered per batch ('CO' + six-digit zero-padded number, offset by the
    batch index times the seed size) so IDs do not collide across batches.
    The batch tables are dropped afterwards.

    Because of the 10-run cap, the final row count can fall short of the
    configured target.

    Returns:
        bool: True on success or when the target is already met; False after
        printing the error if anything raised.
    """
    print("🚀 Scaling portfolio companies via batch runs...")
    try:
        seed_table = f"{CONFIG['database']}.{CONFIG['schema']}.PORTFOLIO_COMPANIES"
        synth_table = f"{CONFIG['database']}.{CONFIG['synth_schema']}.PORTFOLIO_COMPANIES_SYNTHETIC"

        seed_size = session.sql(f"SELECT COUNT(*) FROM {seed_table}").collect()[0][0]
        current = session.sql(f"SELECT COUNT(*) FROM {synth_table}").collect()[0][0]
        needed = max(0, CONFIG['target_companies'] - current)
        if needed <= 0:
            print("   ✅ Target already met")
            return True

        iterations = min(10, (needed // max(1, seed_size)) + 1)

        # Track the batch tables created by THIS run. Rediscovering them with
        # SHOW TABLES LIKE would also pick up leftovers from earlier runs and
        # depend on the listing order.
        batch_tables = []
        for i in range(1, iterations + 1):
            batch_table = f"{synth_table}_BATCH_{i}"
            session.sql(f"""
                CALL SNOWFLAKE.DATA_PRIVACY.GENERATE_SYNTHETIC_DATA({{
                    'datasets': [{{
                        'input_table': '{seed_table}',
                        'output_table': '{batch_table}',
                        'columns': {{ 'COMPANY_ID': {{'join_key': true}} }}
                    }}],
                    'consistency_secret': SYSTEM$REFERENCE('SECRET', 'PE_CONSISTENCY_SECRET', 'SESSION', 'READ')::STRING,
                    'replace_output_tables': true,
                    'similarity_filter': false
                }});
            """).collect()
            batch_tables.append(batch_table)

        # Renumber COMPANY_ID while selecting: Snowflake does not accept a
        # window function in an UPDATE ... SET clause, so the renumbering is
        # done in the SELECT that feeds the combined table instead.
        # Assumes each batch holds at most seed_size rows, so the per-batch
        # offsets keep the IDs unique — TODO confirm.
        renumbered_selects = [
            f"SELECT * REPLACE (CONCAT('CO', LPAD((ROW_NUMBER() OVER (ORDER BY COMPANY_ID) + {index * seed_size})::STRING, 6, '0')) AS COMPANY_ID) "
            f"FROM {batch_table}"
            for index, batch_table in enumerate(batch_tables)
        ]
        # CREATE OR REPLACE swaps the table in a single statement, so a
        # failure cannot leave the synthetic table dropped with no replacement.
        session.sql(
            f"CREATE OR REPLACE TABLE {synth_table} AS\n"
            + "\nUNION ALL\n".join(renumbered_selects)
        ).collect()

        for batch_table in batch_tables:
            session.sql(f"DROP TABLE IF EXISTS {batch_table}").collect()

        final = session.sql(f"SELECT COUNT(*) FROM {synth_table}").collect()[0][0]
        print(f"📈 Final companies: {final:,}")
        return True
    except Exception as e:
        print(f"❌ Scaling failed: {e}")
        return False

# Run the optional scaling step when the cell executes; the boolean result is not checked here.
scale_companies()


## 🎯 Private Equity Synthetic Data Generator v1.0 - Complete!

### 🏆 What We've Built

Synthetic PE data using Snowflake `GENERATE_SYNTHETIC_DATA` for:
- LPs, funds, portfolio companies
- Join-key consistency and privacy filters
- Optional scaling for portfolio companies

### 📊 Check Counts Printed During Generation
- Limited Partners
- Funds
- Portfolio Companies

### 🔮 Next Steps
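1. Populate seed data and add synthetic generation for `CAPITAL_CALLS`, `DISTRIBUTIONS`, and `VALUATIONS` (this notebook creates these tables but leaves them empty), and add synthetic generation for `INVESTMENTS`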
2. Model cashflow waterfalls and DPI/TVPI/IRR metrics
3. Extend to quarterly valuations with sector-specific patterns

### 🚀 Usage
1. Run setup/environment
2. Run seed creation cells
3. Run synthetic generation
4. Optionally run scaling
