# Financial Services ML Pipeline - Data Generation
## Running in Snowflake Notebooks

This notebook generates synthetic financial services data directly in Snowflake using Snowpark and SQL.

## What We'll Build
- 50,000 realistic client profiles
- 7.5M marketing events across multiple channels
- Advisor directory and relationship data

## Prerequisites
- Snowflake account with Snowpark enabled
- Database and schema created (run `snowflake_setup.sql` first)
- Warehouse `ML_WAREHOUSE` available


In [None]:
# Import required libraries
import snowflake.snowpark as snowpark
from snowflake.snowpark import Session
from snowflake.snowpark.functions import *
from snowflake.snowpark.types import *
import pandas as pd
import numpy as np

# Get the active Snowflake session.
# Use the public `get_active_session()` helper instead of the private
# `snowpark.session._get_active_session()`: underscore-prefixed names are
# internal to Snowpark and may change between releases.
from snowflake.snowpark.context import get_active_session

session = get_active_session()

# Echo the session context so it is obvious where the generated tables land.
print("Connected to Snowflake!")
print(f"Current database: {session.get_current_database()}")
print(f"Current schema: {session.get_current_schema()}")
print(f"Current warehouse: {session.get_current_warehouse()}")


## Step 1: Generate Client Demographics


In [None]:
# Generate 50,000 synthetic clients using Snowflake SQL.
#
# Notes on the random draws:
# - UNIFORM() returns an INTEGER when both bounds are integers, so
#   UNIFORM(0, 1, RANDOM()) only ever yields 0 or 1. Probability draws must
#   use FLOAT bounds to produce values in [0, 1].
# - Each categorical attribute uses ONE draw per row that is compared against
#   cumulative thresholds. Calling RANDOM() again in every WHEN branch makes
#   the branches independent and skews the resulting distribution.
# - The statement has no trailing semicolon because save_as_table() embeds it
#   in a larger CREATE TABLE ... AS SELECT statement.
clients_sql = """
WITH random_draws AS (
  -- One row per client with every raw random draw taken exactly once
  SELECT
    'client_' || LPAD(seq8(), 8, '0') as client_id,
    CURRENT_TIMESTAMP() as created_date,

    -- Demographics with realistic distributions
    ROUND(NORMAL(45, 12, RANDOM()) + 0.5) as age_raw,

    -- Income with log-normal distribution
    ROUND(EXP(NORMAL(10.8, 0.5, RANDOM()))) as annual_income_raw,

    -- Float draws in [0, 1] used for categorical sampling
    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as gender_draw,
    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as service_tier_draw

  FROM TABLE(GENERATOR(ROWCOUNT => 50000))
),

client_demographics AS (
  SELECT
    client_id,
    created_date,

    -- Clamp age to the 25-70 range
    CASE
      WHEN age_raw < 25 THEN 25
      WHEN age_raw > 70 THEN 70
      ELSE age_raw
    END as age,

    -- 48% M, 48% F, 4% Other (cumulative thresholds on a single draw)
    CASE
      WHEN gender_draw < 0.48 THEN 'M'
      WHEN gender_draw < 0.96 THEN 'F'
      ELSE 'Other'
    END as gender,

    -- Clamp income to the 25,000-500,000 range
    CASE
      WHEN annual_income_raw < 25000 THEN 25000
      WHEN annual_income_raw > 500000 THEN 500000
      ELSE annual_income_raw
    END as annual_income,

    -- 60% Basic, 30% Premium, 10% Elite (cumulative thresholds on a single draw)
    CASE
      WHEN service_tier_draw < 0.60 THEN 'Basic'
      WHEN service_tier_draw < 0.90 THEN 'Premium'
      ELSE 'Elite'
    END as service_tier

  FROM random_draws
)

SELECT
  client_id,
  created_date,
  age,
  gender,
  'Married' as marital_status,
  'Bachelor' as education_level,
  'Employed' as employment_status,
  'Software Engineer' as occupation,
  annual_income,
  'CA' as state,
  'San Francisco' as city,
  '94105' as zip_code,
  -- Calculate 401k balance based on age and income
  ROUND((age - 25) * annual_income * 0.08 * POWER(1.07, (age - 25) / 2)) as current_401k_balance,
  GREATEST(0, 65 - age) as years_to_retirement,
  'Moderate' as risk_tolerance,
  'Intermediate' as investment_experience,
  ARRAY_CONSTRUCT('Retirement Planning', 'Wealth Building') as financial_goals,
  UNIFORM(1, 60, RANDOM()) as client_tenure_months,
  'ADV_' || LPAD(UNIFORM(1, 50, RANDOM()), 3, '0') as assigned_advisor_id,
  service_tier,
  -- Explicit FLOAT bounds so the multiplier is a continuous value in [1.0, 3.5]
  ROUND(current_401k_balance * UNIFORM(1.0::FLOAT, 3.5::FLOAT, RANDOM())) as total_assets_under_management,
  'Email' as preferred_contact_method,
  DATEADD(day, -UNIFORM(1, 90, RANDOM()), CURRENT_DATE()) as last_contact_date,
  'Monthly' as communication_frequency_preference

FROM client_demographics
"""

# Execute and store as table (replaces any existing `clients` table)
clients_df = session.sql(clients_sql)
clients_df.write.mode("overwrite").save_as_table("clients")

# Unquoted alias `count` is upper-cased by Snowflake, hence the 'COUNT' key
client_count = session.sql("SELECT COUNT(*) as count FROM clients").collect()[0]['COUNT']
print(f"✓ Generated {client_count:,} synthetic clients")

# Show sample data
print("\nSample client data:")
session.sql("SELECT * FROM clients LIMIT 5").show()
