# Financial Services ML Pipeline - Data Generation
## Running in Snowflake Notebooks

This notebook generates synthetic financial services data directly in Snowflake using Snowpark and SQL.

## What We'll Build
- 50,000 realistic client profiles
- Up to 7.5M marketing events (at most 150 per client, thinned by an activity filter) across multiple channels
- Advisor directory and relationship data

## Prerequisites
- Snowflake account with Snowpark enabled
- Database and schema created (run snowflake_setup.sql first)
- ML_WAREHOUSE available


In [None]:
# Import required libraries
import snowflake.snowpark as snowpark
from snowflake.snowpark import Session
from snowflake.snowpark.functions import *
from snowflake.snowpark.types import *
import pandas as pd
import numpy as np

# Get the active Snowflake session.
# Use the public context helper rather than the private
# `snowpark.session._get_active_session()`, which is an internal name and
# may change between Snowpark releases.
from snowflake.snowpark.context import get_active_session

session = get_active_session()

# Echo the connection context so it is obvious where the tables below will
# be created.
print("Connected to Snowflake!")
print(f"Current database: {session.get_current_database()}")
print(f"Current schema: {session.get_current_schema()}")
print(f"Current warehouse: {session.get_current_warehouse()}")


## Step 1: Generate Client Demographics


In [None]:
# Generate 50,000 synthetic clients using Snowflake SQL.
#
# Notes on the random sampling:
#   * UNIFORM(0, 1, RANDOM()) with integer bounds returns only the integers
#     0 or 1, so probability thresholds such as "< 0.48" degenerate into a
#     coin flip. Bounds are cast to FLOAT to get a value in [0, 1].
#   * Every RANDOM() call is an independent draw. A CASE with cumulative
#     thresholds (0.48, 0.96, ...) must compare ONE draw against all
#     thresholds, so each draw is materialised once in `client_draws` and
#     reused in `client_demographics`.
#   * No trailing ';' in the query: it is unsafe if the statement is embedded
#     as a subquery when the DataFrame is saved as a table.
clients_sql = """
WITH client_draws AS (
  -- One row per client; every random quantity is drawn exactly once here
  SELECT
    'client_' || LPAD(seq8(), 8, '0') as client_id,
    CURRENT_TIMESTAMP() as created_date,

    -- Age ~ Normal(45, 12); clamped to [25, 70] below
    ROUND(NORMAL(45, 12, RANDOM()) + 0.5) as age_raw,

    -- Income with log-normal distribution; clamped to [25k, 500k] below
    ROUND(EXP(NORMAL(10.8, 0.5, RANDOM()))) as annual_income_raw,

    -- Single uniform draws in [0, 1] for the categorical columns
    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as gender_draw,
    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as tier_draw

  FROM TABLE(GENERATOR(ROWCOUNT => 50000))
),

client_demographics AS (
  SELECT
    client_id,
    created_date,

    CASE
      WHEN age_raw < 25 THEN 25
      WHEN age_raw > 70 THEN 70
      ELSE age_raw
    END as age,

    -- 48% M / 48% F / 4% Other (cumulative thresholds on one draw)
    CASE
      WHEN gender_draw < 0.48 THEN 'M'
      WHEN gender_draw < 0.96 THEN 'F'
      ELSE 'Other'
    END as gender,

    CASE
      WHEN annual_income_raw < 25000 THEN 25000
      WHEN annual_income_raw > 500000 THEN 500000
      ELSE annual_income_raw
    END as annual_income,

    -- 60% Basic / 30% Premium / 10% Elite (cumulative thresholds on one draw)
    CASE
      WHEN tier_draw < 0.60 THEN 'Basic'
      WHEN tier_draw < 0.90 THEN 'Premium'
      ELSE 'Elite'
    END as service_tier

  FROM client_draws
)

SELECT 
  client_id,
  created_date,
  age,
  gender,
  'Married' as marital_status,
  'Bachelor' as education_level,
  'Employed' as employment_status,
  'Software Engineer' as occupation,
  annual_income,
  'CA' as state,
  'San Francisco' as city,
  '94105' as zip_code,
  -- Calculate 401k balance based on age and income
  ROUND((age - 25) * annual_income * 0.08 * POWER(1.07, (age - 25) / 2)) as current_401k_balance,
  GREATEST(0, 65 - age) as years_to_retirement,
  'Moderate' as risk_tolerance,
  'Intermediate' as investment_experience,
  ARRAY_CONSTRUCT('Retirement Planning', 'Wealth Building') as financial_goals,
  UNIFORM(1, 60, RANDOM()) as client_tenure_months,
  'ADV_' || LPAD(UNIFORM(1, 50, RANDOM()), 3, '0') as assigned_advisor_id,
  service_tier,
  -- Explicit FLOAT bounds so the multiplier is fractional (1.0x - 3.5x)
  ROUND(current_401k_balance * UNIFORM(1.0::FLOAT, 3.5::FLOAT, RANDOM())) as total_assets_under_management,
  'Email' as preferred_contact_method,
  DATEADD(day, -UNIFORM(1, 90, RANDOM()), CURRENT_DATE()) as last_contact_date,
  'Monthly' as communication_frequency_preference
  
FROM client_demographics
"""

# Execute and store as table (replaces any existing `clients` table)
clients_df = session.sql(clients_sql)
clients_df.write.mode("overwrite").save_as_table("clients")

# Confirm how many rows landed in the `clients` table
count_rows = session.sql("SELECT COUNT(*) as count FROM clients").collect()
client_count = count_rows[0]['COUNT']
print(f"✓ Generated {client_count:,} synthetic clients")

# Preview a handful of rows
print("\nSample client data:")
sample_clients = session.sql("SELECT * FROM clients LIMIT 5")
sample_clients.show()


## Step 3: Generate Marketing Events Data


In [None]:
# Generate comprehensive marketing events data with ALL required columns
#
# Notes on the random sampling:
#   * UNIFORM(0, 1, RANDOM()) with integer bounds returns only 0 or 1, which
#     turns every probability threshold into a coin flip. All probability
#     draws therefore use FLOAT bounds.
#   * Each RANDOM() call is an independent draw. Categorical columns that
#     use cumulative thresholds (event type, device, page, session length,
#     campaign) compare ONE materialised draw against all thresholds;
#     otherwise the resulting mix does not match the stated percentages.
#   * event_id uses UUID_STRING(): picking a random integer out of 10M for
#     millions of rows would produce many duplicate ids.
print("📈 Generating marketing events data with time_on_page and all features...")

# First, drop the existing incomplete table
session.sql("DROP TABLE IF EXISTS marketing_events").collect()
print("🗑️ Dropped existing marketing_events table")

# Create marketing events with ALL columns needed for feature engineering
marketing_events_sql = """
CREATE TABLE marketing_events AS
WITH client_sample AS (
  SELECT client_id, annual_income, current_401k_balance, age, service_tier
  FROM clients
),

-- Stage 1: one row per kept event, with every shared random draw made once
event_draws AS (
  SELECT
    'event_' || UUID_STRING() as event_id,
    cs.client_id,
    cs.annual_income,
    cs.current_401k_balance,
    cs.service_tier,

    -- Event timing: 129600 minutes = 90 days back from now
    DATEADD(
      minute,
      -UNIFORM(1, 129600, RANDOM()),
      CURRENT_TIMESTAMP()
    ) as event_timestamp,

    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as type_draw,
    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as device_draw,
    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as page_draw,
    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as duration_draw,
    UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) as campaign_draw

  FROM client_sample cs,
       TABLE(GENERATOR(ROWCOUNT => 150)) -- Up to 150 candidate events per client

  -- Activity filter: each candidate event is kept with a probability that
  -- depends on the client, giving a realistic event frequency distribution
  WHERE UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) <
    CASE
      WHEN cs.service_tier = 'Elite' THEN 0.95    -- Elite clients very active
      WHEN cs.service_tier = 'Premium' THEN 0.75  -- Premium clients moderately active
      WHEN cs.annual_income > 100000 THEN 0.65    -- High income clients active
      WHEN cs.age BETWEEN 35 AND 55 THEN 0.60     -- Mid-career clients active
      ELSE 0.45                                   -- Others less active
    END
),

-- Stage 2: resolve the event type, since most other columns depend on it
typed_events AS (
  SELECT
    d.*,
    -- Cumulative thresholds: 35% web_visit, 20% email_open, 15% page_view,
    -- 10% email_click, 8% search, 6% login, 3% document_download,
    -- 3% advisor_meeting
    CASE
      WHEN type_draw < 0.35 THEN 'web_visit'
      WHEN type_draw < 0.55 THEN 'email_open'
      WHEN type_draw < 0.70 THEN 'page_view'
      WHEN type_draw < 0.80 THEN 'email_click'
      WHEN type_draw < 0.88 THEN 'search'
      WHEN type_draw < 0.94 THEN 'login'
      WHEN type_draw < 0.97 THEN 'document_download'
      ELSE 'advisor_meeting'
    END as event_type
  FROM event_draws d
),

-- Stage 3: derive the remaining columns (same output columns and order as before)
event_generation AS (
  SELECT
    event_id,
    client_id,
    event_timestamp,
    event_type,

    -- Event categories
    CASE
      WHEN event_type IN ('web_visit', 'page_view', 'search', 'login') THEN 'Digital'
      WHEN event_type IN ('email_open', 'email_click') THEN 'Email'
      WHEN event_type = 'document_download' THEN 'Educational'
      WHEN event_type = 'advisor_meeting' THEN 'Personal'
      ELSE 'Other'
    END as event_category,

    -- Channels
    CASE
      WHEN event_type IN ('web_visit', 'page_view', 'search', 'login') THEN 'Website'
      WHEN event_type IN ('email_open', 'email_click') THEN 'Email'
      WHEN event_type = 'document_download' THEN 'Portal'
      WHEN event_type = 'advisor_meeting' THEN 'In-Person'
      ELSE 'Other'
    END as channel,

    -- Device types: 55% Desktop / 35% Mobile / 10% Tablet
    CASE
      WHEN device_draw < 0.55 THEN 'Desktop'
      WHEN device_draw < 0.90 THEN 'Mobile'
      ELSE 'Tablet'
    END as device_type,

    -- Page/Content URLs (only for web_visit and page_view events)
    CASE event_type
      WHEN 'web_visit' THEN
        CASE
          WHEN page_draw < 0.2 THEN '/home'
          WHEN page_draw < 0.4 THEN '/retirement-planning'
          WHEN page_draw < 0.6 THEN '/401k-services'
          WHEN page_draw < 0.75 THEN '/investment-options'
          WHEN page_draw < 0.85 THEN '/advisor-directory'
          WHEN page_draw < 0.95 THEN '/calculators/retirement'
          ELSE '/contact'
        END
      WHEN 'page_view' THEN
        CASE
          WHEN page_draw < 0.3 THEN '/dashboard'
          WHEN page_draw < 0.5 THEN '/portfolio'
          WHEN page_draw < 0.7 THEN '/transactions'
          WHEN page_draw < 0.85 THEN '/reports'
          ELSE '/settings'
        END
      ELSE NULL
    END as page_url,

    -- *** TIME_ON_PAGE - This is the key column for feature engineering ***
    -- Values are whole numbers drawn with integer bounds; NULL for event
    -- types without a session
    CASE
      WHEN event_type IN ('web_visit', 'page_view') THEN
        -- Realistic time distributions: quick visits vs engaged sessions
        CASE
          WHEN duration_draw < 0.3 THEN UNIFORM(15, 60, RANDOM())    -- Quick visits (30%)
          WHEN duration_draw < 0.7 THEN UNIFORM(60, 300, RANDOM())   -- Normal sessions (40%)
          ELSE UNIFORM(300, 900, RANDOM())                           -- Engaged sessions (30%)
        END
      WHEN event_type = 'search' THEN UNIFORM(10, 120, RANDOM())     -- Search sessions
      WHEN event_type = 'login' THEN UNIFORM(60, 1800, RANDOM())     -- Portal sessions
      ELSE NULL
    END as time_on_page,

    -- Email campaign details (only for email events)
    CASE
      WHEN event_type IN ('email_open', 'email_click') THEN
        CASE
          WHEN campaign_draw < 0.3 THEN 'Weekly Market Update'
          WHEN campaign_draw < 0.5 THEN 'Retirement Tips Newsletter'
          WHEN campaign_draw < 0.7 THEN 'Investment Opportunity Alert'
          WHEN campaign_draw < 0.85 THEN 'Educational Webinar Invitation'
          ELSE 'Account Review Reminder'
        END
      ELSE NULL
    END as campaign_name,

    -- Touchpoint value (weighted by client tier and event type).
    -- Bounds are cast to FLOAT so UNIFORM yields a fractional value.
    ROUND(
      CASE event_type
        WHEN 'advisor_meeting' THEN UNIFORM(0.8::FLOAT, 1.0::FLOAT, RANDOM())
        WHEN 'document_download' THEN UNIFORM(0.6::FLOAT, 0.9::FLOAT, RANDOM())
        WHEN 'email_click' THEN UNIFORM(0.4::FLOAT, 0.7::FLOAT, RANDOM())
        WHEN 'login' THEN UNIFORM(0.3::FLOAT, 0.6::FLOAT, RANDOM())
        WHEN 'web_visit' THEN UNIFORM(0.2::FLOAT, 0.5::FLOAT, RANDOM())
        WHEN 'page_view' THEN UNIFORM(0.1::FLOAT, 0.4::FLOAT, RANDOM())
        WHEN 'email_open' THEN UNIFORM(0.1::FLOAT, 0.3::FLOAT, RANDOM())
        ELSE UNIFORM(0.1::FLOAT, 0.5::FLOAT, RANDOM())
      END *
      CASE
        WHEN service_tier = 'Elite' THEN 1.5
        WHEN service_tier = 'Premium' THEN 1.2
        ELSE 1.0
      END, 4
    ) as touchpoint_value,

    -- Conversion flags (higher probability for valuable events and clients).
    -- Each rule keeps its own independent draw, as in the original design;
    -- only the bounds changed (FLOAT) so the small probabilities take effect.
    CASE
      WHEN event_type = 'advisor_meeting' AND UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) < 0.15 THEN TRUE
      WHEN event_type = 'document_download' AND annual_income > 100000 AND UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) < 0.08 THEN TRUE
      WHEN event_type = 'email_click' AND current_401k_balance > 100000 AND UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) < 0.05 THEN TRUE
      WHEN service_tier = 'Elite' AND UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) < 0.03 THEN TRUE
      ELSE FALSE
    END as conversion_flag

  FROM typed_events
)

SELECT * FROM event_generation
WHERE event_timestamp >= DATEADD(day, -90, CURRENT_TIMESTAMP())
ORDER BY client_id, event_timestamp
"""

# Execute marketing events generation
session.sql(marketing_events_sql).collect()

# Verify results and show table structure.
# Both counts come from a single scan of the table.
counts_row = session.sql("""
    SELECT 
        COUNT(*) as event_count,
        COUNT(DISTINCT client_id) as unique_clients
    FROM marketing_events
""").collect()[0]
event_count = counts_row['EVENT_COUNT']
unique_clients = counts_row['UNIQUE_CLIENTS']

print(f"✅ Generated {event_count:,} marketing events with ALL required columns")
print(f"📊 Events across {unique_clients:,} unique clients")
# Guard against an empty table (e.g. `clients` was empty) so the summary
# does not fail with ZeroDivisionError.
if unique_clients:
    print(f"📈 Average events per client: {event_count/unique_clients:.1f}")
else:
    print("📈 Average events per client: n/a (no events generated)")

# Show the table structure to confirm all columns exist
print("\n🔍 Marketing Events Table Structure (with time_on_page):")
session.sql("DESCRIBE TABLE marketing_events").show()

# Show time_on_page statistics per event type (only event types that carry
# a time_on_page value appear here)
print("\n⏰ Time on Page Statistics:")
session.sql("""
    SELECT 
        event_type,
        COUNT(*) as events_with_time,
        ROUND(AVG(time_on_page), 1) as avg_time_seconds,
        ROUND(MIN(time_on_page), 1) as min_time_seconds,
        ROUND(MAX(time_on_page), 1) as max_time_seconds
    FROM marketing_events 
    WHERE time_on_page IS NOT NULL
    GROUP BY event_type
    ORDER BY avg_time_seconds DESC
""").show()

# Show the ten most recent events that have a time_on_page value
print("\n📋 Sample events with time_on_page:")
session.sql("""
    SELECT event_type, channel, time_on_page, touchpoint_value, conversion_flag
    FROM marketing_events 
    WHERE time_on_page IS NOT NULL
    ORDER BY event_timestamp DESC 
    LIMIT 10
""").show()
