# Generate Cold Start Customer Data

This notebook generates realistic cold start customer data for training a CLV prediction model.

**Cold Start Scenario**: New customers with minimal behavioral history (first 7-30 days).

We'll generate 50,000 customer records with acquisition, demographic, and early-engagement features, plus a simulated 12-month LTV value (`actual_12m_ltv`) for each customer.

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F

# Seed both random sources used by this notebook (NumPy's global generator
# and the stdlib `random` module) so repeated runs yield the same data.
np.random.seed(42)
random.seed(42)

# Pick up the currently active Snowpark session.
session = get_active_session()

## Configuration

Set your database and schema here:

In [None]:
# Target location for the generated table; edit these two values as needed.
DATABASE = 'ML_DEMO'
SCHEMA = 'PUBLIC'

# Switch the session over to that database and schema.
session.use_database(DATABASE)
session.use_schema(SCHEMA)

# Echo the resulting context so it can be checked at a glance.
print(f"Using database: {DATABASE}")
print(f"Using schema: {SCHEMA}")
active_warehouse = session.get_current_warehouse()
print(f"Current warehouse: {active_warehouse}")

## Data Generation Parameters

In [None]:
# Number of synthetic customers to generate.
NUM_CUSTOMERS = 50_000

# Signup dates are drawn from this window (both end points included).
START_DATE = datetime(year=2023, month=1, day=1)
END_DATE = datetime(year=2024, month=6, day=30)

## Feature Generation

### Acquisition Features
**Why these matter**: Acquisition channel and source indicate customer quality and intent. Paid ads often bring price-sensitive customers, while organic search may bring higher-intent customers.

In [None]:
# Acquisition channel -> sampling probability (the shares add up to 1).
_channel_mix = {
    'organic_search': 0.25,
    'paid_search': 0.20,
    'social_media': 0.18,
    'email': 0.12,
    'direct': 0.10,
    'referral': 0.10,
    'affiliate': 0.05,
}
channels = list(_channel_mix)
channel_weights = list(_channel_mix.values())

# Acquisition sources; no weights are defined for these.
sources = [
    'google',
    'facebook',
    'instagram',
    'email_campaign',
    'partner_site',
    'direct_url',
    'friend_referral',
]

# Device type -> sampling probability.
_device_mix = {'desktop': 0.45, 'mobile': 0.45, 'tablet': 0.10}
devices = list(_device_mix)
device_weights = list(_device_mix.values())

### Demographic Features
**Why these matter**: Age and geography correlate with spending power and product preferences. Urban customers often have higher CLV.

In [None]:
# Age bracket -> sampling probability (the shares add up to 1).
_age_mix = {
    '18-24': 0.12,
    '25-34': 0.28,
    '35-44': 0.25,
    '45-54': 0.18,
    '55-64': 0.12,
    '65+': 0.05,
}
age_groups = list(_age_mix)
age_weights = list(_age_mix.values())

# Region -> sampling probability.
_region_mix = {
    'Northeast': 0.22,
    'Southeast': 0.20,
    'Midwest': 0.18,
    'Southwest': 0.18,
    'West': 0.22,
}
regions = list(_region_mix)
region_weights = list(_region_mix.values())

# Area type -> sampling probability.
_area_mix = {'urban': 0.40, 'suburban': 0.45, 'rural': 0.15}
area_types = list(_area_mix)
area_weights = list(_area_mix.values())

## Generate Customer Records

In [None]:
# ---------------------------------------------------------------------------
# Lookup tables. These never change between customers, so they are built once
# here instead of being re-created on every loop iteration.
# ---------------------------------------------------------------------------

# Per-channel multiplier applied to both the first-purchase amount and base LTV.
CHANNEL_QUALITY = {
    'organic_search': 1.3,
    'paid_search': 0.9,
    'social_media': 0.8,
    'email': 1.2,
    'direct': 1.4,
    'referral': 1.5,
    'affiliate': 1.0,
}

# Per-age-bracket multiplier (same use as CHANNEL_QUALITY).
AGE_FACTOR = {
    '18-24': 0.7,
    '25-34': 1.2,
    '35-44': 1.4,
    '45-54': 1.3,
    '55-64': 1.1,
    '65+': 0.9,
}

# Per-area-type multiplier (same use as CHANNEL_QUALITY).
AREA_FACTOR = {'urban': 1.2, 'suburban': 1.0, 'rural': 0.85}

# Channels whose simulated time-to-first-purchase is doubled.
SLOW_CONVERSION_CHANNELS = ('paid_search', 'social_media')

# Categories the first purchase is drawn from (uniformly).
product_categories = ['electronics', 'clothing', 'home_goods', 'beauty', 'sports', 'books']

# Length of the signup window in days; randint below includes both end points.
signup_window_days = (END_DATE - START_DATE).days

data = []

# NOTE: the order of the random draws below is deliberate. Both the stdlib
# `random` stream and NumPy's global stream are seeded, so adding, removing
# or reordering a draw changes every generated value after it.
for customer_id in range(1, NUM_CUSTOMERS + 1):
    signup_date = START_DATE + timedelta(days=random.randint(0, signup_window_days))

    # Acquisition attributes. The source is drawn uniformly and independently
    # of the channel.
    channel = np.random.choice(channels, p=channel_weights)
    source = np.random.choice(sources)
    device = np.random.choice(devices, p=device_weights)

    # Demographic attributes.
    age_group = np.random.choice(age_groups, p=age_weights)
    region = np.random.choice(regions, p=region_weights)
    area_type = np.random.choice(area_types, p=area_weights)

    channel_quality = CHANNEL_QUALITY[channel]
    age_factor = AGE_FACTOR[age_group]
    area_factor = AREA_FACTOR[area_type]

    # Exponential delay (mean 3 days, doubled for slow channels), truncated to
    # whole days and capped at 30.
    delay_scale = 2.0 if channel in SLOW_CONVERSION_CHANNELS else 1.0
    days_to_first_purchase = max(0, int(np.random.exponential(3) * delay_scale))
    days_to_first_purchase = min(days_to_first_purchase, 30)

    # A delay of at most 14 days always counts as a purchase; longer delays
    # convert with probability 0.65. The short-circuit `or` matters: the
    # `random.random()` draw is consumed only for the longer delays.
    made_first_purchase = days_to_first_purchase <= 14 or random.random() < 0.65

    if made_first_purchase:
        first_purchase_date = signup_date + timedelta(days=days_to_first_purchase)

        # Gamma-distributed base amount scaled by the three multipliers, then
        # clamped to the range [10, 500] and rounded to cents.
        base_amount = np.random.gamma(shape=2, scale=30)
        first_purchase_amount = base_amount * channel_quality * age_factor * area_factor
        first_purchase_amount = round(max(10, min(first_purchase_amount, 500)), 2)

        first_purchase_category = np.random.choice(product_categories)
    else:
        first_purchase_date = None
        first_purchase_amount = 0
        first_purchase_category = None

    # 30-day engagement counts; purchasers get higher Poisson rates.
    website_visits_30d = int(np.random.poisson(lam=8 if made_first_purchase else 3))
    email_opens_30d = int(np.random.poisson(lam=3 if made_first_purchase else 1))
    # Clicks cannot exceed opens: each open is clicked with probability 0.3.
    email_clicks_30d = int(np.random.binomial(n=email_opens_30d, p=0.3))

    items_viewed_30d = int(np.random.poisson(lam=15 if made_first_purchase else 5))
    cart_adds_30d = int(np.random.poisson(lam=4 if made_first_purchase else 1))

    # Simulated 12-month LTV: a base value from the multipliers, plus purchase
    # and engagement terms for purchasers, or a much smaller value otherwise.
    base_ltv = 200 * channel_quality * age_factor * area_factor

    if made_first_purchase:
        purchase_boost = first_purchase_amount * 4
        engagement_boost = (website_visits_30d * 2 + email_clicks_30d * 5 + cart_adds_30d * 3)
        actual_12m_ltv = base_ltv + purchase_boost + engagement_boost
    else:
        actual_12m_ltv = base_ltv * 0.1 + (website_visits_30d * 1 + email_opens_30d * 2)

    # Multiplicative noise of +/-30%, floored at 0 and rounded to cents.
    actual_12m_ltv = actual_12m_ltv * np.random.uniform(0.7, 1.3)
    actual_12m_ltv = round(max(0, actual_12m_ltv), 2)

    data.append({
        'customer_id': customer_id,
        'signup_date': signup_date,
        'acquisition_channel': channel,
        'acquisition_source': source,
        'device_type': device,
        'age_group': age_group,
        'region': region,
        'area_type': area_type,
        # Left empty for customers who never purchased.
        'days_to_first_purchase': days_to_first_purchase if made_first_purchase else None,
        'first_purchase_date': first_purchase_date,
        'first_purchase_amount': first_purchase_amount,
        'first_purchase_category': first_purchase_category,
        'website_visits_30d': website_visits_30d,
        'email_opens_30d': email_opens_30d,
        'email_clicks_30d': email_clicks_30d,
        'items_viewed_30d': items_viewed_30d,
        'cart_adds_30d': cart_adds_30d,
        'actual_12m_ltv': actual_12m_ltv,
    })

# One row per customer, columns in the key order used above.
df = pd.DataFrame(data)

## Data Quality Checks

In [None]:
# Headline numbers for a quick sanity check of the generated data.
purchase_mask = df['first_purchase_amount'] > 0
purchase_amounts = df.loc[purchase_mask, 'first_purchase_amount']
ltv_values = df['actual_12m_ltv']

print(f"Total customers: {len(df)}")
print(f"Customers with first purchase: {purchase_mask.sum()}")
print(f"Average first purchase amount: ${purchase_amounts.mean():.2f}")
print(f"Average 12-month LTV: ${ltv_values.mean():.2f}")
print(f"Median 12-month LTV: ${ltv_values.median():.2f}")

# Row counts per category for the two main sampled dimensions.
channel_counts = df['acquisition_channel'].value_counts()
print("\nChannel distribution:")
print(channel_counts)

age_group_counts = df['age_group'].value_counts()
print("\nAge group distribution:")
print(age_group_counts)

# Last expression of the cell: preview the first 10 rows.
df.head(10)

## Save to Snowflake Table

In [None]:
# Upper-case the pandas column labels (in place) before the upload.
df.columns = [label.upper() for label in df.columns]

# Name of the destination table.
table_name = 'COLDSTART_CUSTOMERS'

# Turn the pandas frame into a Snowpark DataFrame.
coldstart_df = session.create_dataframe(df)

# Persist it using the writer's 'overwrite' mode.
table_writer = coldstart_df.write.mode('overwrite')
table_writer.save_as_table(table_name)

print(f"\n✓ Data saved to {DATABASE}.{SCHEMA}.{table_name}")
print(f"  Total rows: {coldstart_df.count()}")

# Cross-check by counting the rows of the stored table itself.
result = session.sql(f"SELECT COUNT(*) as row_count FROM {table_name}").collect()
count_row = result[0]
print(f"  Verified row count: {count_row['ROW_COUNT']}")

## Query and Display Sample Data from Snowflake

In [None]:
# Pull 10 rows of the saved table back into pandas for display.
saved_table = session.table(table_name)
sample_data = saved_table.limit(10).to_pandas()

print("\nSample data from Snowflake:")
# Last expression of the cell, so the notebook renders the frame.
sample_data

## Summary Statistics

In [None]:
# Show pandas' default descriptive statistics for the generated DataFrame.
# Note: once the save cell has run, `df` carries upper-case column names.
df.describe()