# Generate Continuous Prediction Customer Data

This notebook generates realistic data for customers with established transaction history.

**Continuous Scenario**: Customers with 3+ months of activity and rich behavioral data.

We'll generate:
- 50,000 customers
- 500,000+ raw transactions
- Customer interactions and engagement events
- The raw tables from which RFM and other features are derived in the follow-up notebook (`feature_engineering_continuous.ipynb`)

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F

# Fix the seeds of both generators used below (NumPy's global generator and
# the stdlib `random` module) so repeated runs draw the same values.
np.random.seed(42)
random.seed(42)

# Reuse the currently active Snowflake session.
session = get_active_session()

## Configuration

Set your database and schema here:

In [None]:
# Target location for every table this notebook writes.
DATABASE = 'ML_DEMO'
SCHEMA = 'PUBLIC'

# Point the session at that database and schema before anything is saved.
session.use_database(DATABASE)
session.use_schema(SCHEMA)

# Echo the active context, one item per line.
print(
    f"Using database: {DATABASE}",
    f"Using schema: {SCHEMA}",
    f"Current warehouse: {session.get_current_warehouse()}",
    sep="\n",
)

## Data Generation Parameters

In [None]:
# Number of synthetic customer profiles to create.
NUM_CUSTOMERS = 50_000
# Snapshot date: generated transactions and events never fall after it.
OBSERVATION_DATE = datetime(year=2024, month=6, day=30)
# Each customer's signup lies between MIN and MAX days before the snapshot.
MIN_HISTORY_DAYS, MAX_HISTORY_DAYS = 90, 540

## Generate Customer Profiles

In [None]:
# Value segments and the probability of a customer landing in each one.
customer_segments = ['high_value', 'medium_value', 'low_value', 'at_risk', 'churned']
segment_weights = [0.15, 0.35, 0.30, 0.12, 0.08]

# Demographic lookups. They never change, so they are defined once here
# rather than on every loop iteration.
age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
age_group_weights = [0.10, 0.30, 0.27, 0.18, 0.10, 0.05]
regions = ['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West']


def _build_customer_profile(customer_id):
    """Draw one customer's signup date, segment, age group and region.

    The random draws happen in a fixed order (history length, segment,
    age group, region) so the seeded output is stable.
    """
    history_days = random.randint(MIN_HISTORY_DAYS, MAX_HISTORY_DAYS)
    segment = np.random.choice(customer_segments, p=segment_weights)
    age_group = np.random.choice(age_groups, p=age_group_weights)
    region = np.random.choice(regions)  # uniform across regions

    return {
        'CUSTOMER_ID': customer_id,
        # Signup is back-dated from the snapshot by the drawn history length.
        'SIGNUP_DATE': OBSERVATION_DATE - timedelta(days=history_days),
        'AGE_GROUP': age_group,
        'REGION': region,
        'SEGMENT': segment,
        'HISTORY_DAYS': history_days,
    }


# Customer ids run from 1 to NUM_CUSTOMERS inclusive.
customers = [_build_customer_profile(cid) for cid in range(1, NUM_CUSTOMERS + 1)]

customers_df = pd.DataFrame(customers)
print(f"Generated {len(customers_df)} customer profiles")
customers_df.head()

## Generate Raw Transactions

**Why raw transactions**: In real-world scenarios, we don't have pre-computed features. We start with transactional data and must derive meaningful metrics.

In [None]:
transactions = []

product_categories = ['electronics', 'clothing', 'home_goods', 'beauty', 'sports', 'books', 'toys', 'grocery']

# Purchase behaviour per segment:
#   (transaction-count range, average-amount range, mean days between purchases)
# Keys are lower-case because that is how the SEGMENT column is populated
# ('high_value', 'medium_value', ...). Comparing against upper-case labels
# would silently route every customer to the fallback profile.
segment_purchase_profiles = {
    'high_value': ((20, 60), (100, 300), 7),
    'medium_value': ((8, 25), (50, 120), 15),
    'low_value': ((3, 10), (20, 60), 30),
    'at_risk': ((5, 15), (40, 100), 45),
}
# Used for 'churned' and for any label not listed above.
fallback_purchase_profile = ((1, 5), (25, 70), 60)

for customer in customers_df.itertuples(index=False):
    customer_id = customer.CUSTOMER_ID
    signup_date = customer.SIGNUP_DATE
    # Normalise the label so the lookup does not depend on its casing.
    segment = str(customer.SEGMENT).lower()

    count_range, amount_range, purchase_frequency_days = segment_purchase_profiles.get(
        segment, fallback_purchase_profile
    )
    # int() truncates, so the upper bound of the count range is exclusive.
    num_transactions = int(np.random.uniform(*count_range))
    avg_amount = np.random.uniform(*amount_range)

    # The purchase timeline starts within the first week after signup.
    current_date = signup_date + timedelta(days=random.randint(0, 7))

    for txn in range(num_transactions):
        # Exponentially distributed gap between purchases, at least one day.
        days_since_last = int(np.random.exponential(purchase_frequency_days))
        current_date = current_date + timedelta(days=max(1, days_since_last))

        # Dates only move forward, so once past the snapshot we can stop.
        if current_date > OBSERVATION_DATE:
            break

        # Gamma(shape=2, scale=avg/2) has mean avg_amount; floor at 5.
        amount = max(5, np.random.gamma(shape=2, scale=avg_amount/2))
        amount = round(amount, 2)

        category = np.random.choice(product_categories)
        # +1 guarantees at least one unit per transaction.
        quantity = int(np.random.poisson(lam=2) + 1)

        transactions.append({
            # Sequential id across all customers, starting at 1.
            'TRANSACTION_ID': len(transactions) + 1,
            'CUSTOMER_ID': customer_id,
            'TRANSACTION_DATE': current_date,
            'AMOUNT': amount,
            'PRODUCT_CATEGORY': category,
            'QUANTITY': quantity
        })

transactions_df = pd.DataFrame(transactions)
print(f"Generated {len(transactions_df)} transactions")
transactions_df.head(10)

## Generate Customer Interaction Events

In [None]:
interactions = []
event_types = ['website_visit', 'email_open', 'email_click', 'support_ticket', 'product_view', 'cart_add']
# Sampling probability for each entry of event_types (same order).
event_type_weights = [0.35, 0.20, 0.10, 0.05, 0.20, 0.10]

# Range for the number of interaction attempts per segment. Keys are
# lower-case to match how the SEGMENT column is populated; upper-case
# comparisons would send every customer to the fallback range.
segment_interaction_ranges = {
    'high_value': (100, 300),
    'medium_value': (50, 120),
    'low_value': (20, 60),
    'at_risk': (15, 50),
}
# Used for 'churned' and for any label not listed above.
fallback_interaction_range = (5, 20)

for customer in customers_df.itertuples(index=False):
    customer_id = customer.CUSTOMER_ID
    signup_date = customer.SIGNUP_DATE
    # Normalise the label so the lookup does not depend on its casing.
    segment = str(customer.SEGMENT).lower()

    interaction_range = segment_interaction_ranges.get(segment, fallback_interaction_range)
    # int() truncates, so the upper bound of the range is exclusive.
    num_interactions = int(np.random.uniform(*interaction_range))

    for _ in range(num_interactions):
        # Events are spread uniformly over the customer's history window.
        days_offset = random.randint(0, customer.HISTORY_DAYS)
        event_date = signup_date + timedelta(days=days_offset)

        # Defensive guard: never emit an event after the snapshot date.
        if event_date > OBSERVATION_DATE:
            continue

        event_type = np.random.choice(event_types, p=event_type_weights)

        interactions.append({
            # Sequential id across all customers, starting at 1.
            'INTERACTION_ID': len(interactions) + 1,
            'CUSTOMER_ID': customer_id,
            'EVENT_DATE': event_date,
            'EVENT_TYPE': event_type
        })

interactions_df = pd.DataFrame(interactions)
print(f"Generated {len(interactions_df)} customer interactions")
interactions_df.head(10)

## Save Raw Datasets to Snowflake


In [None]:
def _save_raw_table(pandas_df, table_name):
    """Upload a pandas frame to Snowflake under `table_name` and report its row count.

    The table is written in 'overwrite' mode. Returns the Snowpark DataFrame
    created from `pandas_df`.
    """
    snowpark_df = session.create_dataframe(pandas_df)
    snowpark_df.write.mode('overwrite').save_as_table(table_name)
    print(f"✓ Saved {table_name}: {snowpark_df.count()} rows")
    return snowpark_df


# Persist the three raw datasets: profiles, transactions, interactions.
customers_sf_df = _save_raw_table(customers_df, 'CONTINUOUS_CUSTOMERS_PROFILE')
transactions_sf_df = _save_raw_table(transactions_df, 'CONTINUOUS_TRANSACTIONS')
interactions_sf_df = _save_raw_table(interactions_df, 'CONTINUOUS_INTERACTIONS')

print(f"\n✓ All raw datasets saved to {DATABASE}.{SCHEMA}")


## Verify Tables in Snowflake

In [None]:
# Look up the three raw tables and their row counts in INFORMATION_SCHEMA.
tables = session.sql(f"""
    SELECT table_name, row_count 
    FROM {DATABASE}.INFORMATION_SCHEMA.TABLES 
    WHERE table_schema = '{SCHEMA}' 
    AND table_name IN ('CONTINUOUS_CUSTOMERS_PROFILE', 'CONTINUOUS_TRANSACTIONS', 'CONTINUOUS_INTERACTIONS')
    ORDER BY table_name
""").to_pandas()

print("\nCreated raw data tables:")
print(tables)

# Show the first five rows of each table as a quick visual check.
for raw_table_name in (
    'CONTINUOUS_CUSTOMERS_PROFILE',
    'CONTINUOUS_TRANSACTIONS',
    'CONTINUOUS_INTERACTIONS',
):
    print(f"\nSample from {raw_table_name}:")
    session.table(raw_table_name).limit(5).show()


## Summary


In [None]:
# Recap of what was produced, followed by a pointer to the next notebook.
print(
    "This notebook generated raw data for CLV modeling:",
    "  - Customer profiles with demographics and segments",
    "  - Transaction history with product categories and amounts",
    "  - Customer interaction events (web, email, support)",
    "",
    "Next step: Run feature_engineering_continuous.ipynb to derive features using Feature Store",
    sep="\n",
)
