# Generate Continuous Prediction Customer Data

This notebook generates realistic data for customers with established transaction history.

**Continuous Scenario**: Customers with 3+ months of activity and rich behavioral data.

We'll generate:
- 50,000 customers
- 500,000+ raw transactions
- Customer interactions and engagement events
- Then derive RFM and other features from raw data

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F

# Bind to the currently active Snowflake session.
session = get_active_session()

# Seed both the stdlib and the NumPy global generators with the same value so
# the synthetic data is reproducible from a fresh kernel.
for seed_generator in (random.seed, np.random.seed):
    seed_generator(42)

## Configuration

Set your database and schema here:

In [None]:
# Where this notebook reads and writes its tables.
DATABASE = 'ML_DEMO'
SCHEMA = 'PUBLIC'

# Make them the session's current database and schema.
session.use_database(DATABASE)
session.use_schema(SCHEMA)

# Echo the effective context for the reader.
for context_label, context_value in (("database", DATABASE), ("schema", SCHEMA)):
    print(f"Using {context_label}: {context_value}")
print(f"Current warehouse: {session.get_current_warehouse()}")

## Data Generation Parameters

In [None]:
# Number of synthetic customer profiles to generate.
NUM_CUSTOMERS = 50_000
# Snapshot date: no generated activity is dated after this day.
OBSERVATION_DATE = datetime(year=2024, month=6, day=30)
# Bounds, in days before the snapshot, on when a customer signed up.
MIN_HISTORY_DAYS, MAX_HISTORY_DAYS = 90, 540

## Generate Customer Profiles

In [None]:
# Segment mix of the customer base; the weights line up with the names.
customer_segments = ['high_value', 'medium_value', 'low_value', 'at_risk', 'churned']
segment_weights = [0.15, 0.35, 0.30, 0.12, 0.08]

# Demographic lookups, defined once rather than on every loop pass.
age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
age_group_weights = [0.10, 0.30, 0.27, 0.18, 0.10, 0.05]
regions = ['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West']


def _draw_customer_profile(customer_id):
    """Draw one synthetic customer profile.

    The random draws happen in a fixed order (history length, segment,
    age group, region) so the seeded output is stable.

    Args:
        customer_id: Identifier stored in the profile.

    Returns:
        dict with one entry per customers_df column.
    """
    history_days = random.randint(MIN_HISTORY_DAYS, MAX_HISTORY_DAYS)
    segment = np.random.choice(customer_segments, p=segment_weights)
    age_group = np.random.choice(age_groups, p=age_group_weights)
    region = np.random.choice(regions)  # regions are equally likely

    return {
        'CUSTOMER_ID': customer_id,
        # Signup is placed history_days before the observation snapshot.
        'SIGNUP_DATE': OBSERVATION_DATE - timedelta(days=history_days),
        'AGE_GROUP': age_group,
        'REGION': region,
        'SEGMENT': segment,
        'HISTORY_DAYS': history_days,
    }


customers = [
    _draw_customer_profile(customer_id)
    for customer_id in range(1, NUM_CUSTOMERS + 1)
]

customers_df = pd.DataFrame(customers)
print(f"Generated {len(customers_df)} customer profiles")
customers_df.head()

## Generate Raw Transactions

**Why raw transactions**: In real-world scenarios, we don't have pre-computed features. We start with transactional data and must derive meaningful metrics.

In [None]:
# Loop-based transaction generation, one customer at a time.
# NOTE: the `transactions_df` built here is reassigned by the later
# transaction-generation cell in this notebook.
transactions = []

product_categories = ['electronics', 'clothing', 'home_goods', 'beauty', 'sports', 'books', 'toys', 'grocery']

# Per-segment settings:
#   (transaction-count range, average-amount range, mean days between purchases)
segment_profiles = {
    'high_value': ((20, 60), (100, 300), 7),
    'medium_value': ((8, 25), (50, 120), 15),
    'low_value': ((3, 10), (20, 60), 30),
    'at_risk': ((5, 15), (40, 100), 45),
}
# Used for every segment not listed above (i.e. 'churned').
fallback_profile = ((1, 5), (25, 70), 60)

next_transaction_id = 1

for _, customer in customers_df.iterrows():
    count_range, amount_range, purchase_frequency_days = segment_profiles.get(
        customer['SEGMENT'], fallback_profile
    )
    # Draw order matters for reproducibility: count first, then average amount.
    num_transactions = int(np.random.uniform(*count_range))
    avg_amount = np.random.uniform(*amount_range)

    # The date walk starts within a week of signup.
    current_date = customer['SIGNUP_DATE'] + timedelta(days=random.randint(0, 7))

    for _txn in range(num_transactions):
        # Exponential gaps, floored at one day so dates strictly increase.
        gap_days = int(np.random.exponential(purchase_frequency_days))
        current_date = current_date + timedelta(days=max(1, gap_days))

        # Nothing may be dated after the snapshot; stop this customer here.
        if current_date > OBSERVATION_DATE:
            break

        # Gamma-distributed order value, never below 5.
        amount = round(max(5, np.random.gamma(shape=2, scale=avg_amount/2)), 2)
        category = np.random.choice(product_categories)
        quantity = int(np.random.poisson(lam=2) + 1)

        transactions.append({
            'TRANSACTION_ID': next_transaction_id,
            'CUSTOMER_ID': customer['CUSTOMER_ID'],
            'TRANSACTION_DATE': current_date,
            'AMOUNT': amount,
            'PRODUCT_CATEGORY': category,
            'QUANTITY': quantity
        })
        next_transaction_id += 1

transactions_df = pd.DataFrame(transactions)
print(f"Generated {len(transactions_df)} transactions")
transactions_df.head(10)

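## Generate Raw Transactions (Vectorized)

This cell rebuilds `transactions_df`, drawing amounts, categories, and quantities in bulk with NumPy; its result replaces the one produced by the loop-based cell above.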

In [None]:
# Transaction generation. Purchase dates are walked per customer (each date
# depends on the previous one); amounts, categories and quantities are then
# drawn in bulk with NumPy.
print("Generating transactions (vectorized)...")

# Per-segment settings: planned transaction-count range, range for the
# customer's average order amount, and mean days between purchases.
segment_txn_params = {
    'high_value': {'min_txns': 20, 'max_txns': 60, 'min_amt': 100, 'max_amt': 300, 'freq_days': 7},
    'medium_value': {'min_txns': 8, 'max_txns': 25, 'min_amt': 50, 'max_amt': 120, 'freq_days': 15},
    'low_value': {'min_txns': 3, 'max_txns': 10, 'min_amt': 20, 'max_amt': 60, 'freq_days': 30},
    'at_risk': {'min_txns': 5, 'max_txns': 15, 'min_amt': 40, 'max_amt': 100, 'freq_days': 45},
    'churned': {'min_txns': 1, 'max_txns': 5, 'min_amt': 25, 'max_amt': 70, 'freq_days': 60}
}

# Draw each customer's plan. `num_txns` is the PLANNED count; fewer
# transactions are realised when the date walk reaches OBSERVATION_DATE.
# NOTE(review): these helper columns stay on customers_df and therefore flow
# into everything later built from it - confirm that is intended.
customers_df['num_txns'] = customers_df['SEGMENT'].apply(
    lambda s: int(np.random.uniform(segment_txn_params[s]['min_txns'],
                                     segment_txn_params[s]['max_txns']))
)
customers_df['avg_amount'] = customers_df['SEGMENT'].apply(
    lambda s: np.random.uniform(segment_txn_params[s]['min_amt'],
                                 segment_txn_params[s]['max_amt'])
)
customers_df['freq_days'] = customers_df['SEGMENT'].apply(
    lambda s: segment_txn_params[s]['freq_days']
)

# Walk purchase dates per customer and record how many transactions each
# customer actually received.
transaction_dates = []
txns_per_customer = []
for planned_txns, signup, frequency in zip(customers_df['num_txns'],
                                           customers_df['SIGNUP_DATE'],
                                           customers_df['freq_days']):
    # Start shortly after signup
    current_date = signup + timedelta(days=random.randint(0, 7))
    realised = 0

    for _ in range(planned_txns):
        # Exponential gaps, floored at one day so dates strictly increase.
        days_since_last = int(np.random.exponential(frequency))
        current_date = current_date + timedelta(days=max(1, days_since_last))
        if current_date > OBSERVATION_DATE:
            break
        transaction_dates.append(current_date)
        realised += 1

    txns_per_customer.append(realised)

# Expand customer-level values to transaction level using the REALISED counts.
# Expanding by the planned counts and truncating afterwards would shift every
# transaction after the first early stop onto the wrong customer.
txn_counts = np.asarray(txns_per_customer, dtype=np.int64)
actual_txn_count = len(transaction_dates)
customer_ids = np.repeat(customers_df['CUSTOMER_ID'].values, txn_counts)
avg_amounts = np.repeat(customers_df['avg_amount'].values, txn_counts)
assert len(customer_ids) == actual_txn_count, "customer ids and dates are misaligned"

# Vectorized amount generation using gamma distribution (never below 5)
amounts = np.maximum(5, np.random.gamma(shape=2, scale=avg_amounts/2, size=actual_txn_count))
amounts = np.round(amounts, 2)

# Vectorized category and quantity generation
product_categories = ['electronics', 'clothing', 'home_goods', 'beauty', 'sports', 'books', 'toys', 'grocery']
categories = np.random.choice(product_categories, size=actual_txn_count)
quantities = np.random.poisson(lam=2, size=actual_txn_count) + 1

# Create transactions dataframe
transactions_df = pd.DataFrame({
    'TRANSACTION_ID': range(1, actual_txn_count + 1),
    'CUSTOMER_ID': customer_ids,
    'TRANSACTION_DATE': transaction_dates,
    'AMOUNT': amounts,
    'PRODUCT_CATEGORY': categories,
    'QUANTITY': quantities
})

print(f"Generated {len(transactions_df)} transactions")
transactions_df.head(10)

## Derive RFM Features from Raw Transactions

**RFM Metrics Explained**:
- **Recency**: Days since last purchase - Recent customers are more likely to purchase again
- **Frequency**: Number of purchases - Frequent buyers show loyalty and habit
- **Monetary**: Total/average spend - High spenders have higher lifetime value potential

These three metrics are fundamental because they capture:
1. Current engagement (Recency)
2. Behavioral patterns (Frequency)
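3. Economic value (Monetary)

### Generate Customer Interaction Events

The next cell generates the raw interaction events (website visits, email opens and clicks, support tickets, product views, cart adds) from which the engagement features are derived.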

In [None]:
# Interaction events: draw a per-customer event count, then generate every
# event's date and type in bulk.
print("Generating customer interactions (vectorized)...")

# Range of interaction counts for each segment.
segment_interaction_params = {
    'high_value': {'min_interactions': 100, 'max_interactions': 300},
    'medium_value': {'min_interactions': 50, 'max_interactions': 120},
    'low_value': {'min_interactions': 20, 'max_interactions': 60},
    'at_risk': {'min_interactions': 15, 'max_interactions': 50},
    'churned': {'min_interactions': 5, 'max_interactions': 20}
}


def _draw_interaction_count(segment):
    """Draw how many interaction events a customer of `segment` receives."""
    bounds = segment_interaction_params[segment]
    return int(np.random.uniform(bounds['min_interactions'], bounds['max_interactions']))


customers_df['num_interactions'] = customers_df['SEGMENT'].apply(_draw_interaction_count)

# Repeat each customer-level value once per planned interaction.
interactions_per_customer = customers_df['num_interactions'].to_numpy()
total_interactions = interactions_per_customer.sum()
customer_ids_int = np.repeat(customers_df['CUSTOMER_ID'].to_numpy(), interactions_per_customer)
signup_dates_int = np.repeat(customers_df['SIGNUP_DATE'].to_numpy(), interactions_per_customer)
history_days_int = np.repeat(customers_df['HISTORY_DAYS'].to_numpy(), interactions_per_customer)

# Each event lands a random number of days (0..HISTORY_DAYS) after signup.
days_offsets = np.random.randint(0, history_days_int + 1, size=total_interactions)
event_dates = pd.Series(signup_dates_int) + pd.to_timedelta(days_offsets, unit='D')

# Keep only events dated on or before the observation snapshot.
valid_mask = event_dates <= pd.Timestamp(OBSERVATION_DATE)
customer_ids_int = customer_ids_int[valid_mask]
event_dates = event_dates[valid_mask].tolist()
actual_interaction_count = len(event_dates)

# Event types and their sampling probabilities (same order).
event_types = ['website_visit', 'email_open', 'email_click', 'support_ticket', 'product_view', 'cart_add']
event_probs = [0.35, 0.20, 0.10, 0.05, 0.20, 0.10]
selected_events = np.random.choice(event_types, size=actual_interaction_count, p=event_probs)

# Assemble the interaction table with sequential ids starting at 1.
interactions_df = pd.DataFrame({
    'INTERACTION_ID': range(1, actual_interaction_count + 1),
    'CUSTOMER_ID': customer_ids_int,
    'EVENT_DATE': event_dates,
    'EVENT_TYPE': selected_events
})

print(f"Generated {len(interactions_df)} customer interactions")
interactions_df.head(10)

## Derive Purchase Pattern Features

**Why these matter**:
- **Inter-purchase time**: Identifies purchase rhythm and consistency
- **Product diversity**: Customers who buy across categories tend to have higher engagement
- **Average order value**: Indicates spending capacity per transaction
- **Trend indicators**: Growing vs declining purchase patterns predict future behavior

In [None]:
# Output schema of purchase_patterns_df (kept even when there are no customers).
purchase_pattern_columns = [
    'customer_id',
    'avg_inter_purchase_days',
    'std_inter_purchase_days',
    'unique_categories_purchased',
    'total_items_purchased',
    'recent_30d_amount',
    'recent_30d_count',
    'recent_90d_amount',
    'recent_90d_count',
    'spending_trend',
]


def _purchase_pattern_features(customer_id, cust_txns):
    """Summarise one customer's purchase behaviour.

    Args:
        customer_id: Identifier copied into the output row.
        cust_txns: That customer's rows of the transaction table (may be
            empty), with TRANSACTION_DATE already converted to datetimes.

    Returns:
        dict keyed by the names in `purchase_pattern_columns`.
    """
    # Stable sort so the chronological order is deterministic.
    cust_txns = cust_txns.sort_values('TRANSACTION_DATE', kind='mergesort')
    txn_count = len(cust_txns)

    # Gaps between consecutive purchases need at least two transactions.
    avg_inter_purchase_days = None
    std_inter_purchase_days = None
    if txn_count > 1:
        inter_purchase_times = cust_txns['TRANSACTION_DATE'].diff().dt.days.dropna()
        avg_inter_purchase_days = inter_purchase_times.mean()
        std_inter_purchase_days = inter_purchase_times.std()

    def _recent_window(days):
        """Return (total amount, transaction count) for the last `days` days."""
        cutoff = OBSERVATION_DATE - timedelta(days=days)
        recent_amounts = cust_txns.loc[cust_txns['TRANSACTION_DATE'] >= cutoff, 'AMOUNT']
        return recent_amounts.sum(), len(recent_amounts)

    recent_30d_amount, recent_30d_count = _recent_window(30)
    recent_90d_amount, recent_90d_count = _recent_window(90)

    # Relative change in average order value between the earlier and the
    # later half of the history; only computed with at least four purchases.
    spending_trend = 0
    if txn_count >= 4:
        mid_point = txn_count // 2
        first_half_avg = cust_txns['AMOUNT'].iloc[:mid_point].mean()
        second_half_avg = cust_txns['AMOUNT'].iloc[mid_point:].mean()
        if first_half_avg > 0:
            spending_trend = (second_half_avg - first_half_avg) / first_half_avg

    return {
        'customer_id': customer_id,
        'avg_inter_purchase_days': avg_inter_purchase_days,
        'std_inter_purchase_days': std_inter_purchase_days,
        'unique_categories_purchased': cust_txns['PRODUCT_CATEGORY'].nunique(),
        'total_items_purchased': cust_txns['QUANTITY'].sum(),
        'recent_30d_amount': recent_30d_amount,
        'recent_30d_count': recent_30d_count,
        'recent_90d_amount': recent_90d_amount,
        'recent_90d_count': recent_90d_count,
        'spending_trend': spending_trend
    }


# The raw frames use upper-case column names (CUSTOMER_ID, TRANSACTION_DATE, ...).
txns_for_patterns = transactions_df.assign(
    TRANSACTION_DATE=pd.to_datetime(transactions_df['TRANSACTION_DATE'])
)

# customer id -> positional row indices, built in one pass so each customer's
# rows are looked up directly instead of re-scanning the whole table.
rows_by_customer = txns_for_patterns.groupby('CUSTOMER_ID').indices
no_rows = np.array([], dtype=np.intp)  # customers without any transaction

purchase_patterns = [
    _purchase_pattern_features(
        customer_id,
        txns_for_patterns.iloc[rows_by_customer.get(customer_id, no_rows)],
    )
    for customer_id in customers_df['CUSTOMER_ID']
]

purchase_patterns_df = pd.DataFrame(purchase_patterns, columns=purchase_pattern_columns)
print("Purchase pattern features:")
purchase_patterns_df.head(10)

## Derive Engagement Features from Interactions

**Why these matter**: Engagement beyond purchases indicates interest and intent. High engagement without recent purchases may signal opportunity or friction.

In [None]:
# Output column for each raw EVENT_TYPE value, in output order.
event_count_columns = {
    'website_visit': 'website_visits',
    'email_open': 'email_opens',
    'email_click': 'email_clicks',
    'support_ticket': 'support_tickets',
    'product_view': 'product_views',
    'cart_add': 'cart_adds',
}

# The interaction table uses upper-case column names.
total_interactions = (
    interactions_df
    .groupby('CUSTOMER_ID')['INTERACTION_ID']
    .count()
    .rename('total_interactions')
)

# One count per (customer, event type), pivoted to one column per event type.
# reindex guarantees every expected column exists even if a type never occurs.
event_counts = (
    interactions_df
    .groupby(['CUSTOMER_ID', 'EVENT_TYPE'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=list(event_count_columns), fill_value=0)
    .rename(columns=event_count_columns)
    .rename_axis(columns=None)
)

# The key is exposed as lower-case `customer_id`, like the other derived
# feature frames.
engagement_features = (
    pd.concat([total_interactions, event_counts], axis=1)
    .reset_index()
    .rename(columns={'CUSTOMER_ID': 'customer_id'})
)

# Clicks per open; customers with no opens get 0 instead of a division by zero.
engagement_features['email_engagement_rate'] = (
    engagement_features['email_clicks'] / engagement_features['email_opens'].replace(0, np.nan)
).fillna(0)

print("Engagement features derived from interactions:")
engagement_features.head(10)

## Combine All Features into Final Dataset

In [None]:
# No earlier cell builds `rfm_features`, so derive it here from the raw
# transactions:
#   recency_days   - days between the last purchase and OBSERVATION_DATE
#   frequency      - number of purchases
#   monetary_total - total spend
#   monetary_avg   - average spend per purchase
rfm_features = (
    transactions_df
    .assign(TRANSACTION_DATE=pd.to_datetime(transactions_df['TRANSACTION_DATE']))
    .groupby('CUSTOMER_ID')
    .agg(
        last_purchase_date=('TRANSACTION_DATE', 'max'),
        frequency=('TRANSACTION_ID', 'count'),
        monetary_total=('AMOUNT', 'sum'),
        monetary_avg=('AMOUNT', 'mean'),
    )
    .reset_index()
)
rfm_features['recency_days'] = (
    pd.Timestamp(OBSERVATION_DATE) - rfm_features['last_purchase_date']
).dt.days
rfm_features = rfm_features.drop(columns='last_purchase_date')

# customers_df is keyed by CUSTOMER_ID while the derived feature frames use
# customer_id; normalise the key before each join. validate= fails loudly if
# a feature frame ever holds more than one row per customer.
final_df = customers_df
for feature_frame in (rfm_features, purchase_patterns_df, engagement_features):
    final_df = final_df.merge(
        feature_frame.rename(columns={'customer_id': 'CUSTOMER_ID'}),
        on='CUSTOMER_ID',
        how='left',
        validate='one_to_one',
    )

# Customers with no purchases have no "last purchase": use days since signup
# for their recency, because 0 would read as "purchased on the snapshot day".
final_df['recency_days'] = final_df['recency_days'].fillna(final_df['HISTORY_DAYS'])
# Every other missing feature means "no activity", i.e. 0.
final_df = final_df.fillna(0)

print(f"Final dataset shape: {final_df.shape}")
final_df.head()

## Generate Target Variable: Forward-Looking 12-Month CLV

**Why this is the label**: We predict future value, not historical. The model learns patterns that indicate future spending behavior.

In [None]:
def calculate_future_ltv(row):
    """Simulate a customer's forward-looking 12-month lifetime value.

    Starts from 60% of historical spend, scales it by recency, frequency,
    engagement and spending-trend multipliers, then applies uniform random
    noise in [0.7, 1.3).

    Args:
        row: Mapping with 'monetary_total', 'recency_days', 'frequency',
            'total_interactions' and 'spending_trend'.

    Returns:
        The simulated value, floored at 0 and rounded to 2 decimals.
    """
    projected = row['monetary_total'] * 0.6

    multipliers = (
        # Recency: 1.5 for a purchase today, decaying to a floor of 0.5.
        max(0.5, 1.5 - (row['recency_days'] / 180)),
        # Frequency: grows with purchase count, capped at 2.0.
        min(2.0, 1 + (row['frequency'] / 20)),
        # Engagement: uncapped boost from interaction volume.
        1 + (row['total_interactions'] / 500),
        # Spending trend: clamped to [0.5, 2.0].
        max(0.5, min(2.0, 1 + row['spending_trend'])),
    )
    for multiplier in multipliers:
        projected = projected * multiplier

    # Multiplicative noise so the label is not a deterministic function
    # of the features.
    projected = projected * np.random.uniform(0.7, 1.3)

    return round(max(0, projected), 2)

# Simulate the label row by row (each call draws its own random noise).
final_df['future_12m_ltv'] = final_df.apply(calculate_future_ltv, axis=1)

# Headline statistics of the simulated label.
simulated_ltv = final_df['future_12m_ltv']
for stat_label, stat_value in (("Average", simulated_ltv.mean()), ("Median", simulated_ltv.median())):
    print(f"{stat_label} future 12-month LTV: ${stat_value:.2f}")

## Data Quality Checks

## Save All Datasets to Snowflake

# Upper-case every frame's column names before saving to Snowflake.
# NOTE: this renames the in-memory frames too, so earlier cells that address
# lower-case feature columns cannot be re-run after this one.
for frame in (customers_df, transactions_df, interactions_df, final_df):
    frame.columns = frame.columns.str.upper()


def _overwrite_table(frame, table_name):
    """Upload `frame` to `table_name` in overwrite mode and report the row count.

    Args:
        frame: pandas DataFrame to upload.
        table_name: Name of the target table.

    Returns:
        The Snowpark DataFrame created from `frame`.
    """
    snowpark_frame = session.create_dataframe(frame)
    snowpark_frame.write.mode('overwrite').save_as_table(table_name)
    print(f"✓ Saved {table_name}: {snowpark_frame.count()} rows")
    return snowpark_frame


# Customer profiles, raw transactions, raw interactions, final feature set.
customers_sf_df = _overwrite_table(customers_df, 'CONTINUOUS_CUSTOMERS_PROFILE')
transactions_sf_df = _overwrite_table(transactions_df, 'CONTINUOUS_TRANSACTIONS')
interactions_sf_df = _overwrite_table(interactions_df, 'CONTINUOUS_INTERACTIONS')
features_sf_df = _overwrite_table(final_df, 'CONTINUOUS_CUSTOMERS_FEATURES')

print(f"\n✓ All datasets saved to {DATABASE}.{SCHEMA}")

## Verify Tables in Snowflake

In [None]:
# List the CONTINUOUS* tables in the configured schema together with the row
# counts reported by INFORMATION_SCHEMA.
table_listing_sql = f"""
    SELECT table_name, row_count 
    FROM {DATABASE}.INFORMATION_SCHEMA.TABLES 
    WHERE table_schema = '{SCHEMA}' 
    AND table_name LIKE 'CONTINUOUS%'
    ORDER BY table_name
"""
tables = session.sql(table_listing_sql).to_pandas()

print("\nCreated tables:")
print(tables)

# Preview the first rows of the final feature table.
features_table = 'CONTINUOUS_CUSTOMERS_FEATURES'
print(f"\nSample data from {features_table}:")
session.table(features_table).limit(10).show()