# Generate Continuous Prediction Customer Data

This notebook generates realistic raw data for customers with established transaction history.

**Continuous Scenario**: Customers with 3+ months of activity and rich behavioral data.

We'll generate and save to Snowflake:
- 50,000 customers
- 500,000+ raw transactions
- Customer interactions and engagement events

Feature engineering (RFM — recency, frequency, and monetary value — plus behavioral patterns, etc.) will be done in a separate notebook.

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F

# Pick up the Snowpark session that is already active instead of opening a new connection.
session = get_active_session()

# Seed both global generators: the cells below draw from `random` and from `np.random`.
random.seed(42)
np.random.seed(42)

## Configuration

Set your database and schema here:

In [None]:
# Target namespace for the tables this notebook writes; edit these two values to retarget it.
DATABASE = 'ML_DEMO'
SCHEMA = 'PUBLIC'

# Make the namespace the session default so the unqualified table names used
# when saving resolve inside it.
session.use_database(DATABASE)
session.use_schema(SCHEMA)

# Echo the active context as a quick sanity check before any data is written.
print(f"Using database: {DATABASE}")
print(f"Using schema: {SCHEMA}")
print(f"Current warehouse: {session.get_current_warehouse()}")

## Data Generation Parameters

In [None]:
# Number of synthetic customer profiles to generate (customer ids run 1..NUM_CUSTOMERS).
NUM_CUSTOMERS = 50000
# End of the observation window: signup dates are counted back from this date and
# generated transactions that would fall after it are dropped.
OBSERVATION_DATE = datetime(2024, 6, 30)
# Inclusive bounds, in days, for how long before OBSERVATION_DATE a customer signed up.
MIN_HISTORY_DAYS = 90
MAX_HISTORY_DAYS = 540

## Generate Customer Profiles

In [None]:
# Categorical vocabularies and their sampling weights (regions are sampled uniformly).
customer_segments = ['high_value', 'medium_value', 'low_value', 'at_risk', 'churned']
segment_weights = [0.15, 0.35, 0.30, 0.12, 0.08]
age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
age_group_weights = [0.10, 0.30, 0.27, 0.18, 0.10, 0.05]
regions = ['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West']


def _build_profile(customer_id):
    """Draw one synthetic customer profile.

    The random draws happen in a fixed order (history length, segment,
    age group, region) so the seeded output stays reproducible.
    """
    history_days = random.randint(MIN_HISTORY_DAYS, MAX_HISTORY_DAYS)
    segment = np.random.choice(customer_segments, p=segment_weights)
    age_group = np.random.choice(age_groups, p=age_group_weights)
    region = np.random.choice(regions)
    return {
        'customer_id': customer_id,
        # Signup is placed `history_days` before the observation date.
        'signup_date': OBSERVATION_DATE - timedelta(days=history_days),
        'age_group': age_group,
        'region': region,
        'segment': segment,
        'history_days': history_days,
    }


customers = [_build_profile(cid) for cid in range(1, NUM_CUSTOMERS + 1)]

customers_df = pd.DataFrame(customers)
print(f"Generated {len(customers_df)} customer profiles")
customers_df.head()

## Generate Raw Transactions (Vectorized)

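**Why raw transactions**: In real-world scenarios, we don't have pre-computed features. We start with transactional data and must derive meaningful metrics.

**Note on vectorization**: amounts, categories, and quantities are drawn in bulk with NumPy; transaction dates are still built in a per-customer loop because each date depends on the previous one.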

In [None]:
print("Generating transactions (vectorized)...")

# Per-segment generation knobs: range of the planned transaction count, range of
# the customer's mean transaction amount, and mean gap (days) between purchases.
segment_txn_params = {
    'high_value': {'min_txns': 20, 'max_txns': 60, 'min_amt': 100, 'max_amt': 300, 'freq_days': 7},
    'medium_value': {'min_txns': 8, 'max_txns': 25, 'min_amt': 50, 'max_amt': 120, 'freq_days': 15},
    'low_value': {'min_txns': 3, 'max_txns': 10, 'min_amt': 20, 'max_amt': 60, 'freq_days': 30},
    'at_risk': {'min_txns': 5, 'max_txns': 15, 'min_amt': 40, 'max_amt': 100, 'freq_days': 45},
    'churned': {'min_txns': 1, 'max_txns': 5, 'min_amt': 25, 'max_amt': 70, 'freq_days': 60}
}

# `num_txns` is the *planned* count; the actual number can be lower because
# generation stops once a date passes OBSERVATION_DATE. int() truncates the
# uniform draw, so `max_txns` itself is never produced.
customers_df['num_txns'] = customers_df['segment'].apply(
    lambda s: int(np.random.uniform(segment_txn_params[s]['min_txns'],
                                     segment_txn_params[s]['max_txns']))
)
customers_df['avg_amount'] = customers_df['segment'].apply(
    lambda s: np.random.uniform(segment_txn_params[s]['min_amt'],
                                 segment_txn_params[s]['max_amt'])
)
customers_df['freq_days'] = customers_df['segment'].apply(
    lambda s: segment_txn_params[s]['freq_days']
)

# Dates are built sequentially per customer because each one depends on the
# previous one. Record how many dates every customer actually received so the
# per-transaction arrays below can be aligned with `transaction_dates`.
transaction_dates = []
txns_per_customer = []
for signup, planned_txns, frequency in zip(
    customers_df['signup_date'], customers_df['num_txns'], customers_df['freq_days']
):
    # First purchase happens within the first week after signup (plus one gap).
    current_date = signup + timedelta(days=random.randint(0, 7))
    generated = 0

    for _ in range(planned_txns):
        days_since_last = int(np.random.exponential(frequency))
        # Force at least one day between consecutive transactions.
        current_date = current_date + timedelta(days=max(1, days_since_last))
        if current_date > OBSERVATION_DATE:
            break
        transaction_dates.append(current_date)
        generated += 1

    txns_per_customer.append(generated)

txns_per_customer = np.asarray(txns_per_customer, dtype=np.int64)
actual_txn_count = len(transaction_dates)

# Repeat by the ACTUAL per-customer counts. Repeating by the planned counts and
# then truncating to `actual_txn_count` shifts every later customer's dates onto
# the wrong customer id / amount scale whenever a customer stopped early.
customer_ids = np.repeat(customers_df['customer_id'].to_numpy(), txns_per_customer)
avg_amounts = np.repeat(customers_df['avg_amount'].to_numpy(), txns_per_customer)
assert len(customer_ids) == actual_txn_count, "customer ids and transaction dates are misaligned"

# Gamma(shape=2, scale=avg/2) has mean `avg`; amounts are floored at 5 and rounded to cents.
amounts = np.maximum(5, np.random.gamma(shape=2, scale=avg_amounts/2, size=actual_txn_count))
amounts = np.round(amounts, 2)

product_categories = ['electronics', 'clothing', 'home_goods', 'beauty', 'sports', 'books', 'toys', 'grocery']
categories = np.random.choice(product_categories, size=actual_txn_count)
# Poisson(2) + 1 guarantees a quantity of at least one.
quantities = np.random.poisson(lam=2, size=actual_txn_count) + 1

transactions_df = pd.DataFrame({
    'transaction_id': range(1, actual_txn_count + 1),
    'customer_id': customer_ids,
    'transaction_date': transaction_dates,
    'amount': amounts,
    'product_category': categories,
    'quantity': quantities
})

print(f"Generated {len(transactions_df)} transactions")
transactions_df.head(10)

## Generate Customer Interaction Events (Vectorized)

In [None]:
print("Generating customer interactions (vectorized)...")

# Range of the per-customer interaction count for each segment.
segment_interaction_params = {
    'high_value': {'min_interactions': 100, 'max_interactions': 300},
    'medium_value': {'min_interactions': 50, 'max_interactions': 120},
    'low_value': {'min_interactions': 20, 'max_interactions': 60},
    'at_risk': {'min_interactions': 15, 'max_interactions': 50},
    'churned': {'min_interactions': 5, 'max_interactions': 20}
}


def _draw_interaction_count(segment):
    """Draw a truncated uniform interaction count from the segment's range."""
    bounds = segment_interaction_params[segment]
    return int(np.random.uniform(bounds['min_interactions'], bounds['max_interactions']))


customers_df['num_interactions'] = customers_df['segment'].apply(_draw_interaction_count)

# Expand the per-customer columns to one entry per interaction.
total_interactions = customers_df['num_interactions'].sum()
repeat_counts = customers_df['num_interactions'].values
customer_ids_int, signup_dates_int, history_days_int = (
    np.repeat(customers_df[column].values, repeat_counts)
    for column in ('customer_id', 'signup_date', 'history_days')
)

# Each event lands a random number of days (0..history_days inclusive) after signup.
days_offsets = np.random.randint(0, history_days_int + 1, size=total_interactions)
event_dates = pd.Series(signup_dates_int) + pd.to_timedelta(days_offsets, unit='D')

# Keep only events that fall on or before the observation date.
valid_mask = event_dates <= pd.Timestamp(OBSERVATION_DATE)
customer_ids_int = customer_ids_int[valid_mask]
event_dates = event_dates[valid_mask].tolist()
actual_interaction_count = len(event_dates)

# Event mix: the probabilities line up positionally with the event types.
event_types = ['website_visit', 'email_open', 'email_click', 'support_ticket', 'product_view', 'cart_add']
event_probs = [0.35, 0.20, 0.10, 0.05, 0.20, 0.10]
selected_events = np.random.choice(event_types, size=actual_interaction_count, p=event_probs)

interactions_df = pd.DataFrame({
    'interaction_id': range(1, actual_interaction_count + 1),
    'customer_id': customer_ids_int,
    'event_date': event_dates,
    'event_type': selected_events
})

print(f"Generated {len(interactions_df)} customer interactions")
interactions_df.head(10)

## Save Raw Data to Snowflake

In [None]:
def _save_table(pandas_df, table_name):
    """Upload `pandas_df` to Snowflake as `table_name`, replacing any existing table.

    Column names are upper-cased on a renamed copy, so the caller's DataFrame
    keeps its lower-case columns and the generation cells above can be re-run
    after saving. Returns the Snowpark DataFrame that was written.
    """
    upper_df = pandas_df.rename(columns=str.upper)
    sf_df = session.create_dataframe(upper_df)
    sf_df.write.mode('overwrite').save_as_table(table_name)
    print(f"✓ Saved {table_name}: {sf_df.count()} rows")
    return sf_df


customers_sf_df = _save_table(customers_df, 'CONTINUOUS_CUSTOMERS_PROFILE')
transactions_sf_df = _save_table(transactions_df, 'CONTINUOUS_TRANSACTIONS')
interactions_sf_df = _save_table(interactions_df, 'CONTINUOUS_INTERACTIONS')

print(f"\n✓ All raw data tables saved to {DATABASE}.{SCHEMA}")

## Verify Tables in Snowflake

In [None]:
# List the tables this notebook wrote, with the row counts Snowflake reports for them.
table_listing_sql = f"""
    SELECT table_name, row_count 
    FROM {DATABASE}.INFORMATION_SCHEMA.TABLES 
    WHERE table_schema = '{SCHEMA}' 
    AND table_name LIKE 'CONTINUOUS%'
    ORDER BY table_name
"""
tables = session.sql(table_listing_sql).to_pandas()

print("\nCreated tables:")
print(tables)

# Preview the first rows of each saved table.
for saved_table in (
    'CONTINUOUS_CUSTOMERS_PROFILE',
    'CONTINUOUS_TRANSACTIONS',
    'CONTINUOUS_INTERACTIONS',
):
    print(f"\nSample data from {saved_table}:")
    session.table(saved_table).limit(10).show()