In [None]:
# 📄 Customer Churn Dataset Generator

Generates synthetic customer data for a hosting/domain service business with churn labels driven by configurable behavioral patterns.

## Features
- Pattern-based churn logic (rule-driven rather than purely random; 10% of labels are flipped as noise)
- Realistic data distributions
- Configurable parameters
- Fast generation (<2s for 100 records)


In [None]:
# Install dependencies if needed
# !pip install pandas numpy faker

import pandas as pd
import numpy as np
from faker import Faker
import uuid
from datetime import datetime, timedelta
import warnings
import os
warnings.filterwarnings('ignore')

print("✅ Dependencies loaded successfully!")


In [None]:
## 🔧 Configuration

Modify these parameters to customize your dataset:


In [None]:
# Configuration
# These module-level names are read by the cells below; edit them here and re-run the notebook from this cell.
N_CUSTOMERS = 100  # Number of customer records
RANDOM_SEED = 42   # Set to None for random results, or integer for reproducible results
OUTPUT_FILE = 'churn_dataset_realistic.csv'  # Path the generated CSV is written to

# Set seeds for reproducibility
# Seeds NumPy's global RNG (used by every np.random.* call) and Faker's shared generator.
# Seeding happens before Faker() is instantiated below.
if RANDOM_SEED is not None:
    np.random.seed(RANDOM_SEED)
    Faker.seed(RANDOM_SEED)

# Single Faker instance shared by the data generation functions (referenced there as the global `fake`).
fake = Faker()
print(f"📊 Configured to generate {N_CUSTOMERS} records with seed={RANDOM_SEED}")


In [None]:
## 🎲 Data Generation Functions


In [None]:
def generate_basic_info(n_customers):
    """Generate basic customer identity fields.

    Uses the notebook-level ``fake`` Faker instance for every field so that
    the output is governed by the configured seed.

    Args:
        n_customers: Number of customer records to create.

    Returns:
        dict with keys ``customer_id``, ``customer_name`` and
        ``customer_email`` (in that order), each a list of ``n_customers``
        strings.
    """
    # Names and emails are drawn before the IDs so that adding the ID draws
    # does not shift the Faker values these two columns received previously.
    names = [fake.name() for _ in range(n_customers)]
    emails = [fake.email() for _ in range(n_customers)]

    # uuid.uuid4() reads OS entropy and cannot be seeded; Faker's uuid4()
    # draws from Faker's own generator, which Faker.seed() controls.
    customer_ids = [fake.uuid4() for _ in range(n_customers)]

    return {
        'customer_id': customer_ids,
        'customer_name': names,
        'customer_email': emails
    }

def generate_support_metrics(n_customers):
    """Generate support-related metrics."""
    support_tickets = np.random.poisson(lam=2, size=n_customers)
    support_tickets = np.clip(support_tickets, 0, 10)
    
    # Resolution time correlates with ticket volume
    base_resolution = np.random.exponential(scale=24, size=n_customers)
    ticket_multiplier = 1 + (support_tickets / 10)
    avg_resolution_time = base_resolution * ticket_multiplier
    avg_resolution_time = np.clip(avg_resolution_time, 1, 200)
    
    # Critical tickets are subset of total tickets
    critical_tickets_sla_breach = np.random.binomial(
        n=np.minimum(support_tickets, 4), 
        p=0.3, 
        size=n_customers
    )
    
    return {
        'support_tickets': support_tickets,
        'avg_resolution_time': np.round(avg_resolution_time, 2),
        'critical_tickets_sla_breach': critical_tickets_sla_breach
    }

def generate_product_metrics(n_customers):
    """Generate product and subscription metrics."""
    # Tenure follows realistic distribution
    tenure_months = np.random.gamma(shape=2, scale=8, size=n_customers)
    tenure_months = np.clip(tenure_months, 1, 48).astype(int)
    
    # Product renewals depend on tenure
    renewal_probability = np.minimum(tenure_months / 24, 0.8)
    product_renewals = np.random.binomial(n=5, p=renewal_probability)
    
    # Monthly spend follows log-normal distribution
    monthly_spend = np.random.lognormal(mean=3.5, sigma=0.8, size=n_customers)
    monthly_spend = np.clip(monthly_spend, 5, 150)
    
    # Total products correlates with spend
    spend_tier = np.digitize(monthly_spend, bins=[0, 25, 50, 100, 150])
    total_products = np.random.poisson(lam=spend_tier, size=n_customers)
    total_products = np.clip(total_products, 1, 10)
    
    # Products transferred out (churn indicator)
    transfer_probability = np.random.beta(a=1, b=9, size=n_customers)
    products_transferred_out = np.random.binomial(
        n=np.minimum(total_products, 3), 
        p=transfer_probability
    )
    
    return {
        'product_renewals': product_renewals,
        'tenure_months': tenure_months,
        'monthly_spend': np.round(monthly_spend, 2),
        'total_products': total_products,
        'products_transferred_out': products_transferred_out
    }

def generate_performance_metrics(n_customers):
    """Generate system performance metrics."""
    # Load time follows log-normal (most fast, some very slow)
    avg_load_time = np.random.lognormal(mean=1.2, sigma=0.8, size=n_customers)
    avg_load_time = np.clip(avg_load_time, 0.5, 15)
    
    # Downtime follows exponential distribution
    downtime_minutes = np.random.exponential(scale=30, size=n_customers)
    downtime_minutes = np.clip(downtime_minutes, 0, 300).astype(int)
    
    # Product usage - beta distribution creates realistic usage patterns
    product_usage_percent = np.random.beta(a=2, b=2, size=n_customers) * 100
    product_usage_percent = np.clip(product_usage_percent, 0, 100)
    
    return {
        'avg_load_time': np.round(avg_load_time, 2),
        'downtime_minutes': downtime_minutes,
        'product_usage_percent': np.round(product_usage_percent, 1)
    }

def generate_last_login(tenure_months, today=None):
    """Generate last-login dates based on tenure.

    Args:
        tenure_months: Iterable of tenures in months, one per customer.
        today: Reference datetime the logins are counted back from. Defaults
            to ``datetime.now()``; pass a fixed value to make a seeded run
            independent of the day it is executed.

    Returns:
        list of ``'YYYY-MM-DD'`` strings, one per entry of ``tenure_months``.
    """
    if today is None:
        today = datetime.now()

    last_logins = []
    for tenure in tenure_months:
        # Recent customers login more frequently (smaller mean gap).
        if tenure < 6:
            mean_gap_days = 10
        elif tenure < 12:
            mean_gap_days = 20
        else:
            mean_gap_days = 35

        days_ago = np.random.exponential(scale=mean_gap_days)
        # Can't login before becoming a customer.
        days_ago = min(days_ago, tenure * 30)

        last_login = today - timedelta(days=int(days_ago))
        last_logins.append(last_login.strftime('%Y-%m-%d'))

    return last_logins

print("✅ Data generation functions defined!")


In [None]:
## 🎯 Churn Logic

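This implements the rule-based scoring system from the PRD: each of the 11 rules a customer triggers adds one point to their churn score, customers scoring 5 or more are labelled `inactive`, and 10% of the labels are then randomly flipped as noise.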


In [None]:
def calculate_churn_score(df, today=None):
    """Calculate churn score based on rule-based logic from PRD.

    Each rule a customer triggers adds one point. A per-rule breakdown is
    printed as a side effect.

    Args:
        df: DataFrame with the columns referenced by the rules below;
            ``last_login`` must be parseable by ``pd.to_datetime``.
        today: Reference date for the "days since last login" rule. Defaults
            to ``datetime.now()``; pass the date used to generate
            ``last_login`` so scoring does not drift with the wall clock.

    Returns:
        numpy float array of length ``len(df)`` holding each customer's score.
    """
    # Calculate days since last login relative to one explicit reference point.
    reference = pd.Timestamp(today if today is not None else datetime.now())
    last_login_dates = pd.to_datetime(df['last_login'])
    days_since_login = (reference - last_login_dates).dt.days

    # Scoring rules from PRD: (label, boolean Series marking affected customers).
    conditions = [
        ('Support tickets > 4', df['support_tickets'] > 4),
        ('Avg resolution time > 48h', df['avg_resolution_time'] > 48),
        ('Critical SLA breaches > 1', df['critical_tickets_sla_breach'] > 1),
        ('Product renewals < 2', df['product_renewals'] < 2),
        ('Tenure < 6 months', df['tenure_months'] < 6),
        ('Monthly spend < $20', df['monthly_spend'] < 20),
        ('Last login > 45 days ago', days_since_login > 45),
        ('Products transferred out > 1', df['products_transferred_out'] > 1),
        ('Avg load time > 5s', df['avg_load_time'] > 5),
        ('Downtime > 120 min', df['downtime_minutes'] > 120),
        ('Product usage < 30%', df['product_usage_percent'] < 30)
    ]

    # Sum up the scores and show breakdown.
    scores = np.zeros(len(df))
    print("🎯 Churn Scoring Rules Applied:")
    for rule_name, condition in conditions:
        rule_score = condition.sum()
        # Add positionally as a plain array rather than mixing ndarray += Series.
        scores += condition.to_numpy(dtype=int)
        print(f"   {rule_name}: {rule_score} customers affected")

    return scores

def assign_churn_status(scores):
    """Turn churn scores into 'active'/'inactive' labels, then flip 10% at random.

    A score of 5 or more is labelled 'inactive'. A tenth of the positions
    (rounded down) are chosen without replacement from NumPy's global RNG and
    have their label inverted. A before/after summary is printed.

    Returns the final labels as a list of strings.
    """
    opposite = {'active': 'inactive', 'inactive': 'active'}

    # Rule-based labels before any noise.
    base_status = []
    for score in scores:
        base_status.append('inactive' if score >= 5 else 'active')

    # Positions whose label gets inverted (unique, because replace=False).
    total = len(base_status)
    flip_positions = set(
        np.random.choice(total, size=int(0.1 * total), replace=False).tolist()
    )

    final_status = [
        opposite[label] if position in flip_positions else label
        for position, label in enumerate(base_status)
    ]

    inactive_before = base_status.count('inactive')
    inactive_after = final_status.count('inactive')
    print(f"\n📊 Churn Assignment Summary:")
    print(f"   Base rule assignment: {inactive_before} inactive / {len(base_status)} total")
    print(f"   After 10% noise: {inactive_after} inactive / {len(final_status)} total")

    return final_status

print("✅ Churn logic functions defined!")


In [None]:
## 🚀 Generate Dataset

Run this cell to generate your synthetic churn dataset:


In [None]:
%%time

# End-to-end pipeline: draw every feature group, assemble one DataFrame, label churn, write the CSV.
# The call order below fixes the order in which the random generators are consumed.
print(f"🎲 Generating {N_CUSTOMERS} customer records...\n")

# Generate all data components
# Each call returns a dict of column name -> values for N_CUSTOMERS rows.
basic_info = generate_basic_info(N_CUSTOMERS)
support_metrics = generate_support_metrics(N_CUSTOMERS)
product_metrics = generate_product_metrics(N_CUSTOMERS)
performance_metrics = generate_performance_metrics(N_CUSTOMERS)

# Combine into DataFrame
# The four dicts are merged into a single mapping before the frame is built.
data = {**basic_info, **support_metrics, **product_metrics, **performance_metrics}
df = pd.DataFrame(data)

# Generate last login based on tenure
# Added after the frame exists because it is derived from the tenure_months column.
df['last_login'] = generate_last_login(df['tenure_months'])

# Calculate churn scores and assign status
# calculate_churn_score reads last_login, so that column must already be present on df.
churn_scores = calculate_churn_score(df)
df['customer_status'] = assign_churn_status(churn_scores)

# Reorder columns to match PRD schema
column_order = [
    'customer_id', 'customer_name', 'customer_email',
    'support_tickets', 'avg_resolution_time', 'critical_tickets_sla_breach',
    'product_renewals', 'tenure_months', 'monthly_spend',
    'last_login', 'total_products', 'products_transferred_out',
    'avg_load_time', 'downtime_minutes', 'product_usage_percent',
    'customer_status'
]

# Selecting by column_order fixes the column sequence of the exported file.
df_final = df[column_order]

# Save to CSV
# index=False keeps the DataFrame's row index out of the file.
df_final.to_csv(OUTPUT_FILE, index=False)

print(f"\n✅ Dataset generated and saved to: {OUTPUT_FILE}")
print(f"📊 Shape: {df_final.shape}")
# Churn rate is the share of rows labelled 'inactive'.
print(f"📈 Churn rate: {(df_final['customer_status'] == 'inactive').mean():.1%}")

# Preview first 5 rows
# display() is provided by IPython/Jupyter; it is not imported in the visible cells.
print(f"\n📋 Preview (first 5 rows):")
display(df_final.head())
