In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Reproducibility
np.random.seed(1170)
random.seed(1170)

n = 1000
today = datetime(2025, 6, 14)

# 1. Customer IDs
customer_ids = np.arange(1, n + 1)

# 2. Signup dates: random over last ~3 years
signup_dates = [today - timedelta(days=int(np.random.uniform(0, 3 * 365))) for _ in range(n)]

# 3. Plan type with probabilities
plan_types = np.random.choice(['free', 'basic', 'premium', 'enterprise'], size=n, p=[0.4, 0.3, 0.2, 0.1])

# 4. Last login dates based on plan type
last_login_dates = []
for i in range(n):
    sd = signup_dates[i]
    plan = plan_types[i]
    prob_active = {'free': 0.75, 'basic': 0.85, 'premium': 0.92, 'enterprise': 0.96}[plan]
    if np.random.rand() < prob_active:
        login_gap = int(np.random.exponential(scale=60))  # average of 60 days between last login
        login_date = min(sd + timedelta(days=login_gap), today)
        last_login_dates.append(login_date)
    else:
        last_login_dates.append(None)

# 5. Job post count increases with plan tier
plan_post_lambda = {'free': 1, 'basic': 3, 'premium': 5, 'enterprise': 7}
job_posts = [np.random.poisson(plan_post_lambda[plan]) for plan in plan_types]

# 6. Applications received ~ Poisson, scaled to job posts
applications_received = [posts * np.random.poisson(10) for posts in job_posts]

# 7. Support tickets ~ linked to plan + job_posts
support_ticket_scale = {'free': 0.1, 'basic': 0.3, 'premium': 0.6, 'enterprise': 0.9}
support_tickets = [
    np.random.poisson(support_ticket_scale[plan] * max(posts, 1))
    for plan, posts in zip(plan_types, job_posts)
]

# 8. Churn definition: no login in past 90 days
churn_cutoff = today - timedelta(days=90)
is_churned = [
    True if (ld is None or ld < churn_cutoff) else False
    for ld in last_login_dates
]

# 9. Final DataFrame
df = pd.DataFrame({
    'customer_id': customer_ids,
    'signup_date': signup_dates,
    'last_login_date': last_login_dates,
    'job_posts': job_posts,
    'applications_received': applications_received,
    'support_tickets': support_tickets,
    'plan_type': plan_types,
    'is_churned': is_churned
})

# Save
csv_filename = 'customer_data.csv'
df.to_csv(csv_filename, index=False)

# Display
print("Sample data:")
print(df.head())
print(f"\nSaved to {csv_filename}")
print(f"Churn rate: {df['is_churned'].mean():.2%}")
print(f"Shape: {df.shape}")



Sample data:
   customer_id signup_date last_login_date  job_posts  applications_received  \
0            1  2022-10-17      2022-11-02          1                      5   
1            2  2022-10-04      2023-01-05          5                     75   
2            3  2022-09-21      2023-01-07          6                     42   
3            4  2023-05-24      2023-07-03          6                     72   
4            5  2024-08-31      2024-10-30          5                     45   

   support_tickets plan_type  is_churned  
0                0      free        True  
1                4   premium        True  
2                3   premium        True  
3                3   premium        True  
4                3     basic        True  

Saved to customer_data.csv
Churn rate: 90.00%
Shape: (1000, 8)
