In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta


np.random.seed(42)


n_users = 5000
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 12, 31)


user_ids = np.arange(1, n_users+1)


signup_dates = [start_date + timedelta(days=np.random.randint(0, (end_date-start_date).days))
                for _ in range(n_users)]


plans = ['Free', 'Basic', 'Pro', 'Enterprise']
plan_probs = [0.4, 0.3, 0.2, 0.1]
user_plans = np.random.choice(plans, size=n_users, p=plan_probs)


plan_revenue = {'Free': 0, 'Basic': 20, 'Pro': 50, 'Enterprise': 120}
revenues = [plan_revenue[plan] for plan in user_plans]


churn_probs = {'Free': 0.6, 'Basic': 0.3, 'Pro': 0.2, 'Enterprise': 0.1}
churned = [np.random.choice([1, 0], p=[churn_probs[plan], 1-churn_probs[plan]]) for plan in user_plans]


sessions = [np.random.poisson(lam=(10 if plan != 'Free' else 3)) for plan in user_plans]


data = pd.DataFrame({
    'user_id': user_ids,
    'signup_date': signup_dates,
    'plan': user_plans,
    'monthly_revenue': revenues,
    'churned': churned,
    'sessions_last_month': sessions
})


data.to_csv("saas_customer_data.csv", index=False)

data.head()


Unnamed: 0,user_id,signup_date,plan,monthly_revenue,churned,sessions_last_month
0,1,2023-01-31,Basic,20,0,12
1,2,2023-12-30,Basic,20,0,14
2,3,2022-05-10,Enterprise,120,0,9
3,4,2023-07-18,Free,0,1,5
4,5,2023-02-04,Free,0,0,1
