In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Seed the stdlib and NumPy generators (in that order) so runs are repeatable
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(42)

# Simulation parameters
n_users = 1000                      # how many user journeys to generate
start_date = datetime(2025, 7, 1)   # base date the simulated timestamps are offset from
event_steps = [                     # funnel stages, in the order a user goes through them
    'visit',
    'signup',
    'try_feature',
    'purchase',
]

# Helper function to simulate user journey
def simulate_user_journey(user_id, steps=None, start=None, base_prob=0.9, step_decay=0.2):
    """Simulate one user's pass through the funnel and return its event records.

    The user gets a random entry time offset from ``start`` by 0-15 days,
    0-23 hours and 0-59 minutes. The funnel steps are then attempted in order.
    Step ``i`` (0-based) happens with probability ``base_prob - i * step_decay``
    *given that every earlier step happened*, and the journey stops at the
    first step that does not happen. With the defaults the conditional
    probabilities are 0.9, 0.7, 0.5, 0.3, ...; they are not the overall share
    of users reaching each step.

    Parameters
    ----------
    user_id
        Identifier copied into every event record.
    steps : sequence of str, optional
        Ordered funnel step names. Defaults to the module-level ``event_steps``.
    start : datetime, optional
        Base date for the entry time. Defaults to the module-level ``start_date``.
    base_prob : float, optional
        Probability that the first step happens.
    step_decay : float, optional
        Amount subtracted from the probability for each subsequent step.

    Returns
    -------
    list of dict
        One ``{'user_id', 'event', 'timestamp'}`` record per completed step,
        in funnel order. Empty when the first step does not happen.
    """
    # Fall back to the notebook-level settings only when the caller gave none,
    # so existing single-argument calls behave exactly as before.
    steps = event_steps if steps is None else steps
    start = start_date if start is None else start

    # Keyword arguments are evaluated left to right, so the RNG draw order is
    # days, hours, minutes; keep it that way to preserve seeded results.
    current_time = start + timedelta(
        days=random.randint(0, 15),
        hours=random.randint(0, 23),
        minutes=random.randint(0, 59),
    )

    events = []
    for i, step in enumerate(steps):
        # Drop-off: the user abandons the funnel at the first failed draw.
        if random.random() >= base_prob - i * step_decay:
            break
        events.append({
            'user_id': user_id,
            'event': step,
            'timestamp': current_time
        })
        # Each completed step is followed by a 1-20 minute gap before the next.
        current_time += timedelta(minutes=random.randint(1, 20))
    return events

# Simulate every user (ids 1..n_users) and flatten the per-user event lists
all_events = [
    record
    for uid in range(1, n_users + 1)
    for record in simulate_user_journey(uid)
]

# One row per event
df = pd.DataFrame(all_events)

# Preview the first rows together with the number of events recorded per step
df.head(), df['event'].value_counts()


(   user_id   event           timestamp
 0        1   visit 2025-07-04 00:47:00
 1        1  signup 2025-07-04 00:55:00
 2        2   visit 2025-07-03 18:27:00
 3        2  signup 2025-07-03 18:30:00
 4        3   visit 2025-07-07 22:41:00,
 event
 visit          915
 signup         649
 try_feature    334
 purchase       113
 Name: count, dtype: int64)

In [4]:
# Write the simulated events to CSV, leaving out the DataFrame index column
output_path = 'user_journey_data.csv'
df.to_csv(output_path, index=False)