Data Generation (Mock Data)

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Pin NumPy's global (legacy) RNG so the np.random draws below are repeatable.
np.random.seed(seed=42)

def generate_mock_dataset(n=100000, now=None):
    """Build a synthetic cart-abandonment dataset with one row per user.

    Random values are drawn from NumPy's global legacy RNG (``np.random``), so
    call ``np.random.seed(...)`` before this function for repeatable values.
    The subset of rows moved into the 7-day window is selected with a fixed
    ``random_state=42``.

    Parameters
    ----------
    n : int, default 100000
        Number of rows (users) to generate.
    now : timestamp-like, optional
        Reference "current time" that anchors every date window; anything
        accepted by ``pd.Timestamp`` works. Defaults to ``pd.Timestamp.now()``,
        in which case the generated dates depend on when the function runs.
        Pass a fixed value to get identical dates on every run.

    Returns
    -------
    pandas.DataFrame
        Columns: ``user_id``, ``cart_abandoned_date``, ``last_order_date``,
        ``avg_order_value``, ``sessions_last_30d``, ``num_cart_items``,
        ``engagement_score``, ``profitability_score``.
    """
    now = pd.Timestamp.now() if now is None else pd.Timestamp(now)
    start_date = now - pd.Timedelta(days=30)

    # Candidate dates are one day apart (date_range's default frequency) and
    # all share the time-of-day of ``now``.
    abandon_window = pd.date_range(start_date, now)
    order_window = pd.date_range(start_date - pd.Timedelta(days=90), now)
    recent_window = pd.date_range(now - pd.Timedelta(days=7), now)

    # NOTE: the order of the np.random calls below determines the generated
    # values for a given seed; keep it stable.
    data = {
        'user_id': [f'U{i:06d}' for i in range(1, n + 1)],
        'cart_abandoned_date': pd.to_datetime(
            np.random.choice(abandon_window, size=n)),
        'last_order_date': pd.to_datetime(
            np.random.choice(order_window, size=n)),
        'avg_order_value': np.random.lognormal(mean=7.5, sigma=0.8, size=n),  # Skewed like real AOV
        'sessions_last_30d': np.random.poisson(lam=5, size=n),
        'num_cart_items': np.random.randint(1, 10, size=n),  # upper bound exclusive -> 1..9
        'engagement_score': np.clip(np.random.normal(0.5, 0.2, size=n), 0.0, 1.0),
        'profitability_score': np.clip(np.random.normal(0.6, 0.15, size=n), 0.0, 1.0)
    }

    df = pd.DataFrame(data)

    # Ensure some cart abandonments in last 7 days: a 30% sample of rows gets
    # its abandonment date redrawn from the 7-day window.
    recent_abandoners = df.sample(frac=0.3, random_state=42).index
    df.loc[recent_abandoners, 'cart_abandoned_date'] = np.random.choice(
        recent_window, size=len(recent_abandoners))

    return df

# Generate the dataset, persist it as CSV next to the notebook, and preview it.
df = generate_mock_dataset(100000)
df.to_csv("mock_cart_abandoners.csv", index=False)
# Report the actual row count so the message cannot drift from the data.
print(f"✅ Mock dataset generated: {len(df):,} rows")
print(df.head())

✅ Mock dataset generated: 100,000 rows
   user_id        cart_abandoned_date            last_order_date  \
0  U000001 2025-08-27 20:30:42.005843 2025-06-26 20:30:42.005843   
1  U000002 2025-09-09 20:30:42.005843 2025-09-17 20:30:42.005843   
2  U000003 2025-09-18 20:30:42.005843 2025-07-08 20:30:42.005843   
3  U000004 2025-09-04 20:30:42.005843 2025-07-13 20:30:42.005843   
4  U000005 2025-08-31 20:30:42.005843 2025-07-30 20:30:42.005843   

   avg_order_value  sessions_last_30d  num_cart_items  engagement_score  \
0      2064.483703                  9               3          0.136407   
1      5194.792963                  2               3          0.277935   
2       656.226466                  7               5          0.294941   
3      1222.339164                  7               6          0.338720   
4      1599.186873                  3               2          0.365381   

   profitability_score  
0             0.633541  
1             0.567404  
2             0.640938  
3