In [None]:

import pandas as pd


# Funnel stages, listed in the order a user is expected to move through them.
funnel_stages = [
    'landing_page',
    'product_view',
    'add_to_cart',
    'checkout_start',
    'purchase_complete',
]
# 1-based position of each stage within ``funnel_stages``.
stage_map = {
    stage: position
    for position, stage in enumerate(funnel_stages, start=1)
}


def preprocess_funnel(events, *, verbose=False):
    """Return funnel events sorted per user with derived stage/time columns.

    The input frame is not modified; a sorted copy is returned with three
    extra columns:

    * ``stage_num``    - position of ``event`` in ``funnel_stages``
      (1-based); events that are not listed there map to NaN.
    * ``max_stage``    - highest ``stage_num`` seen for the row's user.
    * ``time_to_next`` - time until the same user's next event, NaT on a
      user's last event.

    Args:
        events: DataFrame with ``user_id``, ``event`` and ``timestamp``
            columns. ``timestamp`` must support subtraction (the script
            below parses it as datetimes when loading the CSV).
        verbose: When true, print a progress message before each step.

    Returns:
        A new DataFrame ordered by ``user_id`` then ``timestamp``; the
        original index labels are kept.
    """
    if verbose:
        print("Sorting data...")
    ordered = events.sort_values(['user_id', 'timestamp'])

    ordered['stage_num'] = ordered['event'].map(stage_map)

    if verbose:
        print("Calculating completion status...")
    ordered['max_stage'] = (
        ordered.groupby('user_id')['stage_num'].transform('max')
    )

    if verbose:
        print("Calculating time differences...")
    # Shift inside each user's group so a gap can never be computed across
    # two different users, regardless of how the rows are laid out.
    next_timestamp = ordered.groupby('user_id')['timestamp'].shift(-1)
    ordered['time_to_next'] = next_timestamp - ordered['timestamp']

    return ordered


if __name__ == "__main__":
    print("Loading dataset...")
    df = pd.read_csv('funnel_data.csv', parse_dates=['timestamp'])

    df = preprocess_funnel(df, verbose=True)

    df.to_csv('processed_funnel_data.csv', index=False)
    print("Preprocessing complete!")
    print(f"Processed data shape: {df.shape}")
    print(df.head())

Loading dataset...
Sorting data...
Calculating completion status...
Calculating time differences...
Preprocessing complete!
Processed data shape: (11495, 9)
    user_id         event           timestamp   device  traffic_source  \
0    user_1  landing_page 2023-05-21 04:38:00   Tablet           Email   
1    user_1  product_view 2023-05-21 05:00:00   Tablet           Email   
2    user_1   add_to_cart 2023-05-21 05:19:00   Tablet           Email   
21  user_10  landing_page 2023-06-10 16:07:00  Desktop  Organic Search   
22  user_10  product_view 2023-06-10 16:52:00  Desktop  Organic Search   

   country  stage_num  max_stage    time_to_next  
0       CA          1          3 0 days 00:22:00  
1       CA          2          3 0 days 00:19:00  
2       CA          3          3             NaT  
21      AU          1          5 0 days 00:45:00  
22      AU          2          5 0 days 00:19:00  
