In [1]:
import datetime

import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# Load the beacon export (path is relative to the working directory) and preview
# its first five rows.
data = pd.read_csv(filepath_or_buffer="app-beacon-producto-20221020.csv")
data.head(n=5)

Unnamed: 0,event_date,session_timestamp,event_wday,user_id,user_customer_id,session_id,ga_session_number,customer_type,session_duration_microseconds,completed_transaction,...,quantity,time,total_cost,entered_dm,entered_purposeful_shelves,entered_store,entered_welcome,visited_digi_me,visited_personalised_store,visited_purposeful_shelves
0,2022-07-22,1658457686684002,Friday,0B907DABFE7A49CFB1CA0B43DEFC8A5E,,1658458000.0,1.0,New Visitor,4448006,0,...,,,,,,,,,,
1,2022-07-22,1658457629805002,Friday,2C1503D8D34246B8B2D875980614A0AA,,1658458000.0,1.0,New Visitor,4426006,0,...,,,,,,,,,,
2,2022-07-22,1658500947730002,Friday,FEB0684ADBFD48DC9A373A679CB338B8,,1658501000.0,1.0,New Visitor,4079006,0,...,,,,,,,,,,
3,2022-07-22,1658500958188002,Friday,721D880E47464D10A0DFEF50D52238BA,,1658501000.0,1.0,New Visitor,4388007,0,...,,,,,,,,,,
4,2022-07-22,1658489302812001,Friday,b1ad626d2506be76b5a74b6383335736,,1658489000.0,1.0,New Visitor,7681056002,0,...,,,,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [3]:
# (rows, columns) of the frame exactly as loaded, before sorting or de-duplication.
data.shape

(8512, 37)

In [4]:
# Order rows in descending order of session timestamp, then user id, session id
# and session duration; rows with missing sort keys are placed last.
data = data.sort_values(
    by=[
        'session_timestamp',
        'user_id',
        'session_id',
        'session_duration_microseconds',
    ],
    ascending=False,
    na_position='last',
)

In [5]:
# Collapse rows that repeat the same (timestamp, user, session, duration) key,
# keeping the first occurrence in the current row order.
data_dd = data.drop_duplicates(
    subset=[
        'session_timestamp',
        'user_id',
        'session_id',
        'session_duration_microseconds',
    ],
    keep='first',
)

In [6]:
# Replace every NaN in the de-duplicated frame with 0, then show the resulting shape.
# NOTE(review): the replacement is applied to all columns, not only numeric ones --
# confirm that a 0 in identifier columns such as user_customer_id is intended.
data_dd = data_dd.replace({np.nan: 0})
data_dd.shape

(1820, 37)

In [7]:
# Preview the first five rows of the de-duplicated, NaN-filled frame.
data_dd.head(n=5)

Unnamed: 0,event_date,session_timestamp,event_wday,user_id,user_customer_id,session_id,ga_session_number,customer_type,session_duration_microseconds,completed_transaction,...,quantity,time,total_cost,entered_dm,entered_purposeful_shelves,entered_store,entered_welcome,visited_digi_me,visited_personalised_store,visited_purposeful_shelves
2625,2022-10-09,1665335014882001,Sunday,AB7F5ACDE25841D8BA76B7984F6EC30F,8WWLzutqD5fFQ7aGr40l4Awj9vH2,1665335000.0,4.0,Returning Visitor,6686003,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
250,2022-10-07,1665168886382001,Friday,EC39EB21DB8548C999B0928E26DBF8E0,BAhWxaK8sYVAX11GazwreuG9nAF2,1665169000.0,6.0,Returning Visitor,325003,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,2022-10-07,1665160701104002,Friday,654976C513194D05BE0681398DB14343,w8sHrvUaQvWrFrGIY0o4fQO8bhM2,1665161000.0,1.0,New Visitor,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,2022-10-07,1665160696399002,Friday,654976C513194D05BE0681398DB14343,0,1665161000.0,1.0,New Visitor,466004,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,2022-10-07,1665152901056002,Friday,47084277751B40208460C9DC281B4749,0,1665153000.0,1.0,New Visitor,4442006,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Per-column dtypes of the de-duplicated frame after the NaN -> 0 replacement.
data_dd.dtypes

event_date                         object
session_timestamp                   int64
event_wday                         object
user_id                            object
user_customer_id                   object
session_id                        float64
ga_session_number                 float64
customer_type                      object
session_duration_microseconds       int64
completed_transaction               int64
signed_up                           int64
transaction_value                 float64
pages_visited                       int64
app_removed                         int64
personalise_store_pageviews         int64
purposeful_shelves_pageviews        int64
digi_me_pageviews                   int64
homepage_duration_microseconds      int64
add_to_cart                         int64
scan_qr                             int64
notification_open                   int64
notification_receive                int64
date                              float64
item_id                           

In [9]:
# Build a minimal-mode profiling report of data_dd and export it as an HTML file
# whose name carries today's date. `datetime` is imported in the notebook's
# top import cell rather than here, so all dependencies are declared in one place.
date = datetime.date.today()  # run date; interpolated into the output filename
profile = ProfileReport(data_dd, title="corner-shop-data-report", minimal=True)
profile.to_file(f"corner-shop-data-profile-{date}.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

###### Statistics: are the people who entered the store the only ones who completed a transaction?

In [10]:
# Baseline: row counts for each completed_transaction value across all of data_dd
# (counted on the session_timestamp column).
data_dd.groupby('completed_transaction')[['session_timestamp']].count()

Unnamed: 0_level_0,session_timestamp
completed_transaction,Unnamed: 1_level_1
0,1793
1,27


In [11]:
# Rows flagged (== 1) in at least one of the four "entered_*" columns.
entered_shop = data_dd[
    data_dd[['entered_dm', 'entered_purposeful_shelves',
             'entered_store', 'entered_welcome']].eq(1).any(axis=1)
]

In [12]:
# Row counts for each completed_transaction value among rows in entered_shop
# (counted on the session_timestamp column).
entered_shop.groupby('completed_transaction')[['session_timestamp']].count()

Unnamed: 0_level_0,session_timestamp
completed_transaction,Unnamed: 1_level_1
0,891
1,22


In [13]:
# Complement of entered_shop: every row of data_dd whose row label is absent from
# entered_shop. Row labels are compared instead of session_timestamp values so that
# a row is never excluded merely because it shares a timestamp with an
# entered-shop row (session_timestamp alone is not the de-duplication key).
# NOTE(review): assumes data_dd still carries the unique row labels assigned at
# load time -- confirm the index is not reset or duplicated elsewhere.
not_entered_shop = data_dd[~data_dd.index.isin(entered_shop.index)]

In [14]:
# Row counts for each completed_transaction value among rows in not_entered_shop
# (counted on the session_timestamp column).
not_entered_shop.groupby('completed_transaction')[['session_timestamp']].count()

Unnamed: 0_level_0,session_timestamp
completed_transaction,Unnamed: 1_level_1
0,902
1,5
