In [92]:
from datasets import load_dataset
import pandas as pd

# Remove the cap on how many columns pandas renders, so wide frames are
# displayed in full rather than elided.
pd.options.display.max_columns = None

# Load the Criteo attribution dataset and echo its split/feature summary.
ds = load_dataset("criteo/criteo-attribution-dataset")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['timestamp', 'uid', 'campaign', 'conversion', 'conversion_timestamp', 'conversion_id', 'attribution', 'click', 'click_pos', 'click_nb', 'cost', 'cpo', 'time_since_last_click', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9'],
        num_rows: 16468027
    })
})


In [83]:
# Materialise the "train" split as a pandas DataFrame for easier exploration,
# then render its leading rows.
# NOTE(review): the recorded output for this cell still elides columns ("..."),
# and its execution count (83) precedes that of the cell setting
# display.max_columns (92) -- presumably the cells were run out of order;
# confirm with Restart Kernel -> Run All.
train_split = ds["train"]
df = train_split.to_pandas()
display(df.head())


Unnamed: 0,timestamp,uid,campaign,conversion,conversion_timestamp,conversion_id,attribution,click,click_pos,click_nb,...,time_since_last_click,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
0,0,20073966,22589171,0,-1,-1,0,0,-1,-1,...,-1,5824233,9312274,3490278,29196072,11409686,1973606,25162884,29196072,29196072
1,2,24607497,884761,0,-1,-1,0,0,-1,-1,...,423858,30763035,9312274,14584482,29196072,11409686,1973606,22644417,9312274,21091111
2,2,28474333,18975823,0,-1,-1,0,0,-1,-1,...,8879,138937,9312274,10769841,29196072,5824237,138937,1795451,29196072,15351056
3,3,7306395,29427842,1,1449193,3063962,0,1,0,7,...,-1,28928366,26597095,12435261,23549932,5824237,1973606,9180723,29841067,29196072
4,3,25357769,13365547,0,-1,-1,0,0,-1,-1,...,-1,138937,26597094,31616034,29196072,11409684,26597096,4480345,29196072,29196072


In [85]:
# Report the frame's dimensions first, then the dtype of each column.
frame_shape = df.shape
print("\nShape:", frame_shape)
column_dtypes = df.dtypes
print("\nData Types:\n", column_dtypes)


Shape: (16468027, 22)

Data Types:
 timestamp                  int64
uid                        int64
campaign                   int64
conversion                 int64
conversion_timestamp       int64
conversion_id              int64
attribution                int64
click                      int64
click_pos                  int64
click_nb                   int64
cost                     float64
cpo                      float64
time_since_last_click      int64
cat1                       int64
cat2                       int64
cat3                       int64
cat4                       int64
cat5                       int64
cat6                       int64
cat7                       int64
cat8                       int64
cat9                       int64
dtype: object


In [88]:
# Count NaN/None entries per column and list the 20 largest counts.
# NOTE(review): only the first 20 entries are printed; the frame is reported
# elsewhere in this notebook as having 22 columns, so two are not shown.
# NOTE(review): the sample rows displayed in this notebook contain -1 in
# columns such as conversion_timestamp and click_pos. isna() does not count
# those, so if -1 is a missing-value sentinel this report understates
# missingness -- confirm against the dataset documentation.
null_counts = df.isna().sum()
largest_null_counts = null_counts.sort_values(ascending=False).iloc[:20]
print("Missing Values:\n", largest_null_counts)

Missing Values:
 timestamp                0
uid                      0
campaign                 0
conversion               0
conversion_timestamp     0
conversion_id            0
attribution              0
click                    0
click_pos                0
click_nb                 0
cost                     0
cpo                      0
time_since_last_click    0
cat1                     0
cat2                     0
cat3                     0
cat4                     0
cat5                     0
cat6                     0
cat7                     0
dtype: int64


In [89]:
# Distribution of the core flag columns; NaN (if any) is counted as its own
# bucket rather than dropped.
flag_columns = ("click", "conversion", "attribution")
for col in flag_columns:
    print(f"\nValue counts for {col}:\n")
    flag_counts = df[col].value_counts(dropna=False)
    print(flag_counts)


Value counts for click:

click
0    10520464
1     5947563
Name: count, dtype: int64

Value counts for conversion:

conversion
0    15661831
1      806196
Name: count, dtype: int64

Value counts for attribution:

attribution
0    16025603
1      442424
Name: count, dtype: int64


In [91]:
# Smallest and largest values of the timestamp column, checked to size the
# batch processing window.
print("\nTimestamp range:")
timestamps = df["timestamp"]
print(f"Min timestamp:{timestamps.min()}")
print(f"Max timestamp:{timestamps.max()}")


Timestamp range:
Min timestamp:0
Max timestamp:2671199


In [90]:
# Print every column label of the frame as a single Python list.
print("\nColumns:")
column_names = [*df.columns]
print(column_names)


Columns:
['timestamp', 'uid', 'campaign', 'conversion', 'conversion_timestamp', 'conversion_id', 'attribution', 'click', 'click_pos', 'click_nb', 'cost', 'cpo', 'time_since_last_click', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']
