In [None]:
from datasets import load_dataset
import pandas as pd

# Remove pandas' cap on the number of columns shown when a frame is displayed.
pd.options.display.max_columns = None

# Load the Criteo attribution dataset and print the resulting DatasetDict
# (its splits, feature names and row counts).
ds = load_dataset(path="criteo/criteo-attribution-dataset")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['timestamp', 'uid', 'campaign', 'conversion', 'conversion_timestamp', 'conversion_id', 'attribution', 'click', 'click_pos', 'click_nb', 'cost', 'cpo', 'time_since_last_click', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9'],
        num_rows: 16468027
    })
})


In [None]:
# Turn the "train" split into a pandas DataFrame, which is easier to explore,
# and print its first five rows.
train_split = ds["train"]
df = train_split.to_pandas()
print(df.head(5))

   timestamp       uid  campaign  conversion  conversion_timestamp  \
0          0  20073966  22589171           0                    -1   
1          2  24607497    884761           0                    -1   
2          2  28474333  18975823           0                    -1   
3          3   7306395  29427842           1               1449193   
4          3  25357769  13365547           0                    -1   

   conversion_id  attribution  click  click_pos  click_nb  ...  \
0             -1            0      0         -1        -1  ...   
1             -1            0      0         -1        -1  ...   
2             -1            0      0         -1        -1  ...   
3        3063962            0      1          0         7  ...   
4             -1            0      0         -1        -1  ...   

   time_since_last_click      cat1      cat2      cat3      cat4      cat5  \
0                     -1   5824233   9312274   3490278  29196072  11409686   
1                 423858  

In [9]:
# Build the list of columns to discard: the categorical features cat1..cat9,
# followed by time_since_last_click, click_pos and click_nb.
categorical_cols = [f"cat{i}" for i in range(1, 10)]
click_detail_cols = ['time_since_last_click', 'click_pos', 'click_nb']

# df itself is left untouched; df_clean is a new frame without those columns.
df_clean = df.drop(columns=categorical_cols + click_detail_cols)
print(df_clean.head())

   timestamp       uid  campaign  conversion  conversion_timestamp  \
0          0  20073966  22589171           0                    -1   
1          2  24607497    884761           0                    -1   
2          2  28474333  18975823           0                    -1   
3          3   7306395  29427842           1               1449193   
4          3  25357769  13365547           0                    -1   

   conversion_id  attribution  click      cost       cpo  
0             -1            0      0  0.000010  0.390794  
1             -1            0      0  0.000010  0.059600  
2             -1            0      0  0.000183  0.149706  
3        3063962            0      1  0.000094  0.154785  
4             -1            0      0  0.000032  0.037583  


In [10]:
# Convert the negative placeholder values in the conversion columns to nulls.
cols = ['conversion_timestamp','conversion_id']

# Decide which cells to blank out while the columns still hold their current
# values (True wherever the value is negative).
is_negative = df_clean[cols].lt(0)

# Cast to the nullable Int64 dtype *before* masking. Masking a plain int64
# column upcasts it to float64 so it can hold NaN, and float64 cannot
# represent every large integer exactly; Int64 keeps the values as integers
# and stores the missing entries as <NA>.
df_clean[cols] = df_clean[cols].astype("Int64").mask(is_negative)

# Sanity check: every negative value in the raw frame must now be a null.
nulls = df_clean[cols].isna().sum()
negs = df[cols].lt(0).sum()
print(f"sanity checks\n nulls:\n{nulls}\n negs:\n{negs}")
# Enforce the check instead of relying on someone comparing the printed
# counts by eye; the print above stays so the counts are visible on failure.
assert (nulls == negs).all(), "null counts do not match the negative counts in the raw frame"

sanity checks
 nulls:
conversion_timestamp    15661831
conversion_id           15661831
dtype: int64
 negs:
conversion_timestamp    15661831
conversion_id           15661831
dtype: int64


In [11]:
# Datatype conversion to pandas' nullable extension dtypes.

# Flag columns (0/1 in the previews above) become nullable booleans.
bool_cols = ["click","conversion","attribution"]
df_clean[bool_cols] = df_clean[bool_cols].astype("boolean")

# Conversion columns become nullable integers, so missing entries are <NA>.
int_cols = ['conversion_timestamp','conversion_id']
df_clean[int_cols] = df_clean[int_cols].astype("Int64")

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16468027 entries, 0 to 16468026
Data columns (total 10 columns):
 #   Column                Dtype  
---  ------                -----  
 0   timestamp             int64  
 1   uid                   int64  
 2   campaign              int64  
 3   conversion            boolean
 4   conversion_timestamp  Int64  
 5   conversion_id         Int64  
 6   attribution           boolean
 7   click                 boolean
 8   cost                  float64
 9   cpo                   float64
dtypes: Int64(2), boolean(3), float64(2), int64(3)
memory usage: 1005.1 MB
None


In [None]:
# Rich (HTML) preview of the first five rows of the cleaned frame.
display(df_clean.head(n=5))

   timestamp       uid  campaign  conversion  conversion_timestamp  \
0          0  20073966  22589171       False                  <NA>   
1          2  24607497    884761       False                  <NA>   
2          2  28474333  18975823       False                  <NA>   
3          3   7306395  29427842        True               1449193   
4          3  25357769  13365547       False                  <NA>   

   conversion_id  attribution  click      cost       cpo  
0           <NA>        False  False  0.000010  0.390794  
1           <NA>        False  False  0.000010  0.059600  
2           <NA>        False  False  0.000183  0.149706  
3        3063962        False   True  0.000094  0.154785  
4           <NA>        False  False  0.000032  0.037583  
