# DeepCTR Torch Criteo Example

In [1]:
import pandas as pd
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import DeepFM

## Step 1: Import model

In [2]:
criteo_data = pd.read_csv("data/criteo_sample.txt")
criteo_data.head(15)

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,,3,260.0,,17668.0,,,33.0,,...,e5ba7672,87c6f83c,,,0429f84b,,3a171ecb,c0d61a5c,,
1,0,,-1,19.0,35.0,30251.0,247.0,1.0,35.0,160.0,...,d4bb7bd8,6fc84bfb,,,5155d8a3,,be7c41b4,ded4aac9,,
2,0,0.0,0,2.0,12.0,2013.0,164.0,6.0,35.0,523.0,...,e5ba7672,675c9258,,,2e01979f,,bcdee96c,6d5d1302,,
3,0,,13,1.0,4.0,16836.0,200.0,5.0,4.0,29.0,...,e5ba7672,52e44668,,,e587c466,,32c7478e,3b183c5c,,
4,0,0.0,0,104.0,27.0,1990.0,142.0,4.0,32.0,37.0,...,e5ba7672,25c88e42,21ddcdc9,b1252a9d,0e8585d2,,32c7478e,0d4a6d1a,001f3601,92c878de
5,0,0.0,-1,63.0,40.0,1470.0,61.0,4.0,37.0,46.0,...,e5ba7672,d3303ea5,21ddcdc9,b1252a9d,7633c7c8,,32c7478e,17f458f7,001f3601,71236095
6,0,0.0,370,4.0,1.0,1787.0,65.0,14.0,25.0,489.0,...,3486227d,642f2610,55dd3565,b1252a9d,5c8dc711,,423fab69,45ab94c8,2bf691b1,c84c4aec
7,1,19.0,10,30.0,10.0,1.0,3.0,33.0,47.0,126.0,...,e5ba7672,a78bd508,21ddcdc9,5840adea,c2a93b37,,32c7478e,1793a828,e8b83407,2fede552
8,0,0.0,0,36.0,22.0,4684.0,217.0,9.0,35.0,135.0,...,e5ba7672,7ce63c71,,,af5dc647,,dbb486d7,1793a828,,
9,0,2.0,11,8.0,23.0,30.0,11.0,2.0,8.0,23.0,...,07c540c4,c21c3e4c,21ddcdc9,a458ea53,31c8e642,,c7dc6720,3e983c86,9b3e8820,d597922b


In [3]:
criteo_data.describe()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
count,200.0,110.0,200.0,166.0,165.0,194.0,149.0,190.0,200.0,190.0,110.0,190.0,43.0,165.0
mean,0.245,2.318182,103.69,42.542169,8.775758,16741.190722,132.033557,12.768421,12.6,111.389474,0.554545,2.436842,0.534884,11.618182
std,0.431166,4.682452,423.37541,227.290139,11.193068,54282.460626,287.172855,33.757289,13.261574,173.446898,0.671659,3.893246,1.241137,15.731541
min,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,3.0,2.0,213.75,9.0,0.25,2.0,10.0,0.0,0.0,0.0,2.0
50%,0.0,1.0,2.5,6.0,5.0,2066.0,37.0,3.5,7.0,45.5,0.0,1.0,0.0,5.0
75%,0.0,2.0,29.25,22.75,12.0,10710.0,112.0,11.0,19.0,122.75,1.0,3.0,1.0,16.0
max,1.0,37.0,3001.0,2815.0,87.0,507333.0,2106.0,301.0,49.0,1034.0,3.0,32.0,7.0,102.0


In [4]:
criteo_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 40 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   200 non-null    int64  
 1   I1      110 non-null    float64
 2   I2      200 non-null    int64  
 3   I3      166 non-null    float64
 4   I4      165 non-null    float64
 5   I5      194 non-null    float64
 6   I6      149 non-null    float64
 7   I7      190 non-null    float64
 8   I8      200 non-null    float64
 9   I9      190 non-null    float64
 10  I10     110 non-null    float64
 11  I11     190 non-null    float64
 12  I12     43 non-null     float64
 13  I13     165 non-null    float64
 14  C1      200 non-null    object 
 15  C2      200 non-null    object 
 16  C3      191 non-null    object 
 17  C4      191 non-null    object 
 18  C5      200 non-null    object 
 19  C6      168 non-null    object 
 20  C7      200 non-null    object 
 21  C8      200 non-null    object 
 22  C9

## Step 2: Simple preprocessing

In [5]:
sparse_features = ["C" + str(i) for i in range(1, 27)]
dense_features = ["I" + str(i) for i in range(1, 14)]

# Fill sparse features with -1
criteo_data[sparse_features] = criteo_data[sparse_features].fillna(
    "-1",
)
# Fill dense features with 0
criteo_data[dense_features] = criteo_data[dense_features].fillna(
    0,
)
target = ["label"]

# 1.Label Encoding for sparse features, and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    criteo_data[feat] = lbe.fit_transform(criteo_data[feat])
    
mms = MinMaxScaler(feature_range=(0, 1))
criteo_data[dense_features] = mms.fit_transform(criteo_data[dense_features])

In [6]:
criteo_data.head(15)

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,0.0,0.001332,0.092362,0.0,0.034825,0.0,0.0,0.673469,0.0,...,8,66,0,0,3,0,1,96,0,0
1,0,0.0,0.0,0.00675,0.402299,0.059628,0.117284,0.003322,0.714286,0.154739,...,7,52,0,0,47,0,7,112,0,0
2,0,0.0,0.000333,0.00071,0.137931,0.003968,0.077873,0.019934,0.714286,0.505803,...,8,49,0,0,25,0,6,53,0,0
3,0,0.0,0.004664,0.000355,0.045977,0.033185,0.094967,0.016611,0.081633,0.028046,...,8,37,0,0,156,0,0,32,0,0
4,0,0.0,0.000333,0.036945,0.310345,0.003922,0.067426,0.013289,0.653061,0.035783,...,8,14,5,3,9,0,0,5,1,47
5,0,0.0,0.0,0.02238,0.45977,0.002898,0.028965,0.013289,0.755102,0.044487,...,8,105,5,3,77,0,0,13,1,34
6,0,0.0,0.123584,0.001421,0.011494,0.003522,0.030864,0.046512,0.510204,0.472921,...,4,46,18,3,58,0,2,41,3,71
7,1,0.513514,0.003664,0.010657,0.114943,2e-06,0.001425,0.109635,0.959184,0.121857,...,8,80,5,1,128,0,0,12,16,17
8,0,0.0,0.000333,0.012789,0.252874,0.009233,0.103039,0.0299,0.714286,0.130561,...,8,57,0,0,113,0,9,12,0,0
9,0,0.054054,0.003997,0.002842,0.264368,5.9e-05,0.005223,0.006645,0.163265,0.022244,...,0,95,5,2,28,0,8,34,11,74


## Step 3: Generate feature columns

In [7]:
# 2.count #unique features for each sparse field,and record dense feature field name
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=criteo_data[feat].nunique(), embedding_dim=4)
    for i, feat in enumerate(sparse_features)
] + [
    DenseFeat(
        feat,
        1,
    )
    for feat in dense_features
]

for feature in fixlen_feature_columns:
    print(f"FEATURE: {feature}")

FEATURE: SparseFeat(name='C1', vocabulary_size=27, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C1', group_name='default_group')
FEATURE: SparseFeat(name='C2', vocabulary_size=92, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C2', group_name='default_group')
FEATURE: SparseFeat(name='C3', vocabulary_size=172, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C3', group_name='default_group')
FEATURE: SparseFeat(name='C4', vocabulary_size=157, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C4', group_name='default_group')
FEATURE: SparseFeat(name='C5', vocabulary_size=12, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C5', group_name='default_group')
FEATURE: SparseFeat(name='C6', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='C6', group_name='default_group')
FEATURE: SparseFeat(name='C7', vocabulary_size=183, embedding_dim=4, use_hash=False, dtype='int32', embedding_n

In [8]:
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
print(f"Feature names: {feature_names}")

Feature names: ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']


**Feature names:** ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']

## Step 4: Generate the training samples and train the model

In [9]:
train, test = train_test_split(criteo_data, test_size=0.2, random_state=1773)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

In [10]:
print(train_model_input)

{'C1': 83     17
46      0
98      0
150    11
151    25
       ..
32      0
197     0
164     0
175     0
51     14
Name: C1, Length: 160, dtype: int64, 'C2': 83     67
46     35
98     30
150    15
151    50
       ..
32     27
197     5
164    57
175    76
51     18
Name: C2, Length: 160, dtype: int64, 'C3': 83     152
46     102
98      83
150    118
151    160
      ... 
32     155
197    153
164     14
175    146
51      71
Name: C3, Length: 160, dtype: int64, 'C4': 83       3
46      67
98     113
150    135
151      6
      ... 
32      69
197    143
164     38
175    117
51     130
Name: C4, Length: 160, dtype: int64, 'C5': 83     1
46     6
98     5
150    6
151    1
      ..
32     7
197    1
164    3
175    1
51     1
Name: C5, Length: 160, dtype: int64, 'C6': 83     4
46     3
98     0
150    0
151    2
      ..
32     6
197    0
164    4
175    4
51     4
Name: C6, Length: 160, dtype: int64, 'C7': 83     132
46     151
98      47
150    175
151    147
      ... 
32      3

In [16]:
use_cuda = True
if use_cuda and torch.cuda.is_available():
    device = "cuda:0"
    print(f"PyTorch: Cuda ready, device={device}")
else:
    device = "cpu"

model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task="binary",
    l2_reg_embedding=1e-5,
    device=device,
)
model.compile("adagrad", "binary_crossentropy", metrics=["binary_crossentropy", "auc"])

history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=32,
    epochs=10,
    verbose=2,
    validation_split=0.2,
)

PyTorch: Cuda ready, device=cuda:0
cuda:0
Train on 128 samples, validate on 32 samples, 4 steps per epoch
Epoch 1/10
0s - loss:  0.6516 - binary_crossentropy:  0.6516 - auc:  0.4290 - val_binary_crossentropy:  0.5785 - val_auc:  0.6042
Epoch 2/10
0s - loss:  0.5137 - binary_crossentropy:  0.5137 - auc:  0.9621 - val_binary_crossentropy:  0.5494 - val_auc:  0.6406
Epoch 3/10
0s - loss:  0.4001 - binary_crossentropy:  0.4001 - auc:  0.9792 - val_binary_crossentropy:  0.5627 - val_auc:  0.5781
Epoch 4/10
0s - loss:  0.2507 - binary_crossentropy:  0.2507 - auc:  0.9932 - val_binary_crossentropy:  0.5799 - val_auc:  0.5833
Epoch 5/10
0s - loss:  0.1472 - binary_crossentropy:  0.1472 - auc:  1.0000 - val_binary_crossentropy:  0.6124 - val_auc:  0.5729
Epoch 6/10
0s - loss:  0.0945 - binary_crossentropy:  0.0944 - auc:  1.0000 - val_binary_crossentropy:  0.6666 - val_auc:  0.5781
Epoch 7/10
0s - loss:  0.0684 - binary_crossentropy:  0.0684 - auc:  0.9988 - val_binary_crossentropy:  0.6803 - v

In [17]:
pred_ans = model.predict(test_model_input, batch_size=256)

print(
    f"TEST: BCE Loss: {round(log_loss(test[target].values, pred_ans), 4)} | ROC AUC: {round(roc_auc_score(test[target].values, pred_ans), 4)} "
)

TEST: BCE Loss: 0.4666 | ROC AUC: 0.6623 
