In [None]:
# default_exp datasets.criteo

# Criteo
> Criteo dataset.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

## Criteo dataset and datamodule

In [None]:
#export
from recohut.datasets.bases.ctr import *
from recohut.utils.common_utils import download_url

import pandas as pd
import numpy as np
import os
from datetime import datetime, date

In [None]:
#export
class CriteoDataset(CTRDataset):
    """Criteo CTR dataset built from pre-split train/valid/test CSV files.

    Declares the column schema, the download URLs and the
    ``convert_to_bucket`` preprocessing function. How these are consumed is
    defined by ``CTRDataset`` (not visible here).
    """

    # I1..I13 are read as floats and name 'convert_to_bucket' as their
    # preprocess step (presumably resolved by the base class to the method
    # below -- confirm). C1..C26 are read as strings.
    feature_cols = [
        {
            'name': ['I{}'.format(idx) for idx in range(1, 14)],
            'active': True,
            'dtype': float,
            'type': 'categorical',
            'preprocess': 'convert_to_bucket',
            'na_value': 0,
        },
        {
            'name': ['C{}'.format(idx) for idx in range(1, 27)],
            'active': True,
            'dtype': str,
            'type': 'categorical',
            'na_value': "",
        },
    ]

    label_col = {'name': 'Label', 'dtype': float}

    train_url = "https://github.com/RecoHut-Datasets/criteo/raw/v2/train.csv"
    valid_url = "https://github.com/RecoHut-Datasets/criteo/raw/v2/valid.csv"
    test_url = "https://github.com/RecoHut-Datasets/criteo/raw/v2/test.csv"

    @property
    def raw_file_names(self):
        """File names of the three raw splits, in train/valid/test order."""
        return ['{}.csv'.format(split) for split in ('train', 'valid', 'test')]

    def download(self):
        """Download the train, valid and test CSV files into ``self.raw_dir``."""
        for split_url in (self.train_url, self.valid_url, self.test_url):
            download_url(split_url, self.raw_dir)

    def convert_to_bucket(self, df, col_name):
        """Discretise a numeric column into integer buckets.

        Values greater than 2 become ``floor(log(v) ** 2)``; all other values
        are truncated with ``int``.

        Args:
            df: DataFrame holding the column.
            col_name: name of the column to convert.

        Returns:
            The bucketed column as an integer Series.
        """
        def _bucket_of(raw):
            # Log-square bucketing only applies above 2; smaller values keep
            # their (truncated) magnitude.
            return int(np.floor(np.log(raw) ** 2)) if raw > 2 else int(raw)

        bucketed = df[col_name].map(_bucket_of)
        return bucketed.astype(int)

In [None]:
#export
class CriteoDataModule(CTRDataModule):
    """Data module for Criteo: a ``CTRDataModule`` whose ``dataset_cls`` is ``CriteoDataset``."""
    dataset_cls = CriteoDataset

Example

In [None]:
# Configuration dictionary that the next cell unpacks into CriteoDataModule.
# NOTE(review): it also carries model/optimizer settings (e.g. 'crossing_layers',
# 'learning_rate'); which keys the data module actually reads is not visible here.
params = {'model_id': 'DCN_demo',
              'data_dir': '/content/data',
              'model_root': './checkpoints/',
              'dnn_hidden_units': [64, 64],
              'dnn_activations': "relu",
              'crossing_layers': 3,
              'learning_rate': 1e-3,
              'net_dropout': 0,
              'batch_norm': False,
              'optimizer': 'adamw',
              'task': 'binary_classification',
              'loss': 'binary_crossentropy',
              'metrics': ['logloss', 'AUC'],
              'embedding_dim': 10,
              'batch_size': 64,
              'epochs': 3,
              'shuffle': True,
              'seed': 2019,
              'use_hdf5': True,
              'workers': 1,
              'verbose': 0}

In [None]:
!rm -r /content/data
ds = CriteoDataModule(**params)
ds.prepare_data()
ds.setup()

for batch in ds.train_dataloader():
    print(batch)
    break

  "DataModule property `train_transforms` was deprecated in v1.5 and will be removed in v1.7."
Downloading https://github.com/RecoHut-Datasets/criteo/raw/v2/train.csv
Downloading https://github.com/RecoHut-Datasets/criteo/raw/v2/valid.csv
Downloading https://github.com/RecoHut-Datasets/criteo/raw/v2/test.csv
Processing...
Done!


[tensor([[  4.,   1.,   2.,  ...,   2.,   0.,   0.],
        [  0.,  16.,   0.,  ...,  52.,   0.,   0.],
        [  0.,   2.,   0.,  ...,   5.,   0.,   0.],
        ...,
        [  0.,  22.,   4.,  ...,   1.,   0.,   0.],
        [  0.,   6.,   1.,  ...,   3.,   1., 202.],
        [  0.,   5.,   2.,  ...,  12.,   2.,   6.]], dtype=torch.float64), tensor([1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0.,
        1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1.,
        0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1.,
        1., 0., 0., 0., 0., 1., 0., 1., 1., 1.], dtype=torch.float64)]


In [None]:
from recohut.models.deepcrossing import DeepCrossing

# Hyper-parameters for the DeepCrossing demo. This rebinds the `params` name
# used earlier for the data module; `ds` below is the data module built with
# the earlier dictionary.
params = {'model_id': 'DeepCrossing',
              'data_dir': '/content/data',
              'model_root': './checkpoints/',
              'dnn_hidden_units': [64, 64],
              'dnn_activations': "relu",
              'learning_rate': 1e-3,
              'net_dropout': 0,
              'batch_norm': False,
              'optimizer': 'adamw',
              'use_residual': True,
              'residual_blocks': [500, 500, 500],
              'task': 'binary_classification',
              'loss': 'binary_crossentropy',
              'metrics': ['logloss', 'AUC'],
              'embedding_dim': 10,
              'batch_size': 64,
              'epochs': 3,
              'shuffle': True,
              'seed': 2019,
              'use_hdf5': True,
              'workers': 1,
              'verbose': 0}

# The model is built from the feature map exposed by the data module's dataset.
model = DeepCrossing(ds.dataset.feature_map, **params)

In [None]:
from recohut.trainers.pl_trainer import pl_trainer

# NOTE(review): max_epochs=5 here while params['epochs'] is 3 -- confirm which value is intended.
pl_trainer(model, ds, max_epochs=5)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"DataModule.{name} has already been called, so it will not be called again. "

  | Name              | Type           | Params
-----------------------------------------------------
0 | embedding_layer   | EmbeddingLayer | 136 K 
1 | crossing_layer    | Sequential     | 1.2 M 
2 | output_activation | Sigmoid        | 0     
-----------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.238     Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

[Metrics] logloss: 1.923407 - AUC: 0.512695
[Metrics] logloss: 2.335812 - AUC: 0.374510
[Metrics] logloss: 1.951746 - AUC: 0.510511
[Metrics] logloss: 1.656238 - AUC: 0.495951
[Metrics] logloss: 1.599615 - AUC: 0.457490
[Metrics] logloss: 2.074813 - AUC: 0.432063
[Metrics] logloss: 1.315625 - AUC: 0.600586
[Metrics] logloss: 1.596338 - AUC: 0.548039
[Metrics] logloss: 1.768354 - AUC: 0.580296
[Metrics] logloss: 2.074194 - AUC: 0.433040
[Metrics] logloss: 1.842569 - AUC: 0.550342
[Metrics] logloss: 2.395853 - AUC: 0.432617
[Metrics] logloss: 1.774030 - AUC: 0.462564
[Metrics] logloss: 1.734166 - AUC: 0.557185
[Metrics] logloss: 1.963507 - AUC: 0.422287
[Metrics] logloss: 1.737491 - AUC: 0.504433
[Metrics] logloss: 1.519305 - AUC: 0.607843
[Metrics] logloss: 1.669249 - AUC: 0.491903
[Metrics] logloss: 1.663352 - AUC: 0.552941
[Metrics] logloss: 1.668330 - AUC: 0.539216
[Metrics] logloss: 2.353048 - AUC: 0.385142
[Metrics] logloss: 1.886832 - AUC: 0.502930
[Metrics] logloss: 1.787228 - AU

  f"DataModule.{name} has already been called, so it will not be called again. "


Testing: 0it [00:00, ?it/s]

[Metrics] logloss: 1.830594 - AUC: 0.558162
[Metrics] logloss: 2.161489 - AUC: 0.407738
[Metrics] logloss: 1.679642 - AUC: 0.486275
[Metrics] logloss: 1.562141 - AUC: 0.551320
[Metrics] logloss: 1.594444 - AUC: 0.567460
[Metrics] logloss: 1.626753 - AUC: 0.592118
[Metrics] logloss: 1.529520 - AUC: 0.557635
[Metrics] logloss: 1.329303 - AUC: 0.599600
[Metrics] logloss: 2.305318 - AUC: 0.466010
[Metrics] logloss: 1.942118 - AUC: 0.477539
[Metrics] logloss: 2.328542 - AUC: 0.436453
[Metrics] logloss: 2.221586 - AUC: 0.467647
[Metrics] logloss: 1.445153 - AUC: 0.582353
[Metrics] logloss: 1.402739 - AUC: 0.618768
[Metrics] logloss: 2.210657 - AUC: 0.413086
[Metrics] logloss: 1.112972 - AUC: 0.677734
[Metrics] logloss: 1.882559 - AUC: 0.485826
[Metrics] logloss: 1.917402 - AUC: 0.472656
[Metrics] logloss: 2.366392 - AUC: 0.401760
[Metrics] logloss: 1.604059 - AUC: 0.507843
[Metrics] logloss: 2.733787 - AUC: 0.312831
[Metrics] logloss: 1.982845 - AUC: 0.459113
[Metrics] logloss: 2.498603 - AU

[{'Test Metrics': {'AUC': tensor(0.5042), 'logloss': tensor(1.8953)}}]

## Criteo sample dataset

In [None]:
#export
import math
import shutil
import struct
from collections import defaultdict
from functools import lru_cache
from pathlib import Path

import lmdb
import numpy as np
import pandas as pd
import torch.utils.data
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, TensorDataset

from recohut.datasets.bases.common import Dataset
from recohut.utils.common_utils import download_url

In [None]:
#export
class CriteoSampleDataset(Dataset):
    """Criteo sample dataset (``dac_sample.txt``) stored as train/test ``TensorDataset`` files.

    ``process`` reads the tab-separated raw file, label-encodes the sparse
    columns C1..C26, min-max scales the dense columns I1..I13 to [0, 1],
    splits the rows into train and test parts and saves both with
    ``torch.save``.

    After construction the instance exposes:
        * ``feat_sizes`` -- dict mapping every feature name to 1 (dense) or to
          its number of distinct values (sparse);
        * ``dnn_feature_columns`` / ``linear_feature_columns`` -- list of
          ``(name, 'sparse' | 'dense')`` tuples, sparse features first.

    Args:
        root: dataset root directory, forwarded to the base ``Dataset``.
        test_size: ``test_size`` handed to ``train_test_split``.
        random_seed: ``random_state`` handed to ``train_test_split``.

    Reference:
        1. https://github.com/huangjunheng/recommendation_model/blob/master/DCN/dcn.py
    """
    url = 'https://github.com/RecoHut-Datasets/criteo/raw/v1/dac_sample.txt'

    # Side-car file (stored next to the processed tensors) holding the
    # feature metadata computed by ``process``.
    metadata_file_name = 'meta.pt'

    def __init__(self, root, test_size=0.2, random_seed=42):
        super().__init__(root)
        self.test_size = test_size
        self.random_seed = random_seed

        self._process()

        # NOTE(review): presumably the base class skips ``process()`` when the
        # processed files already exist -- confirm. In that case the feature
        # metadata was never computed in this session and must be restored.
        if not hasattr(self, 'feat_sizes'):
            self._restore_metadata()

    @property
    def raw_file_names(self) -> str:
        """Name of the raw file."""
        return 'dac_sample.txt'

    @property
    def processed_file_names(self) -> list:
        """Names of the processed train and test files, in that order."""
        return ['train.pt', 'test.pt']

    def download(self):
        """Download the raw sample file into ``self.raw_dir``."""
        download_url(self.url, self.raw_dir)

    def _metadata_path(self):
        """Path of the metadata side-car file, next to the first processed file."""
        return Path(self.processed_paths[0]).with_name(self.metadata_file_name)

    def _save_metadata(self):
        """Persist the feature metadata so a later session can restore it."""
        metadata = {'feat_sizes': self.feat_sizes,
                    'dnn_feature_columns': self.dnn_feature_columns,
                    'linear_feature_columns': self.linear_feature_columns}
        torch.save(metadata, str(self._metadata_path()))

    def _restore_metadata(self):
        """Load the feature metadata; rebuild everything if the side-car file is missing."""
        metadata_path = self._metadata_path()
        if not metadata_path.exists():
            # Processed files written before the side-car existed: re-running
            # the processing regenerates both the tensors and the metadata.
            self.process()
            return
        metadata = torch.load(str(metadata_path))
        self.feat_sizes = metadata['feat_sizes']
        self.dnn_feature_columns = metadata['dnn_feature_columns']
        self.linear_feature_columns = metadata['linear_feature_columns']

    @staticmethod
    def _to_tensor_dataset(frame):
        """Split ``frame`` into features and an (n, 1) label column and wrap both as tensors."""
        labels = frame[['label']].to_numpy()
        features = frame.drop(columns=['label']).to_numpy()
        return TensorDataset(torch.from_numpy(features), torch.from_numpy(labels))

    def process(self):
        """Encode, scale and split the raw file, then save tensors and metadata."""
        sparse_feature = ['C' + str(i) for i in range(1, 27)]
        dense_feature = ['I' + str(i) for i in range(1, 14)]
        col_names = ['label'] + dense_feature + sparse_feature
        data = pd.read_csv(self.raw_paths[0], names=col_names, sep='\t')

        data[sparse_feature] = data[sparse_feature].fillna('-1')
        # Fill with a number (not the string '0') so the dense columns stay numeric.
        data[dense_feature] = data[dense_feature].fillna(0)

        # Dense features occupy one slot each; sparse features one per distinct value.
        feat_sizes = {feat: 1 for feat in dense_feature}
        feat_sizes.update({feat: data[feat].nunique() for feat in sparse_feature})
        self.feat_sizes = feat_sizes

        from sklearn.preprocessing import LabelEncoder, MinMaxScaler
        for feat in sparse_feature:
            data[feat] = LabelEncoder().fit_transform(data[feat])
        scaler = MinMaxScaler(feature_range=(0, 1))
        data[dense_feature] = scaler.fit_transform(data[dense_feature])

        fixlen_feature_columns = ([(feat, 'sparse') for feat in sparse_feature]
                                  + [(feat, 'dense') for feat in dense_feature])
        self.dnn_feature_columns = fixlen_feature_columns
        self.linear_feature_columns = fixlen_feature_columns

        from sklearn.model_selection import train_test_split
        train, test = train_test_split(data, test_size=self.test_size, random_state=self.random_seed)

        torch.save(self._to_tensor_dataset(train), self.processed_paths[0])
        torch.save(self._to_tensor_dataset(test), self.processed_paths[1])
        self._save_metadata()

    def load(self):
        """Return the saved ``(train, test)`` tensor datasets."""
        train_tensor_data = torch.load(self.processed_paths[0])
        test_tensor_data = torch.load(self.processed_paths[1])
        return train_tensor_data, test_tensor_data

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import log_loss, roc_auc_score

from recohut.models.dcn import DCNv2 as DCN


def get_auc(loader, model):
    """Compute the ROC-AUC of ``model`` over all batches of ``loader``.

    The model is switched to eval mode and gradients are disabled. Batches are
    moved to the notebook-level ``device`` and cast to float before the
    forward pass.

    :param loader: iterable yielding ``(x, y)`` tensor pairs
    :param model: callable mapping a feature batch to predictions
    :return: value returned by ``roc_auc_score(targets, predictions)``
    """
    model.eval()
    scores, labels = [], []
    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device).float()
            targets = targets.to(device).float()
            scores.extend(model(features).cpu().numpy())
            labels.extend(targets.cpu().numpy())
    return roc_auc_score(labels, scores)


# ---- configuration -------------------------------------------------------
root = '/content/data'
batch_size = 1024
lr = 1e-2
wd = 1e-3
epoches = 20
seed = 2022
embedding_size = 4
device = 'cpu'

# Apply the seed (it was previously defined but never used) so that weight
# initialisation and DataLoader shuffling are repeatable across runs.
torch.manual_seed(seed)
np.random.seed(seed)

# ---- data ----------------------------------------------------------------
ds = CriteoSampleDataset(root=root)
train_tensor_data, test_tensor_data = ds.load()
train_loader = DataLoader(train_tensor_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_tensor_data, batch_size=batch_size)

# ---- model, loss, optimizer ------------------------------------------------
model = DCN(ds.feat_sizes, embedding_size, ds.linear_feature_columns, ds.dnn_feature_columns).to(device)
loss_func = nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

# ---- training loop: one pass over the train set, then test AUC, per epoch --
for epoch in range(epoches):
    total_loss_epoch = 0.0
    model.train()  # get_auc() leaves the model in eval mode
    for x, y in train_loader:
        x, y = x.to(device).float(), y.to(device).float()
        y_hat = model(x)
        optimizer.zero_grad()
        loss = loss_func(y_hat, y)
        loss.backward()
        optimizer.step()
        total_loss_epoch += loss.item()
    auc = get_auc(test_loader, model)
    # The reported train loss is the mean of the per-batch mean losses.
    print('epoch/epoches: {}/{}, train loss: {:.3f}, test auc: {:.3f}'.format(epoch, epoches, total_loss_epoch / len(train_loader), auc))

Processing...
Done!


epoch/epoches: 0/20, train loss: 0.536, test auc: 0.679
epoch/epoches: 1/20, train loss: 0.499, test auc: 0.702
epoch/epoches: 2/20, train loss: 0.486, test auc: 0.722
epoch/epoches: 3/20, train loss: 0.477, test auc: 0.740
epoch/epoches: 4/20, train loss: 0.468, test auc: 0.745
epoch/epoches: 5/20, train loss: 0.463, test auc: 0.752
epoch/epoches: 6/20, train loss: 0.460, test auc: 0.757
epoch/epoches: 7/20, train loss: 0.458, test auc: 0.753
epoch/epoches: 8/20, train loss: 0.455, test auc: 0.758
epoch/epoches: 9/20, train loss: 0.452, test auc: 0.759
epoch/epoches: 10/20, train loss: 0.450, test auc: 0.757
epoch/epoches: 11/20, train loss: 0.449, test auc: 0.758
epoch/epoches: 12/20, train loss: 0.444, test auc: 0.758
epoch/epoches: 13/20, train loss: 0.438, test auc: 0.757
epoch/epoches: 14/20, train loss: 0.428, test auc: 0.753
epoch/epoches: 15/20, train loss: 0.413, test auc: 0.750
epoch/epoches: 16/20, train loss: 0.395, test auc: 0.744
epoch/epoches: 17/20, train loss: 0.381, 

## Criteo dataset

In [None]:
# #export
# class CriteoDataset(torch.utils.data.Dataset):
#     """
#     Criteo Display Advertising Challenge Dataset
#     Data preparation:
#         * Remove the infrequent features (appearing in less than threshold instances) and treat them as a single feature
#         * Discretize numerical values by log2 transformation which is proposed by the winner of Criteo Competition
#     :param dataset_path: criteo train.txt path.
#     :param cache_path: lmdb cache path.
#     :param rebuild_cache: If True, lmdb cache is refreshed.
#     :param min_threshold: infrequent feature threshold.
#     Reference:
#         https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
#         https://www.csie.ntu.edu.tw/~r01922136/kaggle-2014-criteo.pdf
#     """

#     def __init__(self, dataset_path=None, cache_path='.criteo', rebuild_cache=False, min_threshold=10):
#         self.NUM_FEATS = 39
#         self.NUM_INT_FEATS = 13
#         self.min_threshold = min_threshold
#         if rebuild_cache or not Path(cache_path).exists():
#             shutil.rmtree(cache_path, ignore_errors=True)
#             if dataset_path is None:
#                 raise ValueError('create cache: failed: dataset_path is None')
#             self.__build_cache(dataset_path, cache_path)
#         self.env = lmdb.open(cache_path, create=False, lock=False, readonly=True)
#         with self.env.begin(write=False) as txn:
#             self.length = txn.stat()['entries'] - 1
#             self.field_dims = np.frombuffer(txn.get(b'field_dims'), dtype=np.uint32)

#     def __getitem__(self, index):
#         with self.env.begin(write=False) as txn:
#             np_array = np.frombuffer(
#                 txn.get(struct.pack('>I', index)), dtype=np.uint32).astype(dtype=np.long)
#         return np_array[1:], np_array[0]

#     def __len__(self):
#         return self.length

#     def __build_cache(self, path, cache_path):
#         feat_mapper, defaults = self.__get_feat_mapper(path)
#         with lmdb.open(cache_path, map_size=int(1e11)) as env:
#             field_dims = np.zeros(self.NUM_FEATS, dtype=np.uint32)
#             for i, fm in feat_mapper.items():
#                 field_dims[i - 1] = len(fm) + 1
#             with env.begin(write=True) as txn:
#                 txn.put(b'field_dims', field_dims.tobytes())
#             for buffer in self.__yield_buffer(path, feat_mapper, defaults):
#                 with env.begin(write=True) as txn:
#                     for key, value in buffer:
#                         txn.put(key, value)

#     def __get_feat_mapper(self, path):
#         feat_cnts = defaultdict(lambda: defaultdict(int))
#         with open(path) as f:
#             pbar = tqdm(f, mininterval=1, smoothing=0.1)
#             pbar.set_description('Create criteo dataset cache: counting features')
#             for line in pbar:
#                 values = line.rstrip('\n').split('\t')
#                 if len(values) != self.NUM_FEATS + 1:
#                     continue
#                 for i in range(1, self.NUM_INT_FEATS + 1):
#                     feat_cnts[i][convert_numeric_feature(values[i])] += 1
#                 for i in range(self.NUM_INT_FEATS + 1, self.NUM_FEATS + 1):
#                     feat_cnts[i][values[i]] += 1
#         feat_mapper = {i: {feat for feat, c in cnt.items() if c >= self.min_threshold} for i, cnt in feat_cnts.items()}
#         feat_mapper = {i: {feat: idx for idx, feat in enumerate(cnt)} for i, cnt in feat_mapper.items()}
#         defaults = {i: len(cnt) for i, cnt in feat_mapper.items()}
#         return feat_mapper, defaults

#     def __yield_buffer(self, path, feat_mapper, defaults, buffer_size=int(1e5)):
#         item_idx = 0
#         buffer = list()
#         with open(path) as f:
#             pbar = tqdm(f, mininterval=1, smoothing=0.1)
#             pbar.set_description('Create criteo dataset cache: setup lmdb')
#             for line in pbar:
#                 values = line.rstrip('\n').split('\t')
#                 if len(values) != self.NUM_FEATS + 1:
#                     continue
#                 np_array = np.zeros(self.NUM_FEATS + 1, dtype=np.uint32)
#                 np_array[0] = int(values[0])
#                 for i in range(1, self.NUM_INT_FEATS + 1):
#                     np_array[i] = feat_mapper[i].get(convert_numeric_feature(values[i]), defaults[i])
#                 for i in range(self.NUM_INT_FEATS + 1, self.NUM_FEATS + 1):
#                     np_array[i] = feat_mapper[i].get(values[i], defaults[i])
#                 buffer.append((struct.pack('>I', item_idx), np_array.tobytes()))
#                 item_idx += 1
#                 if item_idx % buffer_size == 0:
#                     yield buffer
#                     buffer.clear()
#             yield buffer


# @lru_cache(maxsize=None)
# def convert_numeric_feature(val: str):
#     if val == '':
#         return 'NULL'
#     v = int(val)
#     if v > 2:
#         return str(int(math.log(v) ** 2))
#     else:
#         return str(v - 2)

## Criteo Dataset Transformation
> Implementation of transformation functions specific to the Criteo display-advertising dataset.

In [None]:
#export
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.model_selection import train_test_split

In [None]:
#export
def sparseFeature(feat, feat_num, embed_dim=4):
    """
    Build the descriptor dictionary of a sparse feature.
    :param feat: feature name
    :param feat_num: number of distinct values of the feature
    :param embed_dim: embedding dimension
    :return: dict with keys 'feat_name', 'feat_num' and 'embed_dim'
    """
    descriptor = dict(feat_name=feat, feat_num=feat_num, embed_dim=embed_dim)
    return descriptor


def denseFeature(feat):
    """
    Build the descriptor dictionary of a dense feature.
    :param feat: dense feature name
    :return: dict with the single key 'feat_name'
    """
    descriptor = dict(feat_name=feat)
    return descriptor

In [None]:
#export
def create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2,
                          random_state=None, n_bins=100):
    """
    An example of creating the Criteo dataset.

    Missing sparse values are filled with '-1' and missing dense values with 0.
    The dense columns are then discretised into ``n_bins`` uniform-width
    ordinal bins and the sparse columns are label-encoded, so every feature
    ends up as an integer id described by a ``sparseFeature`` entry.

    :param file: dataset's path (tab-separated, no header row)
    :param embed_dim: the embedding dimension of sparse features
    :param read_part: whether to read only the first ``sample_num`` rows
    :param sample_num: the number of instances if read_part is True
    :param test_size: ratio of test dataset
    :param random_state: seed forwarded to ``train_test_split``; the default
        ``None`` keeps the split unseeded
    :param n_bins: number of bins used to discretise each dense feature
    :return: feature columns, (train_X, train_y), (test_X, test_y)
    """
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]
    features = sparse_features + dense_features
    # Column order of the raw file: label, I1..I13, C1..C26.
    names = ['label'] + dense_features + sparse_features

    # `nrows` reads just the requested prefix and closes the file, whereas an
    # iterator reader would stay open after `get_chunk`.
    data_df = pd.read_csv(file, sep='\t', header=None, names=names,
                          nrows=sample_num if read_part else None)

    data_df[sparse_features] = data_df[sparse_features].fillna('-1')
    data_df[dense_features] = data_df[dense_features].fillna(0)

    # Bin continuous data into intervals.
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    data_df[dense_features] = est.fit_transform(data_df[dense_features])

    for feat in sparse_features:
        data_df[feat] = LabelEncoder().fit_transform(data_df[feat])

    # Ids are zero-based, so the vocabulary size of a feature is max id + 1.
    feature_columns = [sparseFeature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim)
                       for feat in features]
    train, test = train_test_split(data_df, test_size=test_size, random_state=random_state)

    train_X = train[features].values.astype('int32')
    train_y = train['label'].values.astype('int32')
    test_X = test[features].values.astype('int32')
    test_y = test['label'].values.astype('int32')

    return feature_columns, (train_X, train_y), (test_X, test_y)

In [None]:
# !pip install -q -U kaggle
# !pip install --upgrade --force-reinstall --no-deps kaggle
# !mkdir ~/.kaggle
# !cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets download -d mrkmakr/criteo-dataset
# !unzip criteo-dataset.zip

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[?25l[K     |█████▋                          | 10 kB 19.0 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 20.9 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 14.2 MB/s eta 0:00:01[K     |██████████████████████▎         | 40 kB 10.5 MB/s eta 0:00:01[K     |███████████████████████████▉    | 51 kB 4.5 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 2.7 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=a09d2576937c68b6341e6bce9eeefa020563e125d97e69548f4d591568008b5f
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Un

In [None]:
# Build a small train/test split from the first `sample_num` rows of the raw file.
# NOTE(review): 'dac/train.txt' is a relative path; presumably it is produced by the
# (commented-out) Kaggle download cell above -- confirm before running.
file = 'dac/train.txt'
read_part = True
sample_num = 10000
test_size = 0.2

feature_columns, train, test = create_criteo_dataset(file=file,
                                        read_part=read_part,
                                        sample_num=sample_num,
                                        test_size=test_size)

In [None]:
feature_columns

[{'embed_dim': 8, 'feat_name': 'C1', 'feat_num': 175},
 {'embed_dim': 8, 'feat_name': 'C2', 'feat_num': 386},
 {'embed_dim': 8, 'feat_name': 'C3', 'feat_num': 5521},
 {'embed_dim': 8, 'feat_name': 'C4', 'feat_num': 4033},
 {'embed_dim': 8, 'feat_name': 'C5', 'feat_num': 56},
 {'embed_dim': 8, 'feat_name': 'C6', 'feat_num': 8},
 {'embed_dim': 8, 'feat_name': 'C7', 'feat_num': 3184},
 {'embed_dim': 8, 'feat_name': 'C8', 'feat_num': 93},
 {'embed_dim': 8, 'feat_name': 'C9', 'feat_num': 3},
 {'embed_dim': 8, 'feat_name': 'C10', 'feat_num': 2986},
 {'embed_dim': 8, 'feat_name': 'C11', 'feat_num': 2084},
 {'embed_dim': 8, 'feat_name': 'C12', 'feat_num': 5284},
 {'embed_dim': 8, 'feat_name': 'C13', 'feat_num': 1725},
 {'embed_dim': 8, 'feat_name': 'C14', 'feat_num': 24},
 {'embed_dim': 8, 'feat_name': 'C15', 'feat_num': 2035},
 {'embed_dim': 8, 'feat_name': 'C16', 'feat_num': 4724},
 {'embed_dim': 8, 'feat_name': 'C17', 'feat_num': 9},
 {'embed_dim': 8, 'feat_name': 'C18', 'feat_num': 1149},


In [None]:
train

(array([[   1,  293, 2491, ...,    0,    0,    1],
        [   1,   88,    0, ...,    1,    0,    1],
        [   1,   17, 5197, ...,    1,    0,    0],
        ...,
        [   1,  355, 4284, ...,    3,    0,    0],
        [   1,  192,   56, ...,    1,    0,    0],
        [  75,   18, 2613, ...,    3,    0,    0]], dtype=int32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int32))

In [None]:
test

(array([[ 111,  105,  695, ...,    3,    0,    0],
        [ 102,  337, 2613, ...,    0,    0,    1],
        [  75,  301,  155, ...,    1,    0,    0],
        ...,
        [  75,   86,  507, ...,    1,    1,    1],
        [   1,  347, 2205, ...,    2,    1,    1],
        [ 102,  125,    5, ...,    1,    1,    0]], dtype=int32),
 array([1, 0, 1, ..., 0, 0, 0], dtype=int32))

> **References:-**
- https://github.com/rixwew/pytorch-fm/blob/master/torchfm/dataset/criteo.py

In [None]:
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-07 08:45:18

recohut: 0.0.9

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

matplotlib: 3.2.2
pandas    : 1.1.5
lmdb      : 0.99
PIL       : 7.1.2
IPython   : 5.5.0
numpy     : 1.19.5
torch     : 1.10.0+cu111

