In [None]:
# default_exp datasets.avazu

# Avazu
> Avazu click-through-rate (CTR) prediction dataset and its data module.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from recohut.datasets.bases.ctr import *
from recohut.utils.common_utils import download_url

from datetime import date

In [None]:
#export
class AvazuDataset(CTRDataset):
    """Avazu click-through-rate dataset described on top of ``CTRDataset``.

    The class only declares the column schema, the download locations of the
    three raw CSV splits and the column-level preprocessing callbacks; the
    loading/encoding machinery is inherited from ``CTRDataset``.

    The raw ``hour`` column is sliced as an 8-character ``YYMMDDHH`` string:
    characters 0-1 are the two-digit year, 2-3 the month, 4-5 the day and
    6-7 the hour of day.

    NOTE(review): ``convert_weekday`` and ``convert_weekend`` read the *raw*
    ``hour`` string. If the base class writes the result of ``convert_hour``
    back into ``df['hour']`` before calling them, the slicing below would
    fail -- confirm the preprocessing order in ``CTRDataset``.
    """

    # Column schema. The 'preprocess' values match method names defined on this
    # class (presumably resolved by name inside CTRDataset -- TODO confirm).
    # 'weekday' and 'weekend' are derived from 'hour' by their callbacks.
    feature_cols = [
        {'name': 'id', 'active': False, 'dtype': 'str', 'type': 'categorical'},
        {'name': 'hour', 'active': True, 'dtype': 'str', 'type': 'categorical', 'preprocess': 'convert_hour'},
        {'name': ['C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain',
                  'app_category', 'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
                  'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
         'active': True, 'dtype': 'str', 'type': 'categorical'},
        {'name': 'weekday', 'active': True, 'dtype': 'str', 'type': 'categorical', 'preprocess': 'convert_weekday'},
        {'name': 'weekend', 'active': True, 'dtype': 'str', 'type': 'categorical', 'preprocess': 'convert_weekend'},
    ]

    # Target column.
    label_col = {'name': 'click', 'dtype': float}

    # Locations of the raw train/validation/test splits.
    train_url = "https://github.com/RecoHut-Datasets/avazu/raw/v1/train.csv"
    valid_url = "https://github.com/RecoHut-Datasets/avazu/raw/v1/valid.csv"
    test_url = "https://github.com/RecoHut-Datasets/avazu/raw/v1/test.csv"

    @property
    def raw_file_names(self):
        """Return the file names of the three raw CSV splits."""
        return ['train.csv', 'valid.csv', 'test.csv']

    def download(self):
        """Download the train, validation and test CSV files into ``self.raw_dir``."""
        for url in (self.train_url, self.valid_url, self.test_url):
            download_url(url, self.raw_dir)

    @staticmethod
    def _parse_date(timestamp):
        """Build a ``datetime.date`` from the ``YYMMDD`` prefix of a timestamp string."""
        return date(int('20' + timestamp[0:2]), int(timestamp[2:4]), int(timestamp[4:6]))

    def convert_weekday(self, df, col_name):
        """Derive the day of week from the ``hour`` column.

        Args:
            df: frame holding the raw ``hour`` strings.
            col_name: unused; the values are always read from ``df['hour']``.

        Returns:
            The result of ``df['hour'].apply(...)`` with integers in 0..6,
            where 0 is Sunday and 6 is Saturday (same convention as
            ``strftime('%w')``).
        """
        # isoweekday() is Monday=1..Sunday=7; modulo 7 maps Sunday to 0.
        return df['hour'].apply(lambda ts: self._parse_date(ts).isoweekday() % 7)

    def convert_weekend(self, df, col_name):
        """Derive a weekend indicator from the ``hour`` column.

        Args:
            df: frame holding the raw ``hour`` strings.
            col_name: unused; the values are always read from ``df['hour']``.

        Returns:
            The result of ``df['hour'].apply(...)`` with 1 for Saturday or
            Sunday and 0 otherwise.
        """
        # ISO weekdays 6 and 7 are Saturday and Sunday.
        return df['hour'].apply(lambda ts: 1 if self._parse_date(ts).isoweekday() >= 6 else 0)

    def convert_hour(self, df, col_name):
        """Extract the hour of day (characters 6-7) from the ``hour`` column as an int.

        Args:
            df: frame holding the raw ``hour`` strings.
            col_name: unused; the values are always read from ``df['hour']``.
        """
        return df['hour'].apply(lambda x: int(x[6:8]))

In [None]:
#export
class AvazuDataModule(CTRDataModule):
    """Data module for Avazu: a ``CTRDataModule`` bound to ``AvazuDataset``."""
    # Dataset class handed to the inherited CTRDataModule logic
    # (presumably instantiated there -- TODO confirm against CTRDataModule).
    dataset_cls = AvazuDataset

## Example

In [None]:
# Settings forwarded to the data module in the next cell via AvazuDataModule(**params).
params = dict(
    model_id='DCN_demo',
    data_dir='/content/data',
    model_root='./checkpoints/',
    dnn_hidden_units=[64, 64],
    dnn_activations="relu",
    crossing_layers=3,
    learning_rate=1e-3,
    net_dropout=0,
    batch_norm=False,
    optimizer='adamw',
    task='binary_classification',
    loss='binary_crossentropy',
    metrics=['logloss', 'AUC'],
    embedding_dim=10,
    batch_size=64,
    epochs=3,
    shuffle=True,
    seed=2019,
    use_hdf5=True,
    workers=1,
    verbose=0,
)

In [None]:
import shutil

# Start from a clean data directory. ignore_errors=True keeps a fresh kernel
# (where the directory does not exist yet) from reporting an error, and the
# path is taken from params so it cannot drift from the data module's setting.
shutil.rmtree(params['data_dir'], ignore_errors=True)

ds = AvazuDataModule(**params)
ds.prepare_data()
ds.setup()

# Show a single training batch as a quick sanity check, then stop iterating.
for batch in ds.train_dataloader():
    print(batch)
    break

  "DataModule property `train_transforms` was deprecated in v1.5 and will be removed in v1.7."
Downloading https://github.com/RecoHut-Datasets/avazu/raw/v1/train.csv
Downloading https://github.com/RecoHut-Datasets/avazu/raw/v1/valid.csv
Downloading https://github.com/RecoHut-Datasets/avazu/raw/v1/test.csv
Processing...
Done!


[tensor([[1., 1., 2.,  ..., 6., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 2.,  ..., 2., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 4., 1., 1.]], dtype=torch.float64), tensor([0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], dtype=torch.float64)]


In [None]:
from recohut.models.deepcrossing import DeepCrossing

# Settings forwarded to the DeepCrossing model constructor below via **params.
params = dict(
    model_id='DeepCrossing',
    data_dir='/content/data',
    model_root='./checkpoints/',
    dnn_hidden_units=[64, 64],
    dnn_activations="relu",
    learning_rate=1e-3,
    net_dropout=0,
    batch_norm=False,
    optimizer='adamw',
    use_residual=True,
    residual_blocks=[500, 500, 500],
    task='binary_classification',
    loss='binary_crossentropy',
    metrics=['logloss', 'AUC'],
    embedding_dim=10,
    batch_size=64,
    epochs=3,
    shuffle=True,
    seed=2019,
    use_hdf5=True,
    workers=1,
    verbose=0,
)

# Build the DeepCrossing model from the prepared dataset's feature map and the settings in params.
model = DeepCrossing(ds.dataset.feature_map, **params)

In [None]:
from recohut.trainers.pl_trainer import pl_trainer

# Run the model on the Avazu data module; the recorded output below shows
# training, validation and test passes and a final 'Test Metrics' result.
# NOTE(review): params['epochs'] is 3 while max_epochs=5 is passed here -- confirm which is intended.
# NOTE(review): the recorded run reports AUC 1.0 and logloss ~1e-7, which is
# suspiciously perfect -- check the sample data and features for label leakage.
pl_trainer(model, ds, max_epochs=5)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  f"DataModule.{name} has already been called, so it will not be called again. "

  | Name              | Type           | Params
-----------------------------------------------------
0 | embedding_layer   | EmbeddingLayer | 12.6 K
1 | crossing_layer    | Sequential     | 722 K 
2 | output_activation | Sigmoid        | 0     
-----------------------------------------------------
735 K     Trainable params
0         Non-trainable params
735 K     Total params
2.940     Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AU

  f"DataModule.{name} has already been called, so it will not be called again. "


Testing: 0it [00:00, ?it/s]

[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AUC: 1.000000
[Metrics] logloss: 0.000000 - AU

[{'Test Metrics': {'AUC': tensor(1.), 'logloss': tensor(1.0680e-07)}}]

In [None]:
# #export
# import shutil
# import struct
# from collections import defaultdict
# from pathlib import Path

# import lmdb
# import numpy as np
# import torch.utils.data
# from tqdm import tqdm

In [None]:
# #export
# class AvazuDataset(torch.utils.data.Dataset):
#     """
#     Avazu Click-Through Rate Prediction Dataset
#     Dataset preparation
#         Remove the infrequent features (appearing in less than threshold instances) and treat them as a single feature
#     :param dataset_path: avazu train path
#     :param cache_path: lmdb cache path
#     :param rebuild_cache: If True, lmdb cache is refreshed
#     :param min_threshold: infrequent feature threshold
#     Reference
#         https://www.kaggle.com/c/avazu-ctr-prediction
#     """

#     def __init__(self, dataset_path=None, cache_path='.avazu', rebuild_cache=False, min_threshold=4):
#         self.NUM_FEATS = 22
#         self.min_threshold = min_threshold
#         if rebuild_cache or not Path(cache_path).exists():
#             shutil.rmtree(cache_path, ignore_errors=True)
#             if dataset_path is None:
#                 raise ValueError('create cache: failed: dataset_path is None')
#             self.__build_cache(dataset_path, cache_path)
#         self.env = lmdb.open(cache_path, create=False, lock=False, readonly=True)
#         with self.env.begin(write=False) as txn:
#             self.length = txn.stat()['entries'] - 1
#             self.field_dims = np.frombuffer(txn.get(b'field_dims'), dtype=np.uint32)

#     def __getitem__(self, index):
#         with self.env.begin(write=False) as txn:
#             np_array = np.frombuffer(
#                 txn.get(struct.pack('>I', index)), dtype=np.uint32).astype(dtype=np.long)
#         return np_array[1:], np_array[0]

#     def __len__(self):
#         return self.length

#     def __build_cache(self, path, cache_path):
#         feat_mapper, defaults = self.__get_feat_mapper(path)
#         with lmdb.open(cache_path, map_size=int(1e11)) as env:
#             field_dims = np.zeros(self.NUM_FEATS, dtype=np.uint32)
#             for i, fm in feat_mapper.items():
#                 field_dims[i - 1] = len(fm) + 1
#             with env.begin(write=True) as txn:
#                 txn.put(b'field_dims', field_dims.tobytes())
#             for buffer in self.__yield_buffer(path, feat_mapper, defaults):
#                 with env.begin(write=True) as txn:
#                     for key, value in buffer:
#                         txn.put(key, value)

#     def __get_feat_mapper(self, path):
#         feat_cnts = defaultdict(lambda: defaultdict(int))
#         with open(path) as f:
#             f.readline()
#             pbar = tqdm(f, mininterval=1, smoothing=0.1)
#             pbar.set_description('Create avazu dataset cache: counting features')
#             for line in pbar:
#                 values = line.rstrip('\n').split(',')
#                 if len(values) != self.NUM_FEATS + 2:
#                     continue
#                 for i in range(1, self.NUM_FEATS + 1):
#                     feat_cnts[i][values[i + 1]] += 1
#         feat_mapper = {i: {feat for feat, c in cnt.items() if c >= self.min_threshold} for i, cnt in feat_cnts.items()}
#         feat_mapper = {i: {feat: idx for idx, feat in enumerate(cnt)} for i, cnt in feat_mapper.items()}
#         defaults = {i: len(cnt) for i, cnt in feat_mapper.items()}
#         return feat_mapper, defaults

#     def __yield_buffer(self, path, feat_mapper, defaults, buffer_size=int(1e5)):
#         item_idx = 0
#         buffer = list()
#         with open(path) as f:
#             f.readline()
#             pbar = tqdm(f, mininterval=1, smoothing=0.1)
#             pbar.set_description('Create avazu dataset cache: setup lmdb')
#             for line in pbar:
#                 values = line.rstrip('\n').split(',')
#                 if len(values) != self.NUM_FEATS + 2:
#                     continue
#                 np_array = np.zeros(self.NUM_FEATS + 1, dtype=np.uint32)
#                 np_array[0] = int(values[1])
#                 for i in range(1, self.NUM_FEATS + 1):
#                     np_array[i] = feat_mapper[i].get(values[i+1], defaults[i])
#                 buffer.append((struct.pack('>I', item_idx), np_array.tobytes()))
#                 item_idx += 1
#                 if item_idx % buffer_size == 0:
#                     yield buffer
#                     buffer.clear()
#             yield buffer

> **References:**
- https://github.com/rixwew/pytorch-fm/blob/master/torchfm/dataset/avazu.py

In [None]:
#hide
# Record author, timestamp, machine details and package versions of this run.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut,pytorch_lightning

Author: Sparsh A.

Last updated: 2022-01-11 22:08:24

recohut          : 0.0.11
pytorch_lightning: 1.5.8

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

IPython: 5.5.0

