In [None]:
# default_exp datasets.taobao

# Taobao Dataset
> Implementation of the Taobao dataset in PyTorch Lightning.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.nb_imports import *
from fastcore.test import *

In [None]:
#export
from recohut.datasets.bases.ctr import *
from recohut.utils.common_utils import download_url

from datetime import date

In [None]:
#export
class TaobaoDataset(CTRDataset):
    """Taobao sample dataset built on ``CTRDataset``.

    Declares the feature/label schema, the URLs of the train/valid/test sample
    CSV files, and a few helpers that derive time features from a
    ``'time_stamp'`` column.
    """

    # One feature group: every listed column is active and declared as a
    # string-typed categorical feature.
    feature_cols = [
        {
            'name': [
                "userid", "adgroup_id", "pid", "cate_id", "campaign_id",
                "customer", "brand", "cms_segid", "cms_group_id",
                "final_gender_code", "age_level", "pvalue_level",
                "shopping_level", "occupation",
            ],
            'active': True,
            'dtype': 'str',
            'type': 'categorical',
        },
    ]

    # Target column; note the dtype is the builtin ``float`` type, not a string.
    label_col = {'name': 'clk', 'dtype': float}

    # Sample splits hosted in the RecoHut-Datasets/sample_ctr repository (tag v1).
    train_url = "https://github.com/RecoHut-Datasets/sample_ctr/raw/v1/train_sample.csv"
    valid_url = "https://github.com/RecoHut-Datasets/sample_ctr/raw/v1/valid_sample.csv"
    test_url = "https://github.com/RecoHut-Datasets/sample_ctr/raw/v1/test_sample.csv"

    @property
    def raw_file_names(self):
        """Names of the three raw CSV files, in train/valid/test order."""
        file_names = [
            'train_sample.csv',
            'valid_sample.csv',
            'test_sample.csv',
        ]
        return file_names

    def download(self):
        """Call ``download_url`` for the train, valid and test URLs, in that order.

        ``self.raw_dir`` is passed as the second argument each time
        (presumably the destination folder -- confirm against ``download_url``).
        """
        for split_url in (self.train_url, self.valid_url, self.test_url):
            download_url(split_url, self.raw_dir)

    @staticmethod
    def _date_from_timestamp(timestamp):
        """Build a ``date`` from the year, month and day slices of ``timestamp``.

        Reads characters 0-3, 5-6 and 8-9, i.e. assumes a string that starts
        with a ``YYYY?MM?DD`` layout (e.g. ``YYYY-MM-DD``) -- TODO confirm the
        actual timestamp format of the data.
        """
        year = int(timestamp[0:4])
        month = int(timestamp[5:7])
        day = int(timestamp[8:10])
        return date(year, month, day)

    def convert_hour(self, df, col_name):
        """Return characters 11-12 of every ``df['time_stamp']`` value.

        Args:
            df: object whose ``'time_stamp'`` column supports ``.apply``
                (presumably a pandas DataFrame).
            col_name: not used by this method.

        Returns:
            The result of ``.apply``: one two-character slice per row
            (the hour, if timestamps look like ``YYYY-MM-DD HH:MM:SS``).
        """
        def _hour_slice(ts):
            return ts[11:13]

        return df['time_stamp'].apply(_hour_slice)

    def convert_weekday(self, df, col_name):
        """Return the day of week of every ``df['time_stamp']`` value.

        Args:
            df: object whose ``'time_stamp'`` column supports ``.apply``
                (presumably a pandas DataFrame).
            col_name: not used by this method.

        Returns:
            The result of ``.apply``: one string per row, ``'0'`` for Sunday
            through ``'6'`` for Saturday.
        """
        def _weekday_code(timestamp):
            day = self._date_from_timestamp(timestamp)
            # isoweekday() is Monday=1..Sunday=7; modulo 7 maps Sunday to 0,
            # matching the '%w' convention.
            return str(day.isoweekday() % 7)

        return df['time_stamp'].apply(_weekday_code)

    def convert_weekend(self, df, col_name):
        """Flag every ``df['time_stamp']`` value that falls on a weekend.

        Args:
            df: object whose ``'time_stamp'`` column supports ``.apply``
                (presumably a pandas DataFrame).
            col_name: not used by this method.

        Returns:
            The result of ``.apply``: ``'1'`` for Saturday or Sunday,
            ``'0'`` otherwise.
        """
        def _weekend_flag(timestamp):
            day = self._date_from_timestamp(timestamp)
            # weekday() is Monday=0..Sunday=6, so 5 and 6 are the weekend.
            if day.weekday() >= 5:
                return '1'
            return '0'

        return df['time_stamp'].apply(_weekend_flag)

In [None]:
#export
class TaobaoDataModule(CTRDataModule):
    """Data module whose dataset class is ``TaobaoDataset``."""
    dataset_cls = TaobaoDataset

In [None]:
# Demo settings; unpacked as keyword arguments into ``TaobaoDataModule(**params)``.
params = dict(
    model_id='DCN_demo',
    data_dir='/content/data',
    model_root='./checkpoints/',
    dnn_hidden_units=[64, 64],
    dnn_activations="relu",
    crossing_layers=3,
    learning_rate=1e-3,
    net_dropout=0,
    batch_norm=False,
    optimizer='adamw',
    task='binary_classification',
    loss='binary_crossentropy',
    metrics=['logloss', 'AUC'],
    embedding_dim=10,
    batch_size=64,
    epochs=3,
    shuffle=True,
    seed=2019,
    use_hdf5=True,
    workers=1,
    verbose=0,
)

In [None]:
# Build the data module from the demo settings, then run its prepare and setup steps.
ds = TaobaoDataModule(**params)
ds.prepare_data()
ds.setup()

# Smoke check: print only the first training batch, then stop iterating.
for batch in ds.train_dataloader():
    print(batch)
    break

[tensor([[20., 96.,  1., 18., 94., 93., 63.,  9.,  9.,  2.,  2.,  2.,  1.,  1.],
        [18., 78.,  1., 43., 76., 75.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.],
        [ 5., 52.,  1.,  1., 51., 51.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 5., 35.,  1.,  7.,  2.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 1., 26.,  1., 23., 26., 26., 22.,  1.,  1.,  1.,  1.,  0.,  1.,  1.],
        [20., 75.,  1., 42., 73., 72.,  7.,  9.,  9.,  2.,  2.,  2.,  1.,  1.],
        [ 2., 17.,  1.,  2., 17., 17., 16.,  1.,  2.,  1.,  2.,  0.,  1.,  1.],
        [15., 55.,  2., 10., 53., 53., 41.,  6.,  7.,  1.,  5.,  1.,  1.,  1.],
        [ 3., 72.,  1., 40., 70., 69., 53.,  1.,  4.,  1.,  3.,  0.,  1.,  1.],
        [ 3., 84.,  1., 45., 82., 81., 57.,  1.,  4.,  1.,  3.,  0.,  1.,  1.],
        [ 1., 20.,  1.,  9., 20., 20.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.],
        [ 3., 81.,  1.,  6., 79., 78.,  0.,  1.,  4.,  1.,  3.,  0.,  1.,  1.],
        [ 1., 27.,  1.,  9., 27., 27., 

  "DataModule property `train_transforms` was deprecated in v1.5 and will be removed in v1.7."
Processing...
Done!


In [None]:
#hide
# Record the run environment: author, last-updated timestamp, machine details,
# versions of imported packages, and the installed recohut version.
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2022-01-11 14:40:05

recohut: 0.0.11

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.144+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy     : 1.19.5
matplotlib: 3.2.2
IPython   : 5.5.0
PIL       : 7.1.2

