In [None]:
# default_exp transforms.datasets.criteo

# Criteo Dataset Transformation
> Implementation of transformation functions specific to the Criteo display-advertising dataset.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.model_selection import train_test_split

In [None]:
#export
def sparseFeature(feat, feat_num, embed_dim=4):
    """
    Describe a single sparse (categorical) feature as a plain dictionary.
    :param feat: name of the feature
    :param feat_num: number of distinct ids of the feature, stored unchanged
    :param embed_dim: dimension of the embedding associated with the feature
    :return: dict with the keys 'feat_name', 'feat_num' and 'embed_dim'
    """
    spec = dict(feat_name=feat, feat_num=feat_num, embed_dim=embed_dim)
    return spec


def denseFeature(feat):
    """
    Describe a single dense (numeric) feature as a plain dictionary.
    :param feat: name of the dense feature
    :return: dict holding only the key 'feat_name'
    """
    spec = dict(feat_name=feat)
    return spec

In [None]:
#export
def create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2,
                          random_state=None):
    """
    Load the Criteo data and turn it into integer-encoded train/test arrays.

    All 39 features end up as integer id columns: the 13 numeric ``I*`` columns are
    bucketed into 100 uniform-width bins and the 26 categorical ``C*`` columns are
    label-encoded. Every feature is therefore described with ``sparseFeature``.

    :param file: dataset's path (tab-separated, no header row)
    :param embed_dim: the embedding dimension recorded for every feature
    :param read_part: whether to read only the first ``sample_num`` rows
    :param sample_num: the number of instances to read if read_part is True
    :param test_size: ratio of test dataset, forwarded to ``train_test_split``
    :param random_state: seed forwarded to ``train_test_split`` so the split can be
        reproduced; the default ``None`` keeps the previous unseeded behaviour
    :return: feature columns, (train_X, train_y), (test_X, test_y)
    """
    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    # Column order of the raw file: label, I1..I13, C1..C26.
    names = ['label'] + dense_features + sparse_features
    # Column order of the returned X arrays: C1..C26 followed by I1..I13.
    features = sparse_features + dense_features

    # `nrows` reads only the requested prefix and lets pandas close the file itself,
    # unlike an iterator whose handle is never released.
    # Categorical columns are forced to `str` so a column whose hashes happen to look
    # numeric cannot end up as mixed int/str values, which LabelEncoder cannot sort.
    data_df = pd.read_csv(
        file,
        sep='\t',
        header=None,
        names=names,
        dtype={feat: str for feat in sparse_features},
        nrows=sample_num if read_part else None,
    )

    # Missing categorical values become their own '-1' category; missing numerics become 0.
    data_df[sparse_features] = data_df[sparse_features].fillna('-1')
    data_df[dense_features] = data_df[dense_features].fillna(0)

    # Bin continuous data into intervals.
    est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')
    data_df[dense_features] = est.fit_transform(data_df[dense_features])

    # Map every categorical value to an integer id (one encoder per column).
    for feat in sparse_features:
        data_df[feat] = LabelEncoder().fit_transform(data_df[feat])

    # NOTE: the discretizer and the encoders are fitted on the whole frame before the
    # split, so ids are consistent between the train and test partitions.

    # ==============Feature Engineering===================

    # ====================================================
    # feat_num is "largest id + 1" for each column.
    feature_columns = [sparseFeature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim)
                       for feat in features]
    train, test = train_test_split(data_df, test_size=test_size, random_state=random_state)

    train_X = train[features].values.astype('int32')
    train_y = train['label'].values.astype('int32')
    test_X = test[features].values.astype('int32')
    test_y = test['label'].values.astype('int32')

    return feature_columns, (train_X, train_y), (test_X, test_y)

In [None]:
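# Optional: uncomment the commands below to download the Criteo dataset from Kaggle.
# They expect a Kaggle API token at /content/drive/MyDrive/kaggle.json.
# !pip install -q -U kaggle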
# !pip install --upgrade --force-reinstall --no-deps kaggle
# !mkdir ~/.kaggle
# !cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets download -d mrkmakr/criteo-dataset
# !unzip criteo-dataset.zip

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[?25l[K     |█████▋                          | 10 kB 19.0 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 20.9 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 14.2 MB/s eta 0:00:01[K     |██████████████████████▎         | 40 kB 10.5 MB/s eta 0:00:01[K     |███████████████████████████▉    | 51 kB 4.5 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 2.7 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=a09d2576937c68b6341e6bce9eeefa020563e125d97e69548f4d591568008b5f
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Un

In [None]:
# Demo run: read a small prefix of the training file and build the train/test arrays.
file = 'dac/train.txt'
read_part, sample_num, test_size = True, 10000, 0.2

feature_columns, train, test = create_criteo_dataset(
    file=file,
    read_part=read_part,
    sample_num=sample_num,
    test_size=test_size,
)

In [None]:
# Show the per-feature dictionaries (feat_name, feat_num, embed_dim) returned above.
feature_columns

[{'embed_dim': 8, 'feat_name': 'C1', 'feat_num': 175},
 {'embed_dim': 8, 'feat_name': 'C2', 'feat_num': 386},
 {'embed_dim': 8, 'feat_name': 'C3', 'feat_num': 5521},
 {'embed_dim': 8, 'feat_name': 'C4', 'feat_num': 4033},
 {'embed_dim': 8, 'feat_name': 'C5', 'feat_num': 56},
 {'embed_dim': 8, 'feat_name': 'C6', 'feat_num': 8},
 {'embed_dim': 8, 'feat_name': 'C7', 'feat_num': 3184},
 {'embed_dim': 8, 'feat_name': 'C8', 'feat_num': 93},
 {'embed_dim': 8, 'feat_name': 'C9', 'feat_num': 3},
 {'embed_dim': 8, 'feat_name': 'C10', 'feat_num': 2986},
 {'embed_dim': 8, 'feat_name': 'C11', 'feat_num': 2084},
 {'embed_dim': 8, 'feat_name': 'C12', 'feat_num': 5284},
 {'embed_dim': 8, 'feat_name': 'C13', 'feat_num': 1725},
 {'embed_dim': 8, 'feat_name': 'C14', 'feat_num': 24},
 {'embed_dim': 8, 'feat_name': 'C15', 'feat_num': 2035},
 {'embed_dim': 8, 'feat_name': 'C16', 'feat_num': 4724},
 {'embed_dim': 8, 'feat_name': 'C17', 'feat_num': 9},
 {'embed_dim': 8, 'feat_name': 'C18', 'feat_num': 1149},


In [None]:
# Show the training partition: a (features, labels) tuple of int32 arrays.
train

(array([[   1,  293, 2491, ...,    0,    0,    1],
        [   1,   88,    0, ...,    1,    0,    1],
        [   1,   17, 5197, ...,    1,    0,    0],
        ...,
        [   1,  355, 4284, ...,    3,    0,    0],
        [   1,  192,   56, ...,    1,    0,    0],
        [  75,   18, 2613, ...,    3,    0,    0]], dtype=int32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int32))

In [None]:
# Show the test partition: a (features, labels) tuple of int32 arrays.
test

(array([[ 111,  105,  695, ...,    3,    0,    0],
        [ 102,  337, 2613, ...,    0,    0,    1],
        [  75,  301,  155, ...,    1,    0,    0],
        ...,
        [  75,   86,  507, ...,    1,    1,    1],
        [   1,  347, 2205, ...,    2,    1,    1],
        [ 102,  125,    5, ...,    1,    1,    0]], dtype=int32),
 array([1, 0, 1, ..., 0, 0, 0], dtype=int32))

In [None]:
#hide
# Record the environment this notebook was run in: install the watermark extension,
# (re)load it, then print author, machine info, imported package versions,
# and the last-updated date and time.
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-20 07:55:46

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy  : 1.19.5
IPython: 5.5.0
pandas : 1.1.5

