# Creation of offer datasets

## imports
dask related

In [1]:
import dask.bag as _dbag

# inside our notebook the dask extension manages our cluster and client
# from dask.distributed import Client as _Client

# the following imports can be useful to examine the optimal number of partitions
#from dask.distributed import progress as _progress
#from dask.diagnostics import ProgressBar
#from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler

scenario related

In [2]:
import gzip as _gzip
import random as _random

import pytz as _pytz
import ujson as _json
from faker import Faker as _Faker

from settings import (
    amount_datasets,
    datasets_files,
    partitions,
    records_per_partition,
)

## definition of required dask client
Pass in the same scheduler address as the one used by the local cluster.

In [3]:
# `client` is not created anywhere in this notebook: per the comment in the
# import cell, the dask extension manages the cluster and the client.
# Evaluating the bare name displays the client summary as the cell output.
# NOTE(review): on a kernel without that extension this name would be
# undefined — confirm before running outside this environment.
client

0,1
Client  Scheduler: tcp://127.0.0.1:36051  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 16.66 GB


## definition functions to create fake offers datasets
The following function creates a random offer using a Faker instance.

In [4]:
def create_fake_offer(fake):
    """Build one random offer record as a plain, JSON-serialisable dict.

    Numeric fields are drawn from the module-level ``_random``; textual
    fields come from ``fake``. The draws happen in a fixed order (the same
    order as the keys of the returned dict), so a seeded run stays stable.

    Args:
        fake: Object providing ``words``, ``past_datetime``,
            ``currency_code``, ``isbn13``, ``domain_name`` and ``word``
            (a Faker instance in this notebook).

    Returns:
        dict with the keys ``id``, ``product_id``, ``category_id``, ``name``,
        ``brand``, ``time``, ``price``, ``shipping_cost``, ``currency``,
        ``identifiers``, ``image_url`` and ``clickout_url``.
    """
    # Every identifier list of one offer gets the same length (1 to 3).
    identifier_count = _random.randint(1, 3)
    offer_id = _random.randint(1, 10000000)
    product_id = _random.randint(1, 10000000)
    category_id = _random.randint(1, 40)

    # Name: 2-4 words separated by spaces; brand: always 3 words, hyphenated.
    name = " ".join(fake.words(nb=_random.randint(2, 4))).title()
    brand = "-".join(fake.words(nb=3)).title()

    # ISO-formatted timestamp requested with start_date="-30d" and UTC tzinfo.
    timestamp = fake.past_datetime(
        start_date="-30d", tzinfo=_pytz.utc
    ).isoformat()

    # Price is drawn in hundredths (100..10000) and scaled to 1.00..100.00.
    price = _random.randint(100, 10000) / 100
    shipping_cost = float(_random.randint(0, 10))
    currency = fake.currency_code()

    # All three identifier kinds are filled with ISBN-13 values from `fake`.
    identifiers = {
        kind: [fake.isbn13() for _ in range(identifier_count)]
        for kind in ("eans", "mpnrs", "asins")
    }

    image_host = fake.domain_name(2)
    image_number = _random.randint(100, 20000)
    image_stem = f"{image_number}_{fake.word()}_{fake.word()}"
    image_url = f"https://{image_host}/{image_stem}.png"

    return {
        "id": offer_id,
        "product_id": product_id,
        "category_id": category_id,
        "name": name,
        "brand": brand,
        "time": timestamp,
        "price": price,
        "shipping_cost": shipping_cost,
        "currency": currency,
        "identifiers": identifiers,
        "image_url": image_url,
        "clickout_url": f"https://dummy.clickout.com/{offer_id}",
    }

The following function creates a dataset containing the number of offers passed to it.

In [5]:
def create_dataset(amount_of_offers):
    """Return a lazy stream of ``amount_of_offers`` fake offers.

    One Faker instance is created immediately when the function is called
    and is shared by every offer of the dataset; the offers themselves are
    only built while the returned generator is consumed.

    Args:
        amount_of_offers: Number of offers the generator yields.

    Returns:
        Generator yielding one ``create_fake_offer`` dict per offer.
    """
    shared_faker = _Faker()
    offer_slots = range(amount_of_offers)
    return (create_fake_offer(shared_faker) for _slot in offer_slots)

## create the offers datasets

In [6]:
def create_datasets(
    npartitions,
    records_per_partition,
    destination
):
    """Generate fake offers with dask and write them out as text files.

    A dask bag is assembled by hand: its graph holds one task per partition,
    and each task calls ``create_dataset(records_per_partition)``. Every
    offer is serialised with ``_json.dumps`` and the bag is then written
    with ``to_textfiles``.

    Args:
        npartitions: Number of partitions (and graph tasks) to create.
        records_per_partition: Number of offers generated per partition.
        destination: Target passed unchanged to ``Bag.to_textfiles``.

    Returns:
        None. The result of ``to_textfiles`` is discarded.
    """
    # Random suffix for the graph key name used by all partitions of this bag.
    graph_name = f"datasets-{_random.randint(1, 100000000)}"

    # dask task tuples: (callable, *args), keyed by (name, partition index).
    task_graph = {}
    for partition_index in range(npartitions):
        task_graph[(graph_name, partition_index)] = (
            create_dataset,
            records_per_partition,
        )

    offers_bag = _dbag.Bag(
        dsk=task_graph,
        name=graph_name,
        npartitions=npartitions,
    )
    serialised_offers = offers_bag.map(_json.dumps)
    serialised_offers.to_textfiles(destination)

In [7]:
# Generate the offer files using the values imported from `settings`.
# NOTE(review): `partitions` is also imported from `settings` but is not used
# in the visible cells; the partition count here comes from `amount_datasets`
# — confirm this is intended.
create_datasets(
    npartitions=amount_datasets,
    records_per_partition=records_per_partition,
    destination=datasets_files,
)

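The resulting data is stored as line-delimited JSON in the destination configured by `datasets_files` (the `datasets/` folder read in the cell below).
There is one file for each partition.
The structure of a single record looks like the following: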

In [8]:
!head -n 1 datasets/00.json | jq

[1;39m{
  [0m[34;1m"id"[0m[1;39m: [0m[0;39m7187707[0m[1;39m,
  [0m[34;1m"product_id"[0m[1;39m: [0m[0;39m7828766[0m[1;39m,
  [0m[34;1m"category_id"[0m[1;39m: [0m[0;39m40[0m[1;39m,
  [0m[34;1m"name"[0m[1;39m: [0m[0;32m"Light Case"[0m[1;39m,
  [0m[34;1m"brand"[0m[1;39m: [0m[0;32m"If-Fast-Myself"[0m[1;39m,
  [0m[34;1m"time"[0m[1;39m: [0m[0;32m"2020-10-15T18:20:03+00:00"[0m[1;39m,
  [0m[34;1m"price"[0m[1;39m: [0m[0;39m68.98[0m[1;39m,
  [0m[34;1m"shipping_cost"[0m[1;39m: [0m[0;39m3[0m[1;39m,
  [0m[34;1m"currency"[0m[1;39m: [0m[0;32m"CHF"[0m[1;39m,
  [0m[34;1m"identifiers"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"eans"[0m[1;39m: [0m[1;39m[
      [0;32m"978-0-9618097-3-7"[0m[1;39m,
      [0;32m"978-1-69510-858-5"[0m[1;39m,
      [0;32m"978-0-579-09769-4"[0m[1;39m
    [1;39m][0m[1;39m,
    [0m[34;1m"mpnrs"[0m[1;39m: [0m[1;39m[
      [0;32m"978-0-916551-98-8"[0m[1;39m,
      [0;32m"978-0-515-7118