In [1]:
import datetime as dt
import random
from functools import partial

import pandas as pd
import numpy as np
from datazimmer import get_raw_data_path
from faker import Faker

# Shared Faker instance used for every fake name, date, address and color below.
fake = Faker()
# Seed Faker so the generated fake values are repeatable between runs.
Faker.seed(42069)

# Seeded stdlib generator; every `rng.*` draw in this notebook goes through it.
rng = random.Random(42069)

# Size knobs for the generated tables.
people_n = 20  # random people in people_df (one fixed person is appended on top)
dog_n = 35  # random dogs in dog_df (one fixed dog is appended on top)
rel_n = 30  # random owner/dog draws for rel_df, before duplicates are dropped
spotted_n = 100  # rows in spotted_df
comp_n = 90  # rows in comp_df
photo_n = 50  # not referenced anywhere in the visible part of this notebook
comp2_n = 30  # rows in race_df
dog2_n = 10  # rows in dog2_df
dotm_limit = 40  # draws for dog_of_the_month_df, before duplicates are dropped


def _prefixed(key, prefix):
    return "__".join(filter(None, [prefix, key]))


def get_nested_address(prefix=""):
    """Build one fake address as a flat dict with optionally prefixed keys.

    The keys are city, zip, street_address, building__floor and
    building__door; when `prefix` is non-empty each key becomes
    "<prefix>__<key>".
    """
    # Draws are made in a fixed order (Faker first, then rng) so the seeded
    # generators produce the same values on every run.
    city = fake.city()
    zip_code = fake.zipcode()
    # Keep at most the first three whitespace-separated tokens of the street.
    street = " ".join(fake.street_address().split()[:3])
    floor = int(rng.lognormvariate(2, 1)) + 1
    door = rng.randint(1, 101)

    fields = {
        "city": city,
        "zip": zip_code,
        "street_address": street,
        "building__floor": floor,
        "building__door": door,
    }
    return {_prefixed(name, prefix): value for name, value in fields.items()}


def _people_records():
    """Yield `people_n` random person records, then the fixed "Adam Groff" record."""
    for i in range(people_n):
        full_name = fake.name()
        # The rng draw comes before the Faker date draw, and the date is only
        # drawn when the person gets a birth date (roughly 90% of the time).
        has_birth_date = rng.random() < 0.9
        if has_birth_date:
            born = fake.date_between(dt.date(1953, 1, 1), dt.date(2003, 1, 1))
        else:
            born = None
        yield {"cid": f"p-{i+1}", "name": full_name, "dob": born}

    yield {
        "cid": f"p-{people_n + 1}",
        "name": "Adam Groff",
        "dob": fake.date_between(dt.date(2003, 1, 1), dt.date(2004, 1, 1)),
    }


# People table indexed by "cid"; the raw "dob" column is replaced by a
# datetime-typed "date_of_birth" column placed last.
people_df = (
    pd.DataFrame(list(_people_records()))
    .set_index("cid")
    .pipe(
        lambda people: people.drop(columns="dob").assign(
            date_of_birth=pd.to_datetime(people["dob"])
        )
    )
)


def _dog_records():
    """Yield `dog_n` random dog records, then the fixed "Madam" record."""
    for i in range(dog_n):
        dog_name = fake.first_name_nonbinary()
        born = fake.date_between(dt.date(2010, 1, 1), dt.date(2020, 1, 1))
        # The waist is left missing for roughly 15% of the dogs; the uniform
        # draw only happens when a waist is actually recorded.
        has_waist = rng.random() < 0.85
        waist = rng.uniform(8, 44) if has_waist else None
        sex = rng.choice(["male", "female"])
        yield {
            "cid": f"d-{i+1}",
            "name": dog_name,
            "dob": born,
            "waist": waist,
            "sex": sex,
        }

    yield {
        "cid": f"d-{dog_n + 1}",
        "name": "Madam",
        "dob": fake.date_between(dt.date(2015, 1, 1), dt.date(2016, 1, 1)),
        "waist": 12,
        "sex": "female",
    }


# Dog table indexed by "cid"; the raw "dob" column is replaced by a
# datetime-typed "date_of_birth" column placed last.
dog_df = (
    pd.DataFrame(list(_dog_records()))
    .set_index("cid")
    .pipe(
        lambda dogs: dogs.drop(columns="dob").assign(
            date_of_birth=pd.to_datetime(dogs["dob"])
        )
    )
)

def _ownership_records():
    """Yield `rel_n` random owner/dog pairings, then the fixed last-person/last-dog pair."""
    for _ in range(rel_n):
        owner = rng.choice(people_df.index)
        dog = rng.choice(dog_df.index)
        yield {"owner_id": owner, "dog_id": dog, "since_birth": rng.random() > 0.2}

    yield {
        "owner_id": f"p-{people_n + 1}",
        "dog_id": f"d-{dog_n + 1}",
        "since_birth": True,
    }


# Owner/dog relation table: the first occurrence of each (owner_id, dog_id)
# pair is kept and the pair becomes the index.
rel_key_cols = ["owner_id", "dog_id"]
rel_df = (
    pd.DataFrame(list(_ownership_records()))
    .drop_duplicates(subset=rel_key_cols)
    .set_index(rel_key_cols)
)


places = ["winner", "runner_up", "special_mention"]

# Dedicated seeded generators for the competition table. The podium picks and
# the prize split previously used the unseeded module-level `random` and
# `np.random` state, which made this table differ on every run. Separate
# generators (rather than the shared `rng`) keep the draw sequence of the
# other tables untouched.
_comp_rng = random.Random(42069)
_comp_np_rng = np.random.default_rng(42069)


def _podium_columns():
    """Return the owner/pet id columns for every place of one competition.

    For each place one (owner_id, dog_id) pair is drawn from `rel_df.index`
    and stored under "<place>__owner__<people index name>" and
    "<place>__pet__<dog index name>".
    """
    return {
        f"{place}__{prefix}__{ind_id}": val
        for place in places
        for prefix, ind_id, val in zip(
            ["owner", "pet"],
            [people_df.index.name, dog_df.index.name],
            _comp_rng.choice(rel_df.index),
        )
    }


def _with_prizes(df):
    """Return `df` with one integer "<place>__prize" column appended per place.

    Each row gets random shares that are sorted ascending and labelled with
    the places in reverse order, so the first place receives the largest
    share. The shares are normalised to sum to one, scaled by a random
    fraction of the row's prize pool, and truncated to integers.
    """
    row_count = len(df)  # sized from the frame itself, not a global constant
    shares = pd.DataFrame(
        np.sort(_comp_np_rng.random((row_count, len(places))), axis=1),
        columns=[f"{place}__prize" for place in places[::-1]],
        index=df.index,
    )
    paid_out_fraction = _comp_np_rng.random((row_count, 1))
    prizes = (
        shares
        / shares.sum(axis=1).to_frame().values
        * paid_out_fraction
        * df[["prize_pool"]].values
    ).astype(int)
    return pd.concat([df, prizes], axis=1)


# Competition table indexed by "competition_id": a prize pool, the podium
# owner/pet ids for each place, and the prize paid for each place.
comp_df = (
    pd.DataFrame(
        [
            {
                "competition_id": f"c-{i+1}",
                "prize_pool": rng.randint(1, 30) * 500,
                **_podium_columns(),
            }
            for i in range(comp_n)
        ]
    )
    .set_index("competition_id")
    .pipe(_with_prizes)
)

def _spotting_record():
    """Return one sighting: two independently drawn dog ids plus a "place" address."""
    first_dog = rng.choice(dog_df.index)
    second_dog = rng.choice(dog_df.index)
    return {
        "dog_1__cid": first_dog,
        "dog_2__cid": second_dog,
        **get_nested_address("place"),
    }


spotted_records = [_spotting_record() for _ in range(spotted_n)]
# "dog_1__cid" is only set as the index so that to_csv does not add a new column
spotted_df = pd.DataFrame(spotted_records).set_index("dog_1__cid")

# Dog size categories, one row per size:
# (name, waist min, waist max, weight min, weight max)
sizes_df = pd.DataFrame(
    [
        ("XS", 8, 22, 10, 20),
        ("SM", 16, 27, 20, 45),
        ("MED", 18, 34, 40, 80),
        ("LG", 24, 39, 75, 100),
        ("XL", 27, 50, 90, 140),
    ],
    columns=[
        "dogsize_name",
        "waist_limit__min",
        "waist_limit__max",
        "weight_limit__min",
        "weight_limit__max",
    ],
).set_index("dogsize_name")

def _dog2_record(i):
    """Return the record of the dog with zero-based position `i` in the second dog table."""
    # maybe add coreferences from ds1
    dog_name = fake.first_name_female()
    sex = rng.choice(["male", "female"])
    born = fake.date_between(dt.date(2008, 1, 1), dt.date(2021, 1, 1))
    size_name = rng.choice(sizes_df.index)
    # The color is left missing for roughly 20% of the dogs; the Faker color
    # is only drawn when one is actually recorded.
    has_color = rng.random() < 0.8
    color = fake.color_name() if has_color else None
    return {
        "cid": f"d-{i+1}",
        "name": dog_name,
        "sex": sex,
        "date_of_birth": born,
        "size__dogsize_name": size_name,
        "color": color,
    }


dog2_df = pd.DataFrame([_dog2_record(i) for i in range(dog2_n)]).set_index("cid")

def _race_record(i):
    """Return the record of the race with zero-based position `i`."""
    held_at = fake.date_time_between(dt.date(2019, 1, 1), dt.date(2021, 8, 1))
    fastest = rng.lognormvariate(6, 1)
    champion = rng.choice(dog2_df.index)
    return {
        "competition_id": f"cx-{i+1}",
        "held_date": held_at,
        "fastest_time": fastest,
        "champion__cid": champion,
    }


# Race table indexed by "competition_id"; the held date is rounded to the hour.
race_df = (
    pd.DataFrame([_race_record(i) for i in range(comp2_n)])
    .set_index("competition_id")
    .assign(held_date=lambda races: races["held_date"].dt.round("1h"))
)

# Columns that together identify one dog-of-the-month award.
dotm_ind = ["dog_type__pure", "dog_type__neutered", "year", "month"]


def _dotm_record():
    """Return one randomly drawn dog-of-the-month award."""
    winner = rng.choice(dog2_df.index)
    is_pure = rng.random() > 0.2
    is_neutered = rng.random() > 0.6
    year = rng.randint(2002, 2021)
    month = rng.randint(1, 12)
    return {
        "winner__cid": winner,
        "dog_type__pure": is_pure,
        "dog_type__neutered": is_neutered,
        "year": year,
        "month": month,
    }


# `dotm_limit` awards are drawn; only the first award per
# (pure, neutered, year, month) combination is kept, so the final table can
# have fewer rows than `dotm_limit`.
dog_of_the_month_df = (
    pd.DataFrame([_dotm_record() for _ in range(dotm_limit)])
    .drop_duplicates(subset=dotm_ind)
    .set_index(dotm_ind)
    .sort_index()
)

In [None]:
# Write every DataFrame global whose name ends in "_df" to "<name>.csv" in the
# raw data location, where <name> is the variable name without that suffix.
# The globals are snapshotted first because the loop itself creates new ones.
for k, v in list(globals().items()):
    # Skip anything that merely happens to be named *_df but is not a table;
    # calling .to_csv on it would fail.
    if not (k.endswith("_df") and isinstance(v, pd.DataFrame)):
        continue
    # Strip only the trailing suffix; splitting on "_df" would truncate names
    # that contain "_df" earlier in the name.
    table_name = k[: -len("_df")]
    if not table_name:
        continue  # a variable named exactly "_df" has no table name to use
    v.to_csv(get_raw_data_path(f"{table_name}.csv"))