In [1]:
import os
import time
import urllib.request
from types import ModuleType
from typing import Dict, Optional

import numpy as np
import pandas as pd
from dataquality.utils import tqdm


CWD = os.getcwd()
DATASET = "newsgroups"
DATASET_NUM_CLASSES = 20
BUCKET = "https://galileo-public-tutorial-data.s3.us-west-1.amazonaws.com"
DATASETS = {
    "training": f"{BUCKET}/datasets/original/newsgroups/newsgroups_train.csv",
    "test": f"{BUCKET}/datasets/original/newsgroups/newsgroups_test.csv",
}
TASK_TYPE = "text_classification"
NUM_EPOCHS = 2
BATCH_SIZE = 32
EMB_DIM = 768


def download_dataset_from_aws() -> None:
    for _, url in DATASETS.items():
        fname = os.path.basename(url)
        if os.path.exists(fname):  # Only download if dataset isn't present
            print(f"Dataset already exists {fname}")
        urllib.request.urlretrieve(url, fname)


def load_dataset_split(split: str) -> pd.DataFrame:
    dataset = pd.read_csv(f"{CWD}/{os.path.basename(DATASETS[split])}")
    print(dataset.info(memory_usage="deep"))
    return dataset


def generate_random_embeddings(batch_size: int, emb_dims: int) -> np.ndarray:
    return np.random.rand(batch_size, emb_dims)


def generate_random_probabilities(batch_size: int, num_classes: int) -> np.ndarray:
    probs = np.random.rand(batch_size, num_classes)
    return probs / probs.sum(axis=-1).reshape(-1, 1)  # Normalize to sum to 1


def log_data(dataquality: ModuleType, epochs: Optional[int] = NUM_EPOCHS) -> float:
    download_dataset_from_aws()
    train_dataset = load_dataset_split("training")
    test_dataset = load_dataset_split("test")
    t_start = time.time()
    dataquality.log_input_data(
        text=train_dataset["text"],
        labels=train_dataset["label"],
        ids=train_dataset["id"],
        split="train",
    )
    dataquality.log_input_data(
        text=test_dataset["text"],
        labels=test_dataset["label"],
        ids=test_dataset["id"],
        split="test",
    )
    dataquality.set_labels_for_run(train_dataset["label"].unique())
    print(f"Input logging took {time.time() - t_start} seconds")
    t_start = time.time()
    num_classes = train_dataset["label"].nunique()
    # Simulates model training loop
    for epoch_idx in range(epochs):
        print(f"Epoch {epoch_idx}")
        print("Training")
        for i in tqdm(range(0, len(train_dataset), BATCH_SIZE)):
            batch = train_dataset[i : i + BATCH_SIZE]
            embedding = generate_random_embeddings(len(batch), EMB_DIM)
            probs = generate_random_probabilities(len(batch), num_classes)
            dataquality.log_model_outputs(
                emb=embedding,
                probs=probs,
                split="train",
                epoch=epoch_idx,
                ids=batch["id"],
            )
        print("Testing")
        for i in tqdm(range(0, len(test_dataset), BATCH_SIZE)):
            batch = test_dataset[i : i + BATCH_SIZE]
            embedding = generate_random_embeddings(len(batch), EMB_DIM)
            probs = generate_random_probabilities(len(batch), num_classes)
            dataquality.log_model_outputs(
                emb=embedding,
                probs=probs,
                split="test",
                epoch=epoch_idx,
                ids=batch["id"],
            )
    time_spent = time.time() - t_start
    print(f"Took {time_spent} seconds")
    return time_spent


In [3]:
import dataquality as dq

dq.init(
    project_name="test_IT",
    run_name=f"{DATASET}_{datetime.today()}",
    task_type=TASK_TYPE,
)
log_data(dataquality=dq, epochs=1)
dq.finish()

NameError: name 'datetime' is not defined

In [4]:
from dataquality.clients.api import ApiClient

c = ApiClient()
c.get_run_status()

{'timestamp': '2022-03-15T13:50:08',
 'status': 'finished',
 'message': 'finished'}