In [1]:
import os
import time
import urllib.request
from types import ModuleType
from typing import Dict, Optional

import numpy as np
import pandas as pd
from dataquality.utils import tqdm


# Working directory captured once, when this cell runs.
CWD = os.getcwd()

# Dataset identity.
DATASET = "newsgroups"
DATASET_NUM_CLASSES = 20

# Base URL of the bucket and the per-split CSV locations derived from it.
BUCKET = "https://galileo-public-tutorial-data.s3.us-west-1.amazonaws.com"
DATASETS = {
    split: f"{BUCKET}/datasets/original/{DATASET}/{DATASET}_{suffix}.csv"
    for split, suffix in (("training", "train"), ("test", "test"))
}

# Run configuration used by the simulated training loop.
TASK_TYPE = "text_classification"
NUM_EPOCHS = 2
BATCH_SIZE = 32
EMB_DIM = 768


def download_dataset_from_aws() -> None:
    """Download every CSV listed in ``DATASETS`` that is not already present.

    Each file is saved in the current working directory under the basename of
    its URL. A file that already exists locally is reported and left untouched.
    """
    for url in DATASETS.values():
        fname = os.path.basename(url)
        if os.path.exists(fname):
            print(f"Dataset already exists {fname}")
            # Skip the download so an existing file is neither re-fetched
            # nor overwritten.
            continue
        urllib.request.urlretrieve(url, fname)


def load_dataset_split(split: str) -> pd.DataFrame:
    """Read the CSV for ``split`` from ``CWD`` and print a summary of it.

    Args:
        split: Key into ``DATASETS`` (the visible keys are "training" and
            "test"). The file name is the basename of the mapped URL.

    Returns:
        The loaded DataFrame.

    Raises:
        KeyError: If ``split`` is not a key of ``DATASETS``.
    """
    csv_path = os.path.join(CWD, os.path.basename(DATASETS[split]))
    dataset = pd.read_csv(csv_path)
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None" line.
    dataset.info(memory_usage="deep")
    return dataset


def generate_random_embeddings(batch_size: int, emb_dims: int) -> np.ndarray:
    """Return a ``(batch_size, emb_dims)`` array of stand-in embeddings.

    Values come from ``np.random.rand``, i.e. NumPy's global random state.
    """
    shape = (batch_size, emb_dims)
    return np.random.rand(*shape)


def generate_random_probabilities(batch_size: int, num_classes: int) -> np.ndarray:
    """Return a ``(batch_size, num_classes)`` array whose rows each sum to 1.

    Raw values are drawn with ``np.random.rand`` and every row is divided by
    its own total.
    """
    raw_scores = np.random.rand(batch_size, num_classes)
    # keepdims leaves the totals as a column so they broadcast across each row.
    row_totals = raw_scores.sum(axis=-1, keepdims=True)
    return raw_scores / row_totals


def _log_split_outputs(
    dataquality: ModuleType,
    dataset: pd.DataFrame,
    split: str,
    epoch: int,
    num_classes: int,
) -> None:
    """Log random embeddings and probabilities for ``dataset`` in batches of ``BATCH_SIZE``."""
    for start in tqdm(range(0, len(dataset), BATCH_SIZE)):
        # Positional slice; the final batch may hold fewer than BATCH_SIZE rows.
        batch = dataset.iloc[start : start + BATCH_SIZE]
        dataquality.log_model_outputs(
            emb=generate_random_embeddings(len(batch), EMB_DIM),
            probs=generate_random_probabilities(len(batch), num_classes),
            split=split,
            epoch=epoch,
            ids=batch["id"],
        )


def log_data(dataquality: ModuleType, epochs: Optional[int] = NUM_EPOCHS) -> float:
    """Log the dataset inputs and simulated model outputs through ``dataquality``.

    Downloads the CSVs if needed, logs the "text", "label" and "id" columns of
    the training and test splits, registers the distinct training labels, then
    runs a mock training loop that logs random embeddings and probabilities
    for every batch of both splits.

    Args:
        dataquality: Object exposing ``log_input_data``, ``set_labels_for_run``
            and ``log_model_outputs``.
        epochs: Number of simulated epochs. ``None`` falls back to
            ``NUM_EPOCHS``.

    Returns:
        Seconds spent in the simulated training loop. Input logging is timed
        and printed separately and is not part of the returned value.
    """
    # The signature allows None, but range(None) raises TypeError.
    if epochs is None:
        epochs = NUM_EPOCHS

    download_dataset_from_aws()
    train_dataset = load_dataset_split("training")
    test_dataset = load_dataset_split("test")

    t_start = time.time()
    for split, dataset in (("train", train_dataset), ("test", test_dataset)):
        dataquality.log_input_data(
            text=dataset["text"],
            labels=dataset["label"],
            ids=dataset["id"],
            split=split,
        )
    dataquality.set_labels_for_run(train_dataset["label"].unique())
    print(f"Input logging took {time.time() - t_start} seconds")

    # Restart the clock so the returned time covers only the loop below.
    t_start = time.time()
    num_classes = train_dataset["label"].nunique()
    # Simulates model training loop
    for epoch_idx in range(epochs):
        print(f"Epoch {epoch_idx}")
        print("Training")
        _log_split_outputs(dataquality, train_dataset, "train", epoch_idx, num_classes)
        print("Testing")
        _log_split_outputs(dataquality, test_dataset, "test", epoch_idx, num_classes)
    time_spent = time.time() - t_start
    print(f"Took {time_spent} seconds")
    return time_spent


In [4]:
import dataquality as dq
from datetime import datetime

# Initialise a run in the "test_IT" project; the run name combines the dataset
# name with the current timestamp.
dq.init(
    project_name="test_IT",
    run_name=f"{DATASET}_{datetime.today()}",
    task_type=TASK_TYPE,
)
# Log the inputs plus a single epoch of simulated model outputs.
log_data(dataquality=dq, epochs=1)
# Kept as the last expression of the cell so its return value is displayed.
dq.finish()

📡 Retrieving run from existing project, test_IT
🏃‍♂️ Starting run newsgroups_2022-03-15 10:03:58.349892
🛰 Connected to project, test_IT and created new run, newsgroups_2022-03-15 10:03:58.349892.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11314 entries, 0 to 11313
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      11314 non-null  int64 
 1   text    11096 non-null  object
 2   label   11314 non-null  object
dtypes: int64(1), object(2)
memory usage: 14.6 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7532 entries, 0 to 7531
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7532 non-null   int64 
 1   text    7370 non-null   object
 2   label   7532 non-null   object
dtypes: int64(1), object(2)
memory usage: 8.9 MB
None
Exporting input data [########################################] 100.00% elapsed time  :     0.03s =  0.0m =  0.0h
Appendi

  0%|          | 0/354 [00:00<?, ?it/s]



Testing


  0%|          | 0/236 [00:00<?, ?it/s]

Took 3.0233378410339355 seconds
☁️ Uploading Data
Combining batches for upload


  0%|          | 0/354 [00:00<?, ?it/s]

training:   0%|          | 0/3 [00:00<?, ?it/s]

Writing data for upload [########################################] 100.00% elapsed time  :     0.29s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.17s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.39s =  0.0m =  0.0h
 Combining batches for upload


  0%|          | 0/236 [00:00<?, ?it/s]

test:   0%|          | 0/3 [00:00<?, ?it/s]

Writing data for upload [########################################] 100.00% elapsed time  :     0.17s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.15s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.27s =  0.0m =  0.0h
 🧹 Cleaning up
Job default successfully submitted. Results will be available soon at https://console.dev.rungalileo.io/insights?projectId=0b77d7b2-6cf4-4e1d-b1c3-6ab0040aee1a&runId=f4b4646c-f226-4374-bc2d-a344880ae1b4&split=training&taskType=0&activeDepHigh=1&activeDepLow=0


{'project_id': '0b77d7b2-6cf4-4e1d-b1c3-6ab0040aee1a',
 'run_id': 'f4b4646c-f226-4374-bc2d-a344880ae1b4',
 'job_name': 'default',
 'labels': ['rec.autos',
  'comp.sys.mac.hardware',
  'comp.graphics',
  'sci.space',
  'talk.politics.guns',
  'sci.med',
  'comp.sys.ibm.pc.hardware',
  'comp.os.ms-windows.misc',
  'rec.motorcycles',
  'talk.religion.misc',
  'misc.forsale',
  'alt.atheism',
  'sci.electronics',
  'comp.windows.x',
  'rec.sport.hockey',
  'rec.sport.baseball',
  'soc.religion.christian',
  'talk.politics.mideast',
  'talk.politics.misc',
  'sci.crypt'],
 'tasks': None,
 'message': 'Processing dataquality!',
 'link': 'https://console.dev.rungalileo.io/insights?projectId=0b77d7b2-6cf4-4e1d-b1c3-6ab0040aee1a&runId=f4b4646c-f226-4374-bc2d-a344880ae1b4&split=training&taskType=0&activeDepHigh=1&activeDepLow=0'}

In [5]:
from dataquality.clients.api import ApiClient

# NOTE(review): ApiClient() is built without arguments; presumably it reuses
# the session/run configured by dq.init above — confirm.
c = ApiClient()
# Last expression of the cell, so the returned status is displayed as output.
c.get_run_status()

{'timestamp': '2022-03-15T14:04:37', 'status': 'started', 'message': 'started'}

In [6]:
# NOTE(review): appears to block until the submitted job completes — confirm.
c.wait_for_run()

Waiting for job...
Done!. Job finished with status finished
