In [1]:
!pip install -q ../../../dataquality

In [1]:
import os

# Connection and login settings for the dataquality client, exported as
# environment variables: a Galileo API and MinIO store on localhost, and
# email authentication with the CI account.
os.environ.update(
    {
        "GALILEO_API_URL": "http://localhost:8088",
        "GALILEO_MINIO_URL": "127.0.0.1:9000",
        "GALILEO_MINIO_ACCESS_KEY": "minioadmin",
        "GALILEO_MINIO_SECRET_KEY": "minioadmin",
        "GALILEO_AUTH_METHOD": "email",
        "GALILEO_USERNAME": "ci@rungalileo.io",
        "GALILEO_PASSWORD": "CI_user_password!123",
    }
)

In [2]:
from pathlib import Path

# Dataset identifier; it names both the S3 folder and the CSV files inside it.
DATASET = "amazon_polarity"
TRAIN_DATASET_NAME = f"{DATASET}_train.csv"
TEST_DATASET_NAME = f"{DATASET}_test.csv"
DATASET_FOLDER_PATH = Path("galileo-ml-train") / "datasets" / "original" / DATASET

# Copy every file of the dataset folder into the current working directory.
# `as_posix()` keeps forward slashes in the S3 URI on every platform
# (interpolating a Path directly would produce backslashes on Windows).
cmd = f"aws s3 cp --recursive s3://{DATASET_FOLDER_PATH.as_posix()} ."
print(cmd)
exit_status = os.system(cmd)

# Fail here, with the offending command, instead of later when reading the
# CSV files that were never downloaded.
if exit_status != 0:
    raise RuntimeError(
        f"Dataset download failed (exit status {exit_status}): {cmd}"
    )

# Echo the status as the cell result (always 0 when this line is reached).
exit_status

aws s3 cp --recursive s3://galileo-ml-train/datasets/original/amazon_polarity .
download: s3://galileo-ml-train/datasets/original/amazon_polarity/amazon_polarity_test.csv to ./amazon_polarity_test.csv
download: s3://galileo-ml-train/datasets/original/amazon_polarity/amazon_polarity_train.csv to ./amazon_polarity_train.csv


0

In [3]:
import dataquality


import pandas as pd

# Load both splits from the CSV files in the working directory, using the
# file-name constants defined alongside DATASET.
train_dataset = pd.read_csv(TRAIN_DATASET_NAME)
test_dataset = pd.read_csv(TEST_DATASET_NAME)

In [4]:
# Number of rows in the training split.
train_dataset.shape[0]

3600000

In [5]:
# Authenticate first; the auth method and credentials come from the
# GALILEO_* environment variables, so no prompt is shown.
dataquality.login()

# Start a text-classification run named after the dataset.
dataquality.init(
    "text_classification",
    project_name="test_large_dataset",
    run_name=DATASET,
)


🔭 Logging you into Galileo

👀 Found auth method email set via env, skipping prompt.
🚀 You're logged in to Galileo as ci@rungalileo.io!
💭 Project test_large_dataset was not found.
✨ Initializing project test_large_dataset
🏃‍♂️ Starting run amazon_polarity
🛰 Created project, test_large_dataset, and new run, amazon_polarity.


In [6]:
import time

t_start = time.time()

# Log text, labels and ids for each split: train first, then test.
for split_name, frame in (("train", train_dataset), ("test", test_dataset)):
    dataquality.log_batch_input_data(
        text=frame["text"],
        labels=frame["label"],
        ids=frame["id"],
        split=split_name,
    )

# The run's label set is taken from the distinct labels of the training split.
run_labels = train_dataset["label"].unique()
dataquality.set_labels_for_run(run_labels)
print(f"Took {time.time() - t_start} seconds")

export(arrow) [########################################] 100.00% elapsed time  :     3.81s =  0.1m =  0.0h
export(arrow) [########################################] 100.00% elapsed time  :     9.79s =  0.2m =  0.0h
 Took 21.876877784729004 seconds


In [7]:
from tqdm.notebook import tqdm
import time
import numpy as np

# Parameters of the simulated training run.
NUM_EPOCHS = 2   # number of passes over the train and test splits
BATCH_SIZE = 32  # rows logged per log_model_outputs call
EMB_DIM = 768    # width of each random embedding vector


def generate_random_embeddings(batch_size: int, emb_dims: int) -> np.ndarray:
    """Return a random stand-in for a batch of model embeddings.

    Args:
        batch_size: Number of rows (samples) to generate.
        emb_dims: Number of columns (embedding width) per row.

    Returns:
        Array of shape ``(batch_size, emb_dims)`` drawn uniformly from
        ``[0, 1)`` using NumPy's global random state.
    """
    shape = (batch_size, emb_dims)
    return np.random.random_sample(size=shape)


def generate_random_probabilities(batch_size: int, num_classes: int) -> np.ndarray:
    """Return a random stand-in for a batch of class-probability vectors.

    Args:
        batch_size: Number of rows (samples) to generate.
        num_classes: Number of classes, i.e. columns per row.

    Returns:
        Array of shape ``(batch_size, num_classes)`` in which every row is
        divided by its own sum, so each row adds up to 1.
    """
    raw_scores = np.random.rand(batch_size, num_classes)
    # keepdims leaves the row totals as a column so they broadcast per row.
    row_totals = raw_scores.sum(axis=-1, keepdims=True)
    return raw_scores / row_totals



t_start = time.time()
num_classes = train_dataset["label"].nunique()

# One entry per pass within an epoch:
# (banner printed before the pass, split name logged, frame to batch over).
passes = (
    ("Training", "train", train_dataset),
    ("Testing", "test", test_dataset),
)

# Simulates model training loop
for epoch_idx in range(NUM_EPOCHS):
    print(f"Epoch {epoch_idx}")
    for banner, split_name, frame in passes:
        print(banner)
        for start in tqdm(range(0, len(frame), BATCH_SIZE)):
            batch = frame[start : start + BATCH_SIZE]
            # The final batch may be shorter than BATCH_SIZE, so size the
            # random outputs from the rows actually present.
            n_rows = len(batch)
            dataquality.log_model_outputs(
                emb=generate_random_embeddings(n_rows, EMB_DIM),
                probs=generate_random_probabilities(n_rows, num_classes),
                split=split_name,
                epoch=epoch_idx,
                ids=batch["id"],
            )
time_spent = time.time() - t_start
print(f"logging took {time_spent} seconds")

# Time the finish() call separately from the logging loop above.
t0 = time.time()
dataquality.finish()
t1 = time.time()

print(f"finish/upload took {t1-t0} seconds")

Epoch 0
Training


  0%|          | 0/112500 [00:00<?, ?it/s]

Testing


  0%|          | 0/12500 [00:00<?, ?it/s]

Epoch 1
Training


  0%|          | 0/112500 [00:00<?, ?it/s]

Testing


  0%|          | 0/12500 [00:00<?, ?it/s]

logging took 24641.742566108704 seconds
☁️ Uploading Data


training:   0%|          | 0/3 [00:00<?, ?it/s]

export(hdf5) [########################################] 100.00% elapsed time  :   248.42s =  4.1m =  0.1h
export(hdf5) [########################################] 100.00% elapsed time  :     2.66s =  0.0m =  0.0h
export(hdf5) [########################################] 100.00% elapsed time  :    52.47s =  0.9m =  0.0h    
 

training:   0%|          | 0/3 [00:00<?, ?it/s]

export(hdf5) [########################################] 100.00% elapsed time  :   297.14s =  5.0m =  0.1h
export(hdf5) [########################################] 100.00% elapsed time  :     2.73s =  0.0m =  0.0h
export(hdf5) [########################################] 100.00% elapsed time  :    53.99s =  0.9m =  0.0h  
 

test:   0%|          | 0/3 [00:00<?, ?it/s]

export(hdf5) [########################################] 100.00% elapsed time  :    14.19s =  0.2m =  0.0h
export(hdf5) [########################################] 100.00% elapsed time  :     0.49s =  0.0m =  0.0h
export(hdf5) [########################################] 100.00% elapsed time  :    42.64s =  0.7m =  0.0h  
 

test:   0%|          | 0/3 [00:00<?, ?it/s]

export(hdf5) [########################################] 100.00% elapsed time  :    15.48s =  0.3m =  0.0h
export(hdf5) [########################################] 100.00% elapsed time  :     0.52s =  0.0m =  0.0h
export(hdf5) [########################################] 100.00% elapsed time  :    41.98s =  0.7m =  0.0h  
 🧹 Cleaning up
Job default successfully submitted. Results will be available soon at http://host.docker.internal:3000/projects/446b17d6-bca3-4697-8fb1-d6e01ba1ba4e/runs/0bd9aa64-8109-4ae9-900d-2dcba612a746
finish/upload took 2980.9980731010437 seconds


In [10]:
import dataquality
from dataquality.clients import api_client
from dataquality.core._config import config
from dataquality.schemas import ProcName, RequestType, Route
from dataquality.utils.thread_pool import ThreadPoolManager
from dataquality.utils.version import _version_check

# Build the request payload from the active project/run ids and the labels
# held by the current data logger.
data_logger = dataquality.get_data_logger()
body = {
    "project_id": str(config.current_project_id),
    "run_id": str(config.current_run_id),
    "proc_name": ProcName.default.value,
    "labels": data_logger.logger_config.labels,
}

# POST the payload directly to the proc_pool route of the configured API.
# NOTE(review): this looks like a manual re-submission of the job that
# dataquality.finish() already submitted; confirm that is intended.
proc_pool_url = f"{config.api_url}/{Route.proc_pool}"
res = api_client.make_request(RequestType.POST, url=proc_pool_url, body=body)
res

{'project_id': '446b17d6-bca3-4697-8fb1-d6e01ba1ba4e',
 'run_id': '0bd9aa64-8109-4ae9-900d-2dcba612a746',
 'proc_name': 'default',
 'labels': ['positive', 'negative'],
 'tasks': None,
 'message': 'Processing dataquality!',
 'link': 'http://host.docker.internal:3000/projects/446b17d6-bca3-4697-8fb1-d6e01ba1ba4e/runs/0bd9aa64-8109-4ae9-900d-2dcba612a746'}