In [1]:
import os
# Set the console URL before dataquality is imported in the next cell.
# NOTE(review): presumably dataquality reads GALILEO_CONSOLE_URL at import/config time — confirm.
os.environ['GALILEO_CONSOLE_URL']="http://localhost:8088"

In [2]:
# If you have cloned the dataquality repo and are running this from the docs folder, you can run this
#!pip install -q ../../../../dataquality
import dataquality

In [3]:
import requests

# Credentials of the admin account that is created here and then used to log in.
# NOTE(review): hard-coded throwaway values; only appropriate for a local test console.
admin_email = "me@rungalileo.io"
pwd = "MyPassword!123"

# Payload for the admin-creation endpoint. The e-mail comes from a single
# variable so the account created here and GALILEO_USERNAME below cannot drift apart.
data={
  "email": admin_email,
  "first_name": "Me",
  "last_name": "Me",
  "username": "Galileo",
  "auth_method": "email",
  "password": pwd
}

r = requests.post(f'{dataquality.config.api_url}/users/admin', json=data)
if not r.ok:
    # Report instead of raising so the rest of the notebook can still run;
    # the failure is no longer silently dropped.
    print(f"Admin user request returned HTTP {r.status_code}")

import os
# Export the same credentials for the dataquality client.
os.environ["GALILEO_USERNAME"]=admin_email
os.environ["GALILEO_PASSWORD"]=pwd

In [4]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd

def create_dataset():
    """Load the 20 newsgroups training subset into a two-column DataFrame.

    The posts are fetched with headers, footers and quotes removed.

    Returns:
        tuple: ``(dataset, label_ind)`` where ``dataset`` has a "text" column
        holding the raw posts and a "label" column holding the target name of
        each post, and ``label_ind`` is the list of target names indexed by
        the integer target.
    """
    corpus = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
    label_ind = corpus.target_names
    # Build both columns in one constructor call; integer targets are mapped
    # to their names through label_ind.
    dataset = pd.DataFrame(
        {
            "text": corpus.data,
            "label": [label_ind[target] for target in corpus.target],
        }
    )
    return dataset, label_ind

def fetch_dataset(dataset, split, inference_name = None):
    """Return the fixed 100-row window of ``dataset`` assigned to a split.

    Args:
        dataset: Any object that supports slice indexing.
        split: "training", "test" or "inference".
        inference_name: "cool" or "awesome"; only consulted when ``split`` is
            "inference".

    Returns:
        ``dataset[:100]`` for training, ``dataset[100:200]`` for test,
        ``dataset[200:300]`` for inference "cool" and ``dataset[300:400]`` for
        inference "awesome".

    Raises:
        ValueError: If the split, or the inference name of an inference split,
            is not one of the values listed above.
    """
    if split == "inference":
        lookup_key = inference_name
        windows = (
            ("cool", slice(200, 300)),
            ("awesome", slice(300, 400)),
        )
    else:
        lookup_key = split
        windows = (
            ("training", slice(None, 100)),
            ("test", slice(100, 200)),
        )

    # Linear scan with == so the comparison matches a plain if-chain.
    for known_name, window in windows:
        if lookup_key == known_name:
            return dataset[window]

    raise ValueError("Uh oh something happened")

# Generate fake model outputs
def log_fake_data(dataset_len, log_num: int = 0):
    """Log one chunk of random embeddings and probabilities for test and training.

    The chunk holds ``dataset_len // (log_num + 1)`` rows, and its ids start at
    ``chunk_size * log_num``. The same random arrays are logged for both
    splits, always at epoch 0.

    Args:
        dataset_len: Number of rows in the dataset the outputs stand in for.
        log_num: Zero-based chunk number; it sets both the chunk size and the
            id offset.
    """
    chunk_size = dataset_len // (log_num + 1)
    first_id = chunk_size * log_num

    fake_emb = np.random.rand(chunk_size, 800)
    fake_probs = np.random.rand(chunk_size, 20)
    id_window = range(first_id, first_id + chunk_size)

    for split_name in ('test', 'training'):
        # A fresh id list is built for every call.
        dataquality.log_model_outputs(
            emb=fake_emb,
            probs=fake_probs,
            split=split_name,
            epoch=0,
            ids=list(id_window),
        )

In [5]:
# Display the current model logger's config; in this notebook's cell order this
# runs before the first dataquality.init call.
dataquality.get_model_logger().logger_config

TextClassificationLoggerConfig(labels=None, tasks=None, observed_num_labels=0, tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False)

## First we run a train / test run

In [6]:
# Connect to project "gonzaga" / run "duke" as a text classification task.
dataquality.init(task_type="text_classification", project_name="gonzaga", run_name="duke")
base_dataset, labels = create_dataset()

# Slice out the two labelled splits, then log text and labels for each in turn.
train_dataset = fetch_dataset(base_dataset, "training")
test_dataset = fetch_dataset(base_dataset, "test")
for split_name, split_frame in (("training", train_dataset), ("test", test_dataset)):
    dataquality.log_input_data(text=split_frame['text'], labels=split_frame['label'], split=split_name)

# Fake model outputs for both splits.
# NOTE(review): with log_num=1, log_fake_data emits only half of dataset_len
# rows (ids 50-99 for 100 rows) — confirm that partial coverage is intended.
log_fake_data(len(train_dataset), 1)
dataquality.set_labels_for_run(labels)
dataquality.finish()

📡 Retrieving run from existing project, gonzaga
🛰 Connected to project, gonzaga, and run, duke.




Exporting input data [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
Appending input data [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
 



☁️ Uploading Data
Combining batches for upload


  0%|          | 0/1 [00:00<?, ?it/s]

training:   0%|          | 0/3 [00:00<?, ?it/s]

Writing data for upload [########################################] 100.00% elapsed time  :     0.05s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.11s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.19s =  0.0m =  0.0h
 Combining batches for upload


  0%|          | 0/1 [00:00<?, ?it/s]

test:   0%|          | 0/3 [00:00<?, ?it/s]

Writing data for upload [########################################] 100.00% elapsed time  :     0.05s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.10s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.19s =  0.0m =  0.0h
 🧹 Cleaning up
Job default successfully submitted. Results will be available soon at http://127.0.0.1:3000/insights?projectId=18bca69e-ba3e-4b3f-b504-820124538a35&runId=39fa57ff-9c14-4af7-bb35-21efed1cb1a3&split=training&taskType=0&activeDepHigh=1&activeDepLow=0


{'project_id': '18bca69e-ba3e-4b3f-b504-820124538a35',
 'run_id': '39fa57ff-9c14-4af7-bb35-21efed1cb1a3',
 'job_name': 'default',
 'labels': ['alt.atheism',
  'comp.graphics',
  'comp.os.ms-windows.misc',
  'comp.sys.ibm.pc.hardware',
  'comp.sys.mac.hardware',
  'comp.windows.x',
  'misc.forsale',
  'rec.autos',
  'rec.motorcycles',
  'rec.sport.baseball',
  'rec.sport.hockey',
  'sci.crypt',
  'sci.electronics',
  'sci.med',
  'sci.space',
  'soc.religion.christian',
  'talk.politics.guns',
  'talk.politics.mideast',
  'talk.politics.misc',
  'talk.religion.misc'],
 'tasks': None,
 'message': 'Processing dataquality!',
 'link': 'http://127.0.0.1:3000/insights?projectId=18bca69e-ba3e-4b3f-b504-820124538a35&runId=39fa57ff-9c14-4af7-bb35-21efed1cb1a3&split=training&taskType=0&activeDepHigh=1&activeDepLow=0'}

## Now we run an inference run and see that it is appended

In [8]:
# Re-initialise against the same project ("gonzaga") and run ("duke") that the
# train/test cell above logged to.
dataquality.init(task_type="text_classification", project_name="gonzaga", run_name="duke")

📡 Retrieving run from existing project, gonzaga
🛰 Connected to project, gonzaga, and run, duke.




In [9]:
# Split name used for every inference logging call in the cells below.
split = "inference"
# Inference names; each one has a matching branch in fetch_dataset.
INFERENCE_SPLITS = ["cool", "awesome"]

In [10]:
# Rebuild the full dataframe and the list of label names.
base_dataset, labels = create_dataset()

In [11]:
# Preview the first rows together with the total row count.
base_dataset.head(), len(base_dataset)

(                                                text                  label
 0  I was wondering if anyone out there could enli...              rec.autos
 1  A fair number of brave souls who upgraded thei...  comp.sys.mac.hardware
 2  well folks, my mac plus finally gave up the gh...  comp.sys.mac.hardware
 3  \nDo you have Weitek's address/phone number?  ...          comp.graphics
 4  From article <C5owCB.n3p@world.std.com>, by to...              sci.space,
 11314)

In [12]:
# One dataframe slice per named inference split, also indexed by name for the
# logging loops further down.
cool_dataset = fetch_dataset(base_dataset, split, inference_name="cool")
awesome_dataset = fetch_dataset(base_dataset, split, inference_name="awesome")
datasets = dict(cool=cool_dataset, awesome=awesome_dataset)

In [14]:
# Preview the "awesome" inference slice (taken from rows 300:400 of the base dataframe).
awesome_dataset.head()

Unnamed: 0,text,label
300,\nI was at a Cincinnati Cyclones game a year a...,rec.sport.hockey
301,,sci.crypt
302,"Is it possible to do a ""wheelie"" on a motorcyc...",rec.motorcycles
303,"Hello src readers,\n\nAgain the misconception ...",soc.religion.christian
304,\nThere are ALWAYS scalpers with tickets outsi...,rec.sport.hockey


In [15]:
# Log the text of every named inference split.
# Each split gets its own id block starting at i * 100 (fetch_dataset hands out
# 100-row windows). The block is sized from the rows actually being logged so
# the number of ids always equals the number of texts.
for i, inference_name in enumerate(INFERENCE_SPLITS):
    inference_text = datasets[inference_name]["text"]
    first_id = i * 100
    ids = list(range(first_id, first_id + len(inference_text)))
    # Inference doesn't expect labels, but does need an inference name
    dataquality.log_input_data(
        text=inference_text,
        split=split,
        inference_name=inference_name,
        ids=ids
    )

Exporting input data [########################################] 100.00% elapsed time  :     0.00s =  0.0m =  0.0h
Appending input data [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
 

In [16]:
import numpy as np

def get_model_outputs(data, split, inference_name, i):
    """Fabricate random model outputs for one inference split.

    Args:
        data: Sized collection of input rows; only ``len(data)`` is used.
        split: Unused; kept so existing call sites keep working.
        inference_name: Unused; kept so existing call sites keep working.
        i: Zero-based position of the inference split. Ids start at ``i * 100``.

    Returns:
        tuple: ``(emb, logits, ids)`` where ``emb`` is a random array of shape
        ``(len(data), 768)``, ``logits`` is a random array of shape
        ``(len(data), 20)`` and ``ids`` is a list of ``len(data)`` consecutive
        integers beginning at ``i * 100``.
    """
    num_rows = len(data)
    logits = np.random.rand(num_rows, 20) # fake logits
    emb = np.random.rand(num_rows, 768) # fake embeddings
    # Size the id list from num_rows (not a fixed 100) so ids always pair
    # one-to-one with the rows of emb/logits.
    first_id = i * 100
    ids = list(range(first_id, first_id + num_rows))

    return (emb, logits, ids)

In [19]:
# Log fake model outputs for every named inference split.
for i, inference_name in enumerate(INFERENCE_SPLITS):
    # Set split takes in an optional inference name
    dataquality.set_split(split, inference_name=inference_name)

    # Size the fake outputs from the dataset that belongs to this inference
    # split, so each split's outputs match its own logged inputs.
    emb, logits, ids = get_model_outputs(datasets[inference_name], split, inference_name, i)
    dataquality.log_model_outputs(emb=emb, logits=logits, ids=ids, split=split)

In [20]:
# List the on-disk log directory for the current project and run (needs the `tree` command on PATH).
!tree .galileo/logs/{dataquality.config.current_project_id}/{dataquality.config.current_run_id}

[01;34m.galileo/logs/18bca69e-ba3e-4b3f-b504-820124538a35/39fa57ff-9c14-4af7-bb35-21efed1cb1a3[0m
├── [01;34minference[0m
│   ├── [01;34mawesome[0m
│   │   └── [00m03a4d3c6231e.hdf5[0m
│   └── [01;34mcool[0m
│       └── [00m1fcd01fb7326.hdf5[0m
└── [00minput_data.arrow[0m

3 directories, 3 files


In [21]:
# Register the label names for the run, then finish, which uploads the logged
# data and submits the processing job.
# NOTE(review): this comment used to say the job is named "inference", but the
# recorded output below reports job_name 'default' — confirm which is expected.
dataquality.set_labels_for_run(labels)
dataquality.finish()


☁️ Uploading Data
Combining batches for upload


  0%|          | 0/1 [00:00<?, ?it/s]

inference:   0%|          | 0/3 [00:00<?, ?it/s]

Writing data for upload [########################################] 100.00% elapsed time  :     0.05s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.05s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.26s =  0.0m =  0.0h
 Combining batches for upload


  0%|          | 0/1 [00:00<?, ?it/s]

inference:   0%|          | 0/3 [00:00<?, ?it/s]

Writing data for upload [########################################] 100.00% elapsed time  :     0.05s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.06s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.26s =  0.0m =  0.0h
 🧹 Cleaning up
Job default successfully submitted. Results will be available soon at http://127.0.0.1:3000/insights?projectId=18bca69e-ba3e-4b3f-b504-820124538a35&runId=39fa57ff-9c14-4af7-bb35-21efed1cb1a3&split=training&taskType=0&activeDepHigh=1&activeDepLow=0


{'project_id': '18bca69e-ba3e-4b3f-b504-820124538a35',
 'run_id': '39fa57ff-9c14-4af7-bb35-21efed1cb1a3',
 'job_name': 'default',
 'labels': ['alt.atheism',
  'comp.graphics',
  'comp.os.ms-windows.misc',
  'comp.sys.ibm.pc.hardware',
  'comp.sys.mac.hardware',
  'comp.windows.x',
  'misc.forsale',
  'rec.autos',
  'rec.motorcycles',
  'rec.sport.baseball',
  'rec.sport.hockey',
  'sci.crypt',
  'sci.electronics',
  'sci.med',
  'sci.space',
  'soc.religion.christian',
  'talk.politics.guns',
  'talk.politics.mideast',
  'talk.politics.misc',
  'talk.religion.misc'],
 'tasks': None,
 'message': 'Processing dataquality!',
 'link': 'http://127.0.0.1:3000/insights?projectId=18bca69e-ba3e-4b3f-b504-820124538a35&runId=39fa57ff-9c14-4af7-bb35-21efed1cb1a3&split=training&taskType=0&activeDepHigh=1&activeDepLow=0'}

## Now we log another training run to test that inference data is wiped

In [22]:
# Connect again to project "gonzaga" / run "duke" and repeat the train/test logging.
dataquality.init(task_type="text_classification", project_name="gonzaga", run_name="duke")
base_dataset, labels = create_dataset()

# Slice out the two labelled splits, then log text and labels for each in turn.
train_dataset = fetch_dataset(base_dataset, "training")
test_dataset = fetch_dataset(base_dataset, "test")
for split_name, split_frame in (("training", train_dataset), ("test", test_dataset)):
    dataquality.log_input_data(text=split_frame['text'], labels=split_frame['label'], split=split_name)

# Fake model outputs for both splits.
# NOTE(review): with log_num=1, log_fake_data emits only half of dataset_len
# rows (ids 50-99 for 100 rows) — confirm that partial coverage is intended.
log_fake_data(len(train_dataset), 1)
dataquality.set_labels_for_run(labels)
dataquality.finish()

📡 Retrieving run from existing project, gonzaga
🛰 Connected to project, gonzaga, and run, duke.




Exporting input data [########################################] 100.00% elapsed time  :     0.00s =  0.0m =  0.0h
Appending input data [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
 



☁️ Uploading Data
Combining batches for upload


  0%|          | 0/1 [00:00<?, ?it/s]

training:   0%|          | 0/3 [00:00<?, ?it/s]

Writing data for upload [########################################] 100.00% elapsed time  :     0.04s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.10s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.24s =  0.0m =  0.0h
 Combining batches for upload


  0%|          | 0/1 [00:00<?, ?it/s]

test:   0%|          | 0/3 [00:00<?, ?it/s]

Writing data for upload [########################################] 100.00% elapsed time  :     0.05s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.12s =  0.0m =  0.0h
Writing data for upload [########################################] 100.00% elapsed time  :     0.20s =  0.0m =  0.0h
 🧹 Cleaning up
Job default successfully submitted. Results will be available soon at http://127.0.0.1:3000/insights?projectId=18bca69e-ba3e-4b3f-b504-820124538a35&runId=39fa57ff-9c14-4af7-bb35-21efed1cb1a3&split=training&taskType=0&activeDepHigh=1&activeDepLow=0


{'project_id': '18bca69e-ba3e-4b3f-b504-820124538a35',
 'run_id': '39fa57ff-9c14-4af7-bb35-21efed1cb1a3',
 'job_name': 'default',
 'labels': ['alt.atheism',
  'comp.graphics',
  'comp.os.ms-windows.misc',
  'comp.sys.ibm.pc.hardware',
  'comp.sys.mac.hardware',
  'comp.windows.x',
  'misc.forsale',
  'rec.autos',
  'rec.motorcycles',
  'rec.sport.baseball',
  'rec.sport.hockey',
  'sci.crypt',
  'sci.electronics',
  'sci.med',
  'sci.space',
  'soc.religion.christian',
  'talk.politics.guns',
  'talk.politics.mideast',
  'talk.politics.misc',
  'talk.religion.misc'],
 'tasks': None,
 'message': 'Processing dataquality!',
 'link': 'http://127.0.0.1:3000/insights?projectId=18bca69e-ba3e-4b3f-b504-820124538a35&runId=39fa57ff-9c14-4af7-bb35-21efed1cb1a3&split=training&taskType=0&activeDepHigh=1&activeDepLow=0'}