## End-to-end examples of logging data to Galileo for Text Classification, Multi-Label Text Classification (MLTC), and NER

### For understanding the client and how to get started, see the [Dataquality Demo](./Dataquality-Client-Demo.ipynb)
### Check out the full documentation [here](https://rungalileo.gitbook.io/galileo/getting-started)
### To see real end-to-end notebooks training real ML models, see [here](https://drive.google.com/drive/folders/17-cHuRzXIpWaD8rYwy69RMQr__HiAiDk?usp=sharing)

In [None]:
## Local
# Point the dataquality client at a console running on localhost.
# NOTE(review): these look like local-development placeholder credentials;
# never put real secrets in a notebook.

import os

os.environ.update(
    {
        "GALILEO_CONSOLE_URL": "http://localhost:8088",
        "GALILEO_USERNAME": "user@example.com",
        "GALILEO_PASSWORD": "Th3secret_",
    }
)

In [1]:
import dataquality as dq
# Configure the client and log in. Per the recorded output below, it prompts
# for the console URL unless GALILEO_CONSOLE_URL is set in the environment.
dq.configure()

Welcome to Galileo v0.5.3!
To skip this prompt in the future, set the following environment variable: GALILEO_CONSOLE_URL
Welcome to Galileo v0.5.3!
To skip this prompt in the future, set the following environment variable: GALILEO_CONSOLE_URL
🔭 Enter the url of your Galileo console
console.dev.rungalileo.io
📡 https://console.dev.rungalileo.io
🔭 Logging you into Galileo

👀 Found auth method email set via env, skipping prompt.
📡 https://console.dev.rungalileo.io
🔭 Logging you into Galileo

👀 Found auth method email set via env, skipping prompt.
📧 Enter your email:galileo@rungalileo.io
🤫 Enter your password:········
🚀 You're logged in to Galileo as galileo@rungalileo.io!
🚀 You're logged in to Galileo as galileo@rungalileo.io!


***Helper function***

In [8]:
from dataquality import config
import pandas as pd
from dataquality.clients.api import ApiClient
from time import sleep


# Shared API client; see_results uses it to look up the current project/run
# and to export each split.
api_client = ApiClient()


def see_results(wait=True, body=None):
    """Export every split of the current run to CSV, display it, and return it.

    Args:
        wait: When True, pause before exporting. Against a ``localhost`` API
            this is a fixed 50 second sleep with a progress bar; otherwise it
            delegates to ``api_client.wait_for_run()``.
        body: Currently unused. Kept so existing callers that pass it keep
            working; defaults to ``None`` rather than a shared mutable dict.

    Returns:
        Tuple ``(df_train, df_test, df_val)`` of DataFrames read back from the
        exported ``{task_type}_{split}.csv`` files.
    """
    if wait:
        print("Waiting for data to be processed")
        if "localhost" in config.api_url:
            # Imported here so this helper works on a fresh kernel even when
            # no later cell has imported tqdm yet.
            from tqdm.notebook import tqdm

            for _ in tqdm(range(50)):
                sleep(1)
        else:
            api_client.wait_for_run()

    task_type = dq.config.task_type
    proj = api_client.get_project(config.current_project_id)["name"]
    run = api_client.get_project_run(config.current_project_id, config.current_run_id)["name"]

    # Export order (training, test, validation) matches the returned tuple.
    splits = ("training", "test", "validation")
    csv_paths = {split: f"{task_type}_{split}.csv" for split in splits}
    for split in splits:
        api_client.export_run(proj, run, split, csv_paths[split])
    print(f"Exported to {task_type}_training.csv, {task_type}_test.csv, and {task_type}_validation.csv")

    df_train, df_test, df_val = (pd.read_csv(csv_paths[split]) for split in splits)
    print("Training")
    display(df_train)
    print("\nTest")
    display(df_test)
    print("\nValidation")
    display(df_val)
    return df_train, df_test, df_val

## Text Classification

In [9]:
from tqdm.notebook import tqdm
import time
import numpy as np
from uuid import uuid4
import pandas as pd
from sklearn.datasets import fetch_20newsgroups


# Connect to the "test-tc-run" project in text-classification mode and start a run.
dq.init("text_classification", "test-tc-run")


# Sizes used by the simulated training loop further down.
BATCH_SIZE = 8
EMB_DIM = 768
NUM_EPOCHS = 3


# 20 Newsgroups training subset with headers, footers and quotes removed.
newsgroups = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
label_ind = newsgroups.target_names
# One row per document: raw text, its label name, and a sequential integer id.
dataset = pd.DataFrame(
    {
        "text": newsgroups.data,
        "label": [label_ind[target] for target in newsgroups.target],
        "id": list(range(len(newsgroups.data))),
    }
)


def generate_random_embeddings(batch_size: int, emb_dims: int) -> np.ndarray:
    """Return a ``(batch_size, emb_dims)`` array of uniform samples from [0, 1).

    Used as a stand-in for real model embeddings.
    """
    shape = (batch_size, emb_dims)
    # np.random.rand is a thin wrapper around random_sample, so this draws
    # from the same global RNG stream.
    return np.random.random_sample(size=shape)


def generate_random_probabilities(batch_size: int, num_classes: int) -> np.ndarray:
    """Return a ``(batch_size, num_classes)`` array whose rows each sum to 1.

    Used as a stand-in for real model probabilities.
    """
    raw = np.random.rand(batch_size, num_classes)
    # keepdims leaves the row totals as a column so the division broadcasts per row.
    row_totals = raw.sum(axis=-1, keepdims=True)
    return raw / row_totals


# --- Input logging -----------------------------------------------------------
t_start = time.time()
# Register the distinct label names found in the dataset with the run.
dq.set_labels_for_run(dataset["label"].unique())

print("Logging input data")
# The same DataFrame is logged under all three splits.
for split in ["train", "test", "validation"]:
    dq.log_dataset(dataset, split=split)
    
print("Done")
print(f"Input logging took {time.time() - t_start} seconds\n\n")


# --- Output logging ----------------------------------------------------------
print("Logging model outputs")
t_start = time.time()
num_classes = dataset["label"].nunique()
# Simulates model training loop
for epoch_idx in range(NUM_EPOCHS):
    print(f"Epoch {epoch_idx}")
    print('-'*100)
    for split in ["train", "test", "validation"]:
        print(split.capitalize())
        # Select the active split once; log_model_outputs below is called without a split argument.
        dq.set_split(split)
        for i in tqdm(range(0, len(dataset), BATCH_SIZE)):
            # Row slice; the final batch may hold fewer than BATCH_SIZE rows,
            # so the random arrays are sized with len(batch).
            batch = dataset[i : i + BATCH_SIZE]
            # Random embeddings/probabilities stand in for real model outputs.
            embeddings = generate_random_embeddings(len(batch), EMB_DIM)
            probs = generate_random_probabilities(len(batch), num_classes)
            dq.log_model_outputs(
                embs=embeddings,
                probs=probs,
                epoch=epoch_idx,
                ids=batch["id"],
            )
    print('-'*100,end="\n\n")
            
print("Done")

time_spent = time.time() - t_start
print(f"Logging output took {time_spent} seconds")

# Finish the run, then export and display the per-split results.
dq.finish()
df_train, df_test, df_val = see_results()

📡 Retrieved project, test-tc-run, and starting a new run
🏃‍♂️ Starting run disturbed_plum_sparrow
📡 Retrieved project, test-tc-run, and starting a new run
🏃‍♂️ Starting run disturbed_plum_sparrow
🛰 Connected to project, test-tc-run, and created run, disturbed_plum_sparrow.
🛰 Connected to project, test-tc-run, and created run, disturbed_plum_sparrow.
Logging input data
Logging input data
Exporting input data [########################################] 100.00% elapsed time  :     0.04s =  0.0m =  0.0h
Exporting input data [########################################] 100.00% elapsed time  :     0.04s =  0.0m =  0.0h
Exporting input data [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
Exporting input data [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
Exporting input data [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
Exporting input data [##################

  0%|          | 0/1415 [00:00<?, ?it/s]

  0%|          | 0/1415 [00:00<?, ?it/s]



Test


  0%|          | 0/1415 [00:00<?, ?it/s]

Test


  0%|          | 0/1415 [00:00<?, ?it/s]

Validation
Validation


  0%|          | 0/1415 [00:00<?, ?it/s]

  0%|          | 0/1415 [00:00<?, ?it/s]

----------------------------------------------------------------------------------------------------

Epoch 1
----------------------------------------------------------------------------------------------------
Train
----------------------------------------------------------------------------------------------------

Epoch 1
----------------------------------------------------------------------------------------------------
Train


  0%|          | 0/1415 [00:00<?, ?it/s]

  0%|          | 0/1415 [00:00<?, ?it/s]

Test


  0%|          | 0/1415 [00:00<?, ?it/s]

Test


  0%|          | 0/1415 [00:00<?, ?it/s]

Validation


  0%|          | 0/1415 [00:00<?, ?it/s]

Validation


  0%|          | 0/1415 [00:00<?, ?it/s]

----------------------------------------------------------------------------------------------------

Epoch 2
----------------------------------------------------------------------------------------------------
Train
----------------------------------------------------------------------------------------------------

Epoch 2
----------------------------------------------------------------------------------------------------
Train


  0%|          | 0/1415 [00:00<?, ?it/s]

  0%|          | 0/1415 [00:00<?, ?it/s]

Test


  0%|          | 0/1415 [00:00<?, ?it/s]

Test


  0%|          | 0/1415 [00:00<?, ?it/s]

Validation


  0%|          | 0/1415 [00:00<?, ?it/s]

Validation


  0%|          | 0/1415 [00:00<?, ?it/s]

----------------------------------------------------------------------------------------------------

Done
Logging output took 81.39275002479553 seconds
----------------------------------------------------------------------------------------------------

Done
Logging output took 81.39275002479553 seconds
☁️ Uploading Data
☁️ Uploading Data


training:   0%|          | 0/3 [00:00<?, ?it/s]

training:   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

training (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

training (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

training (epoch=1):   0%|          | 0/3 [00:00<?, ?it/s]

training (epoch=1):   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

training (epoch=2):   0%|          | 0/3 [00:00<?, ?it/s]

training (epoch=2):   0%|          | 0/3 [00:00<?, ?it/s]

validation:   0%|          | 0/3 [00:00<?, ?it/s]

validation:   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

validation (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

validation (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

validation (epoch=1):   0%|          | 0/3 [00:00<?, ?it/s]

validation (epoch=1):   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

validation (epoch=2):   0%|          | 0/3 [00:00<?, ?it/s]

validation (epoch=2):   0%|          | 0/3 [00:00<?, ?it/s]

test:   0%|          | 0/3 [00:00<?, ?it/s]

test:   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

test (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

test (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

test (epoch=1):   0%|          | 0/3 [00:00<?, ?it/s]

test (epoch=1):   0%|          | 0/3 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

Combining batches for upload:   0%|          | 0/1415 [00:00<?, ?it/s]

test (epoch=2):   0%|          | 0/3 [00:00<?, ?it/s]

test (epoch=2):   0%|          | 0/3 [00:00<?, ?it/s]

🧹 Cleaning up
🧹 Cleaning up
Job default successfully submitted. Results will be available soon at https://console.dev.rungalileo.io/insights?projectId=4a8a50d8-9a50-48c0-9c5b-d3f015b775d3&runId=9164f99b-b0dc-47d2-9a8f-59e0b18a6ca2&split=training&depHigh=1&depLow=0&taskType=0
Waiting for job...
Job default successfully submitted. Results will be available soon at https://console.dev.rungalileo.io/insights?projectId=4a8a50d8-9a50-48c0-9c5b-d3f015b775d3&runId=9164f99b-b0dc-47d2-9a8f-59e0b18a6ca2&split=training&depHigh=1&depLow=0&taskType=0
Waiting for job...
Done! Job finished with status completed
Waiting for data to be processed
Waiting for job...
Done! Job finished with status completed
Done! Job finished with status completed
Waiting for data to be processed
Waiting for job...
Done! Job finished with status completed
Exported to text_classification_training.csv, text_classification_test.csv, and text_classification_validation.csv
Exported to text_classification_training.csv, text_clas

Unnamed: 0,epoch,pred,text,split,data_schema_version,id,galileo_text_length,galileo_language_id,galileo_pii,confidence,data_error_potential,gold,likely_mislabeled,x,y
0,2,comp.os.ms-windows.misc,I was wondering if anyone out there could enli...,training,1,0,475,en,,0.104782,0.522242,rec.autos,False,8.798896,6.343746
1,2,sci.space,A fair number of brave souls who upgraded thei...,training,1,1,530,en,,0.086106,0.528448,comp.sys.mac.hardware,True,3.813588,5.010245
2,2,sci.med,"well folks, my mac plus finally gave up the gh...",training,1,2,1659,en,email,0.123137,0.508665,comp.sys.mac.hardware,False,5.970277,7.313612
3,2,sci.med,\nDo you have Weitek's address/phone number? ...,training,1,3,95,en,,0.101609,0.530776,comp.graphics,False,8.134002,2.297700
4,2,comp.windows.x,"From article <C5owCB.n3p@world.std.com>, by to...",training,1,4,448,en,email,0.082827,0.525083,sci.space,False,9.301818,3.057743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,2,comp.graphics,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,training,1,11309,1782,en,email,0.082868,0.524395,sci.med,False,4.356219,6.138403
11310,2,talk.politics.mideast,"I have a (very old) Mac 512k and a Mac Plus, b...",training,1,11310,674,en,email,0.087918,0.514660,comp.sys.mac.hardware,False,4.327601,3.094569
11311,2,comp.sys.mac.hardware,I just installed a DX2-66 CPU in a clone mothe...,training,1,11311,581,en,,0.088605,0.512904,comp.sys.ibm.pc.hardware,False,8.840967,6.353383
11312,2,sci.med,\nWouldn't this require a hyper-sphere. In 3-...,training,1,11312,311,en,,0.114047,0.533124,comp.graphics,True,7.351843,7.585356



Test


Unnamed: 0,epoch,pred,text,split,data_schema_version,id,galileo_text_length,galileo_language_id,galileo_pii,confidence,data_error_potential,gold,likely_mislabeled,x,y
0,2,comp.os.ms-windows.misc,I was wondering if anyone out there could enli...,training,1,0,475,en,,0.104782,0.522242,rec.autos,False,8.798896,6.343746
1,2,sci.space,A fair number of brave souls who upgraded thei...,training,1,1,530,en,,0.086106,0.528448,comp.sys.mac.hardware,True,3.813588,5.010245
2,2,sci.med,"well folks, my mac plus finally gave up the gh...",training,1,2,1659,en,email,0.123137,0.508665,comp.sys.mac.hardware,False,5.970277,7.313612
3,2,sci.med,\nDo you have Weitek's address/phone number? ...,training,1,3,95,en,,0.101609,0.530776,comp.graphics,False,8.134002,2.297700
4,2,comp.windows.x,"From article <C5owCB.n3p@world.std.com>, by to...",training,1,4,448,en,email,0.082827,0.525083,sci.space,False,9.301818,3.057743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,2,comp.graphics,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,training,1,11309,1782,en,email,0.082868,0.524395,sci.med,False,4.356219,6.138403
11310,2,talk.politics.mideast,"I have a (very old) Mac 512k and a Mac Plus, b...",training,1,11310,674,en,email,0.087918,0.514660,comp.sys.mac.hardware,False,4.327601,3.094569
11311,2,comp.sys.mac.hardware,I just installed a DX2-66 CPU in a clone mothe...,training,1,11311,581,en,,0.088605,0.512904,comp.sys.ibm.pc.hardware,False,8.840967,6.353383
11312,2,sci.med,\nWouldn't this require a hyper-sphere. In 3-...,training,1,11312,311,en,,0.114047,0.533124,comp.graphics,True,7.351843,7.585356



Test


Unnamed: 0,epoch,pred,text,split,data_schema_version,id,galileo_text_length,galileo_language_id,galileo_pii,confidence,data_error_potential,gold,likely_mislabeled,x,y
0,2,rec.autos,I was wondering if anyone out there could enli...,test,1,0,475,en,,0.109230,0.509571,rec.autos,False,9.237647,5.236216
1,2,comp.sys.mac.hardware,A fair number of brave souls who upgraded thei...,test,1,1,530,en,,0.091519,0.525536,comp.sys.mac.hardware,False,9.181750,7.500917
2,2,comp.sys.mac.hardware,"well folks, my mac plus finally gave up the gh...",test,1,2,1659,en,email,0.088116,0.511979,comp.sys.mac.hardware,False,8.194591,3.988687
3,2,alt.atheism,\nDo you have Weitek's address/phone number? ...,test,1,3,95,en,,0.097158,0.541950,comp.graphics,True,8.945316,4.731820
4,2,comp.sys.ibm.pc.hardware,"From article <C5owCB.n3p@world.std.com>, by to...",test,1,4,448,en,email,0.103676,0.524611,sci.space,True,9.180300,5.513928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,2,sci.med,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,test,1,11309,1782,en,email,0.101422,0.513205,sci.med,False,6.618761,3.157532
11310,2,sci.space,"I have a (very old) Mac 512k and a Mac Plus, b...",test,1,11310,674,en,email,0.092883,0.531988,comp.sys.mac.hardware,True,4.914928,9.028614
11311,2,soc.religion.christian,I just installed a DX2-66 CPU in a clone mothe...,test,1,11311,581,en,,0.098544,0.519418,comp.sys.ibm.pc.hardware,False,4.657054,8.569967
11312,2,soc.religion.christian,\nWouldn't this require a hyper-sphere. In 3-...,test,1,11312,311,en,,0.104302,0.520857,comp.graphics,False,5.009503,9.082722


Unnamed: 0,epoch,pred,text,split,data_schema_version,id,galileo_text_length,galileo_language_id,galileo_pii,confidence,data_error_potential,gold,likely_mislabeled,x,y
0,2,rec.autos,I was wondering if anyone out there could enli...,test,1,0,475,en,,0.109230,0.509571,rec.autos,False,9.237647,5.236216
1,2,comp.sys.mac.hardware,A fair number of brave souls who upgraded thei...,test,1,1,530,en,,0.091519,0.525536,comp.sys.mac.hardware,False,9.181750,7.500917
2,2,comp.sys.mac.hardware,"well folks, my mac plus finally gave up the gh...",test,1,2,1659,en,email,0.088116,0.511979,comp.sys.mac.hardware,False,8.194591,3.988687
3,2,alt.atheism,\nDo you have Weitek's address/phone number? ...,test,1,3,95,en,,0.097158,0.541950,comp.graphics,True,8.945316,4.731820
4,2,comp.sys.ibm.pc.hardware,"From article <C5owCB.n3p@world.std.com>, by to...",test,1,4,448,en,email,0.103676,0.524611,sci.space,True,9.180300,5.513928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,2,sci.med,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,test,1,11309,1782,en,email,0.101422,0.513205,sci.med,False,6.618761,3.157532
11310,2,sci.space,"I have a (very old) Mac 512k and a Mac Plus, b...",test,1,11310,674,en,email,0.092883,0.531988,comp.sys.mac.hardware,True,4.914928,9.028614
11311,2,soc.religion.christian,I just installed a DX2-66 CPU in a clone mothe...,test,1,11311,581,en,,0.098544,0.519418,comp.sys.ibm.pc.hardware,False,4.657054,8.569967
11312,2,soc.religion.christian,\nWouldn't this require a hyper-sphere. In 3-...,test,1,11312,311,en,,0.104302,0.520857,comp.graphics,False,5.009503,9.082722



Validation

Validation


Unnamed: 0,epoch,pred,text,split,data_schema_version,id,galileo_text_length,galileo_language_id,galileo_pii,confidence,data_error_potential,gold,likely_mislabeled,x,y
0,2,talk.politics.misc,I was wondering if anyone out there could enli...,validation,1,0,475,en,,0.099878,0.525356,rec.autos,False,2.875576,4.870388
1,2,comp.os.ms-windows.misc,A fair number of brave souls who upgraded thei...,validation,1,1,530,en,,0.117667,0.525906,comp.sys.mac.hardware,True,2.915144,6.841124
2,2,talk.politics.guns,"well folks, my mac plus finally gave up the gh...",validation,1,2,1659,en,email,0.099666,0.522602,comp.sys.mac.hardware,True,4.725327,8.048245
3,2,talk.politics.mideast,\nDo you have Weitek's address/phone number? ...,validation,1,3,95,en,,0.094756,0.513553,comp.graphics,False,2.474094,3.836137
4,2,comp.sys.ibm.pc.hardware,"From article <C5owCB.n3p@world.std.com>, by to...",validation,1,4,448,en,email,0.107665,0.518205,sci.space,True,3.399931,2.522562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,2,talk.religion.misc,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,validation,1,11309,1782,en,email,0.098501,0.529139,sci.med,False,6.434455,2.245063
11310,2,talk.religion.misc,"I have a (very old) Mac 512k and a Mac Plus, b...",validation,1,11310,674,en,email,0.106229,0.525302,comp.sys.mac.hardware,True,3.582815,7.394220
11311,2,comp.windows.x,I just installed a DX2-66 CPU in a clone mothe...,validation,1,11311,581,en,,0.123708,0.528538,comp.sys.ibm.pc.hardware,True,3.061371,2.719053
11312,2,talk.politics.misc,\nWouldn't this require a hyper-sphere. In 3-...,validation,1,11312,311,en,,0.098269,0.514426,comp.graphics,False,5.924398,7.675741


Unnamed: 0,epoch,pred,text,split,data_schema_version,id,galileo_text_length,galileo_language_id,galileo_pii,confidence,data_error_potential,gold,likely_mislabeled,x,y
0,2,talk.politics.misc,I was wondering if anyone out there could enli...,validation,1,0,475,en,,0.099878,0.525356,rec.autos,False,2.875576,4.870388
1,2,comp.os.ms-windows.misc,A fair number of brave souls who upgraded thei...,validation,1,1,530,en,,0.117667,0.525906,comp.sys.mac.hardware,True,2.915144,6.841124
2,2,talk.politics.guns,"well folks, my mac plus finally gave up the gh...",validation,1,2,1659,en,email,0.099666,0.522602,comp.sys.mac.hardware,True,4.725327,8.048245
3,2,talk.politics.mideast,\nDo you have Weitek's address/phone number? ...,validation,1,3,95,en,,0.094756,0.513553,comp.graphics,False,2.474094,3.836137
4,2,comp.sys.ibm.pc.hardware,"From article <C5owCB.n3p@world.std.com>, by to...",validation,1,4,448,en,email,0.107665,0.518205,sci.space,True,3.399931,2.522562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,2,talk.religion.misc,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,validation,1,11309,1782,en,email,0.098501,0.529139,sci.med,False,6.434455,2.245063
11310,2,talk.religion.misc,"I have a (very old) Mac 512k and a Mac Plus, b...",validation,1,11310,674,en,email,0.106229,0.525302,comp.sys.mac.hardware,True,3.582815,7.394220
11311,2,comp.windows.x,I just installed a DX2-66 CPU in a clone mothe...,validation,1,11311,581,en,,0.123708,0.528538,comp.sys.ibm.pc.hardware,True,3.061371,2.719053
11312,2,talk.politics.misc,\nWouldn't this require a hyper-sphere. In 3-...,validation,1,11312,311,en,,0.098269,0.514426,comp.graphics,False,5.924398,7.675741


In [4]:
import dataquality as dq

print("disabling galileo")
# NOTE(review): presumably turns subsequent dq calls into no-ops — confirm
# against the dataquality documentation.
dq.disable_galileo()
# Called right after disabling; the recorded output below shows no login prompt.
dq.login()

disabling galileo
disabling galileo


## Multi-Label Text Classification (MLTC)

In [None]:
from typing import *
from random import choice
import numpy as np


dq.init("text_multi_label", "test-mltc-run")
# Each task gets its own binary label pair: ["not <label>", "<label>"].
dq.set_labels_for_run([["not "+_label, _label] for _label in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult','identity_hate']])
dq.set_tasks_for_run(['task_0', 'task_1', 'task_2', 'task_3', 'task_4', 'task_5'])

n = 5000
# Stride AND slice width of the output-logging loop below. The two must be
# equal, otherwise some sample ids never receive model outputs.
MLTC_BATCH_SIZE = 32

texts: List[str] = [f"text sample {i}" for i in range(n)]

# For every sample, one randomly chosen label per task.
labels: List[List[str]] = [
    [choice(i) for i in dq.get_data_logger().logger_config.labels]
    for _ in range(n)
]

ids = list(range(n))


# The same synthetic samples are logged under all three splits.
for split in ["training", "test", "validation"]:
    dq.log_data_samples(texts=texts, task_labels=labels, ids=ids, split=split)

# Split names match the ones used for the input samples above.
for split in ["training", "test", "validation"]:
    for epoch in range(5):
        emb=np.random.rand(n, 768)
        # List replication: the same random logit pair is reused for every
        # task and every sample within an epoch.
        logits=[[np.random.rand(2)] * 6] * n

        for i in range(0, n, MLTC_BATCH_SIZE):
            batch_slice = slice(i, i + MLTC_BATCH_SIZE)
            dq.log_model_outputs(
                embs=emb[batch_slice],
                logits=logits[batch_slice],
                ids=ids[batch_slice],
                split=split,
                epoch=epoch
            )

# Finish the run, then export and display the per-split results.
dq.finish()
df_train, df_test, df_val = see_results()


## NER

In [None]:
from dataquality.schemas.task_type import TaskType
from dataquality import config 
from uuid import uuid4
import numpy as np
from time import sleep
from tqdm.notebook import tqdm


# Initialize the NER example: task type "text_ner", project "test-ner-run".
dq.init("text_ner", "test-ner-run")


def log_inputs():
    """Register the NER labels and tagging schema, then log five sample sentences.

    The same sentences, token offsets and gold spans are logged for the
    training, validation and test splits (in that order).
    """
    sentences = [
        'what movies star bruce willis',
        'show me films with drew barrymore from the 1980s',
        'what movies starred both al pacino and robert deniro',
        'find me all of the movies that starred harold ramis and bill murray',
        'find me a movie with a quote about baseball in it',
    ]
    # One list of (start, end) offset pairs per sentence; some pairs repeat.
    token_offsets = [
        [(0, 4), (5, 11), (12, 16), (17, 22), (17, 22), (23, 29), (23, 29)],
        [(0, 4), (5, 7), (8, 13), (14, 18), (19, 23), (24, 33), (24, 33), (24, 33), (34, 38), (39, 42), (43, 48)],
        [(0, 4), (5, 11), (12, 19), (20, 24), (25, 27), (28, 34), (28, 34), (28, 34), (35, 38), (39, 45), (39, 45), (46, 52), (46, 52)],
        [(0, 4), (5, 7), (8, 11), (12, 14), (15, 18), (19, 25), (26, 30), (31, 38), (39, 45), (39, 45), (39, 45), (46, 51), (46, 51), (52, 55), (56, 60), (61, 67), (61, 67), (61, 67)],
        [(0, 4), (5, 7), (8, 9), (10, 15), (16, 20), (21, 22), (23, 28), (29, 34), (35, 43), (44, 46), (47, 49)],
    ]
    # Gold entity spans per sentence; the last sentence has none.
    spans = [
        [{'start': 17, 'end': 29, 'label': 'ACTOR'}],
        [{'start': 19, 'end': 33, 'label': 'ACTOR'}, {'start': 43, 'end': 48, 'label': 'YEAR'}],
        [{'start': 25, 'end': 34, 'label': 'ACTOR'}, {'start': 39, 'end': 52, 'label': 'ACTOR'}],
        [{'start': 39, 'end': 51, 'label': 'ACTOR'}, {'start': 56, 'end': 67, 'label': 'ACTOR'}],
        [],
    ]
    sample_ids = list(range(len(sentences)))

    tag_names = [
        '[PAD]', '[CLS]', '[SEP]', 'O',
        'B-ACTOR', 'I-ACTOR', 'B-YEAR', 'B-TITLE', 'B-GENRE', 'I-GENRE',
        'B-DIRECTOR', 'I-DIRECTOR', 'B-SONG', 'I-SONG', 'B-PLOT', 'I-PLOT',
        'B-REVIEW', 'B-CHARACTER', 'I-CHARACTER', 'B-RATING',
        'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-TITLE', 'I-RATING',
        'B-TRAILER', 'I-TRAILER', 'I-REVIEW', 'I-YEAR',
    ]
    dq.set_labels_for_run(tag_names)
    dq.set_tagging_schema("BIO")
    for split_name in ("training", "validation", "test"):
        dq.log_data_samples(
            texts=sentences,
            text_token_indices=token_offsets,
            ids=sample_ids,
            gold_spans=spans,
            split=split_name,
        )

def log_outputs():
    """Log random embeddings and logits for every split over six epochs.

    The same five random arrays are reused for every epoch and split; they
    stand in for real model outputs.
    """
    num_samples = 5  # one output array per sample id 0..4
    seq_len = 119  # first array dimension; presumably a padded token count — TODO confirm
    emb_dim = 768
    num_classes = 28  # equals the number of labels registered in log_inputs
    embs = [np.random.rand(seq_len, emb_dim) for _ in range(num_samples)]
    logits = [np.random.rand(seq_len, num_classes) for _ in range(num_samples)]
    ids = list(range(num_samples))
    for epoch in tqdm(range(6)):
        for split in ["training", "test", "validation"]:
            dq.log_model_outputs(
                embs=embs, logits=logits, ids=ids, split=split, epoch=epoch
            )
    
def finish():
    """Finish the current run by delegating to ``dq.finish()``."""
    dq.finish()
    
    
def runit():
    """Run the NER example end to end: log inputs, log outputs, then finish the run."""
    log_inputs()
    log_outputs()
    finish()
    
# Execute the NER example, then export and display the per-split results.
runit()
df_train, df_test, df_val = see_results()