## End-to-end examples of logging data to Galileo for Text Classification, Multi-Label Text Classification (MLTC), and NER

### For understanding the client and how to get started, see the [Dataquality Demo](./Dataquality-Client-Demo.ipynb)
### Check out the full documentation [here](https://rungalileo.gitbook.io/galileo/getting-started)
### To see real end-to-end notebooks training real ML models, see [here](https://drive.google.com/drive/folders/17-cHuRzXIpWaD8rYwy69RMQr__HiAiDk?usp=sharing)

In [3]:
## Local

import os
from getpass import getpass

# Connection settings for a locally running Galileo console. `setdefault`
# keeps any value already exported in the environment, so the notebook can be
# pointed at another console or account without editing this cell.
os.environ.setdefault("GALILEO_CONSOLE_URL", "http://localhost:8088")
os.environ.setdefault("GALILEO_USERNAME", "user@example.com")

# Never store a password in the notebook itself: take it from the environment
# and only prompt (without echoing) when it has not been provided.
if not os.environ.get("GALILEO_PASSWORD"):
    os.environ["GALILEO_PASSWORD"] = getpass("Galileo password: ")

In [4]:
import dataquality as dq
# Log in to the Galileo console. The saved output of this cell shows the
# console URL and username that match the GALILEO_* environment variables set
# earlier, so configure() presumably reads them — confirm against the
# dataquality documentation.
dq.configure()

📡 http://localhost:8088
🔭 Logging you into Galileo

🚀 You're logged in to Galileo as user@example.com!


In [6]:
from dataquality import Condition, AggregateFunction, Operator

import os

# Start (or reconnect to) the text classification project and run.
dq.init("text_classification", "test-tc-proj", "test-tc-run")

# Condition: average of the `confidence` metric is less than 0.99.
conf_cond = Condition(
    agg=AggregateFunction.avg,
    metric="confidence",
    operator=Operator.lt,
    threshold=0.99,
)
# Condition: maximum of the `data_error_potential` metric is greater than 0.05.
dep_cond = Condition(
    agg=AggregateFunction.max,
    metric="data_error_potential",
    operator=Operator.gt,
    threshold=0.05,
)

# Send the report to the account this notebook is logged in with instead of a
# hard-coded personal address, so running the example never e-mails a third
# party. Assumes GALILEO_USERNAME holds an e-mail address, as it does in the
# configuration cell of this notebook.
report_emails = [os.environ["GALILEO_USERNAME"]]
dq.register_run_report(conditions=[conf_cond, dep_cond], emails=report_emails)

📡 Retrieving run from existing project, test-tc-proj
🛰 Connected to project, test-tc-proj, and run, test-tc-run.




## Text Classification

In [7]:
from tqdm.notebook import tqdm
import time
import numpy as np
from uuid import uuid4
import pandas as pd
from sklearn.datasets import fetch_20newsgroups


# Settings for the simulated training run below.
BATCH_SIZE = 16
EMB_DIM = 768
NUM_EPOCHS = 1


# 20 Newsgroups training subset with headers, footers and quotes removed.
newsgroups = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))
label_ind = newsgroups.target_names
dataset = pd.DataFrame(
    {
        "text": newsgroups.data,
        # Map each integer target to its label name.
        "label": [label_ind[target] for target in newsgroups.target],
    }
)
dataset["id"] = list(range(len(dataset)))

# Keep only the first 200 rows.
dataset = dataset.iloc[:200]


def generate_random_embeddings(batch_size: int, emb_dims: int) -> np.ndarray:
    """Return a ``(batch_size, emb_dims)`` array of uniform samples from [0, 1).

    Values are drawn from NumPy's global random state.
    """
    shape = (batch_size, emb_dims)
    return np.random.random_sample(shape)


def generate_random_probabilities(batch_size: int, num_classes: int) -> np.ndarray:
    """Return a ``(batch_size, num_classes)`` array whose rows each sum to 1.

    Uniform random scores are drawn from NumPy's global random state and each
    row is divided by its own total.
    """
    raw_scores = np.random.rand(batch_size, num_classes)
    row_totals = raw_scores.sum(axis=-1, keepdims=True)
    return raw_scores / row_totals


# ---- Log the input data for every split ----
tc_splits = ("training", "test")

t_start = time.time()
dq.set_labels_for_run(dataset["label"].unique())

print("Logging input data")
for split in tc_splits:
    dq.log_dataset(dataset, split=split)

print("Done")
print(f"Input logging took {time.time() - t_start} seconds\n\n")


# ---- Log (random) model outputs, batch by batch ----
print("Logging model outputs")
t_start = time.time()
num_classes = dataset["label"].nunique()
separator = "-" * 100
batch_starts = range(0, len(dataset), BATCH_SIZE)

# Stand-in for a real training loop: embeddings and probabilities are random.
for epoch_idx in range(NUM_EPOCHS):
    print(f"Epoch {epoch_idx}")
    print(separator)
    for split in tc_splits:
        print(split.capitalize())
        dq.set_split(split)
        for start in tqdm(batch_starts):
            batch = dataset[start : start + BATCH_SIZE]
            n_rows = len(batch)  # the final batch may be shorter than BATCH_SIZE
            embeddings = generate_random_embeddings(n_rows, EMB_DIM)
            probs = generate_random_probabilities(n_rows, num_classes)
            dq.log_model_outputs(
                embs=embeddings,
                probs=probs,
                epoch=epoch_idx,
                ids=batch["id"],
            )
    print(separator, end="\n\n")

print("Done")

time_spent = time.time() - t_start
print(f"Logging output took {time_spent} seconds")

Logging input data
Logging 200 samples [########################################] 100.00% elapsed time  :     0.00s =  0.0m =  0.0h
Logging 200 samples [########################################] 100.00% elapsed time  :     0.00s =  0.0m =  0.0h
 Done
Input logging took 0.2979912757873535 seconds


Logging model outputs
Epoch 0
----------------------------------------------------------------------------------------------------
Training


  0%|          | 0/13 [00:00<?, ?it/s]

Test




  0%|          | 0/13 [00:00<?, ?it/s]

----------------------------------------------------------------------------------------------------

Done
Logging output took 0.2970759868621826 seconds


In [8]:
# Upload the data logged for this run to Galileo.
# NOTE(review): the saved output of this cell ends in a GalileoException
# (API status code 500) — re-run against a healthy console before sharing.
dq.finish()

☁️ Uploading Data


training:   0%|          | 0/1 [00:00<?, ?it/s]

Processing data for upload:   0%|          | 0/13 [00:00<?, ?it/s]

training (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

Uploading data to Galileo:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/49.6k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/286k [00:00<?, ?B/s]

test:   0%|          | 0/1 [00:00<?, ?it/s]

Processing data for upload:   0%|          | 0/13 [00:00<?, ?it/s]

test (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

Uploading data to Galileo:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/49.6k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/286k [00:00<?, ?B/s]

GalileoException: Something didn't go quite right. The api returned a non-ok status code 500 with output: {"detail":"RetryError[<Future at 0x160225040 state=finished raised HTTPException>]"}

## Multi-Label Text Classification (MLTC)

In [None]:
from typing import *
from random import choice
import numpy as np


# NOTE(review): "test-mltc-run" is passed in the position that the text
# classification cell uses for the project name — confirm this is intended.
dq.init("text_multi_label", "test-mltc-run")

toxicity_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# One two-way task per toxicity label: ["not <label>", "<label>"].
dq.set_labels_for_run([["not " + _label, _label] for _label in toxicity_labels])
dq.set_tasks_for_run(['task_0', 'task_1', 'task_2', 'task_3', 'task_4', 'task_5'])

n = 5000  # number of synthetic samples per split
mltc_batch_size = 32
mltc_emb_dim = 768
mltc_num_epochs = 5
mltc_splits = ["training", "test", "validation"]
num_tasks = len(toxicity_labels)

texts: List[str] = [f"text sample {i}" for i in range(n)]

# For every sample, pick one label at random from each task's label list.
task_label_options = dq.get_data_logger().logger_config.labels
labels: List[List[str]] = [
    [choice(options) for options in task_label_options]
    for _ in range(n)
]

ids = list(range(n))

# The same synthetic inputs are logged for every split.
for split in mltc_splits:
    dq.log_data_samples(texts=texts, task_labels=labels, ids=ids, split=split)

for split in mltc_splits:
    for epoch in range(mltc_num_epochs):
        emb = np.random.rand(n, mltc_emb_dim)
        # Independent logits for every (sample, task) pair, kept as a list of
        # per-task arrays. The previous `[[arr] * 6] * n` reused one array
        # object for every task and every sample.
        logits = [list(sample) for sample in np.random.rand(n, num_tasks, 2)]

        # Slice width must equal the loop step so that every id logged as an
        # input also gets model outputs (previously step 32 but width 5).
        for i in range(0, n, mltc_batch_size):
            batch = slice(i, i + mltc_batch_size)
            dq.log_model_outputs(
                embs=emb[batch],
                logits=logits[batch],
                ids=ids[batch],
                split=split,
                epoch=epoch,
            )

dq.finish()
# NOTE(review): see_results is not defined or imported anywhere in the visible
# part of this notebook — confirm where it comes from, otherwise this line
# raises NameError on a fresh kernel.
df_train, df_test, df_val = see_results()


## NER

In [None]:
from dataquality.schemas.task_type import TaskType
from dataquality import config 
from uuid import uuid4
import numpy as np
from time import sleep
from tqdm.notebook import tqdm


# NOTE(review): "test-ner-run" is passed in the position that the text
# classification cell uses for the project name — confirm this is intended.
dq.init("text_ner", "test-ner-run")


def log_inputs():
    """Register the NER labels and tagging schema, then log five samples per split.

    The same texts, token character spans, gold spans and ids are logged for
    the "training", "validation" and "test" splits, in that order.
    """
    ner_labels = ['[PAD]', '[CLS]', '[SEP]', 'O', 'B-ACTOR', 'I-ACTOR', 'B-YEAR', 'B-TITLE', 'B-GENRE', 'I-GENRE', 'B-DIRECTOR', 'I-DIRECTOR', 'B-SONG', 'I-SONG', 'B-PLOT', 'I-PLOT', 'B-REVIEW', 'B-CHARACTER', 'I-CHARACTER', 'B-RATING', 'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-TITLE', 'I-RATING', 'B-TRAILER', 'I-TRAILER', 'I-REVIEW', 'I-YEAR']
    dq.set_labels_for_run(ner_labels)
    dq.set_tagging_schema("BIO")

    sentences = [
        'what movies star bruce willis',
        'show me films with drew barrymore from the 1980s',
        'what movies starred both al pacino and robert deniro',
        'find me all of the movies that starred harold ramis and bill murray',
        'find me a movie with a quote about baseball in it',
    ]
    # (start, end) character offsets, one list per sentence; some offsets repeat.
    token_offsets = [
        [(0, 4), (5, 11), (12, 16), (17, 22), (17, 22), (23, 29), (23, 29)],
        [(0, 4), (5, 7), (8, 13), (14, 18), (19, 23), (24, 33), (24, 33), (24, 33), (34, 38), (39, 42), (43, 48)],
        [(0, 4), (5, 11), (12, 19), (20, 24), (25, 27), (28, 34), (28, 34), (28, 34), (35, 38), (39, 45), (39, 45), (46, 52), (46, 52)],
        [(0, 4), (5, 7), (8, 11), (12, 14), (15, 18), (19, 25), (26, 30), (31, 38), (39, 45), (39, 45), (39, 45), (46, 51), (46, 51), (52, 55), (56, 60), (61, 67), (61, 67), (61, 67)],
        [(0, 4), (5, 7), (8, 9), (10, 15), (16, 20), (21, 22), (23, 28), (29, 34), (35, 43), (44, 46), (47, 49)],
    ]
    # Gold entity spans per sentence; the last sentence has none.
    entity_spans = [
        [{'start': 17, 'end': 29, 'label': 'ACTOR'}],
        [{'start': 19, 'end': 33, 'label': 'ACTOR'}, {'start': 43, 'end': 48, 'label': 'YEAR'}],
        [{'start': 25, 'end': 34, 'label': 'ACTOR'}, {'start': 39, 'end': 52, 'label': 'ACTOR'}],
        [{'start': 39, 'end': 51, 'label': 'ACTOR'}, {'start': 56, 'end': 67, 'label': 'ACTOR'}],
        [],
    ]
    sample_ids = [0, 1, 2, 3, 4]

    for split_name in ("training", "validation", "test"):
        dq.log_data_samples(
            texts=sentences,
            text_token_indices=token_offsets,
            ids=sample_ids,
            gold_spans=entity_spans,
            split=split_name,
        )

def log_outputs():
    """Log random embeddings and logits for five samples, for 6 epochs and 3 splits.

    The same randomly generated arrays are logged for every epoch and for the
    "training", "test" and "validation" splits, in that order.
    """
    num_samples = 5
    # NOTE(review): 119 rows per sample is larger than any token list logged
    # by log_inputs; presumably a padded sequence length — confirm.
    num_tokens = 119
    emb_dim = 768
    num_classes = 28  # equals the number of labels registered in log_inputs

    embs = [np.random.rand(num_tokens, emb_dim) for _ in range(num_samples)]
    logits = [np.random.rand(num_tokens, num_classes) for _ in range(num_samples)]
    ids = list(range(num_samples))

    for epoch in tqdm(range(6)):
        for split in ["training", "test", "validation"]:
            dq.log_model_outputs(
                embs=embs, logits=logits, ids=ids, split=split, epoch=epoch
            )
    
def finish():
    """Call ``dq.finish()`` for the current run."""
    dq.finish()
    
    
def runit():
    """Run the NER demo steps in order: log_inputs, log_outputs, finish."""
    for step in (log_inputs, log_outputs, finish):
        step()
    
# Run the NER demo end to end.
runit()
# NOTE(review): see_results is not defined or imported anywhere in the visible
# part of this notebook — confirm where it comes from, otherwise this line
# raises NameError on a fresh kernel.
df_train, df_test, df_val = see_results()