## End-to-end examples of logging data to Galileo for Text Classification, MLTC, and NER

### To understand the client and learn how to get started, see the [Dataquality Demo](./Dataquality-Client-Demo.ipynb)
### Check out the full documentation [here](https://rungalileo.gitbook.io/galileo/getting-started)
### To see real end-to-end notebooks training real ML models, see [here](https://drive.google.com/drive/folders/17-cHuRzXIpWaD8rYwy69RMQr__HiAiDk?usp=sharing)

In [1]:
## Local

import os

# Point the client at the local console and provide the local login through
# environment variables (the recorded `dq.configure()` output reports that it
# found the auth method "set via env").
os.environ.update(
    {
        "GALILEO_CONSOLE_URL": "http://localhost:8088",
        "GALILEO_USERNAME": "user@example.com",
        "GALILEO_PASSWORD": "Th3secret_",
    }
)

In [2]:
import dataquality as dq
# Log in to the console; with the GALILEO_* environment variables set, the
# recorded output shows the login prompt being skipped.
dq.configure()

📡 http://localhost:8088
🔭 Logging you into Galileo

👀 Found auth method email set via env, skipping prompt.
🚀 You're logged in to Galileo as user@example.com!


***Helper function***

In [3]:
from dataquality import config
import pandas as pd
from dataquality.clients.api import ApiClient
from time import sleep


# Shared API client; `see_results` uses it to wait for the run and export splits.
api_client = ApiClient()


def see_results(wait=True, body=None):
    """Export the current run's splits to CSV, display them, and return them.

    Args:
        wait: When true, call `api_client.wait_for_run()` before exporting.
        body: Unused. Kept so existing calls that pass it keep working; the
            default is `None` rather than a shared mutable `{}`.

    Returns:
        A `(df_train, df_test, df_val)` tuple of DataFrames read back from
        the exported CSV files.
    """
    if wait:
        print("Waiting for data to be processed")
        api_client.wait_for_run()

    task_type = dq.config.task_type
    project_name = api_client.get_project(config.current_project_id)["name"]
    run_name = api_client.get_project_run(
        config.current_project_id, config.current_run_id
    )["name"]

    # One CSV per split, written to the working directory as "<task_type>_<split>.csv".
    csv_paths = {
        split: f"{task_type}_{split}.csv"
        for split in ("training", "test", "validation")
    }
    for split, path in csv_paths.items():
        api_client.export_run(project_name, run_name, split, path)
    print(f"Exported to {task_type}_training.csv, {task_type}_test.csv, and {task_type}_validation.csv")

    frames = {split: pd.read_csv(path) for split, path in csv_paths.items()}
    print("Training")
    display(frames["training"])
    print("\nTest")
    display(frames["test"])
    print("\nValidation")
    display(frames["validation"])
    return frames["training"], frames["test"], frames["validation"]

## NER

In [4]:
from dataquality.schemas.task_type import TaskType
from dataquality import config 
from uuid import uuid4
import numpy as np
from time import sleep
from tqdm.notebook import tqdm


# Connect to project "test-ner-proj" / run "test-ner-run" for the "text_ner" task;
# the recorded output shows an existing project being reused.
dq.init("text_ner", "test-ner-proj", "test-ner-run")


def log_inputs():
    """Register the label set and tagging schema, then log five inference samples.

    The samples are logged to the "inference" split under inference name "hi".
    `gold_spans` is defined but not sent, because only the inference call is
    active; see the note before the final call.
    """
    text_inputs = [
        'what movies star bruce willis',
        'show me films with drew barrymore from the 1980s',
        'what movies starred both al pacino and robert deniro',
        'find me all of the movies that starred harold ramis and bill murray',
        'find me a movie with a quote about baseball in it',
    ]
    # One list of (start, end) pairs per sample, passed as `text_token_indices`.
    tokens = [
        [(0, 4), (5, 11), (12, 16), (17, 22), (17, 22), (23, 29), (23, 29)],
        [(0, 4), (5, 7), (8, 13), (14, 18), (19, 23), (24, 33), (24, 33), (24, 33), (34, 38), (39, 42), (43, 48)],
        [(0, 4), (5, 11), (12, 19), (20, 24), (25, 27), (28, 34), (28, 34), (28, 34), (35, 38), (39, 45), (39, 45), (46, 52), (46, 52)],
        [(0, 4), (5, 7), (8, 11), (12, 14), (15, 18), (19, 25), (26, 30), (31, 38), (39, 45), (39, 45), (39, 45), (46, 51), (46, 51), (52, 55), (56, 60), (61, 67), (61, 67), (61, 67)],
        [(0, 4), (5, 7), (8, 9), (10, 15), (16, 20), (21, 22), (23, 28), (29, 34), (35, 43), (44, 46), (47, 49)],
    ]
    gold_spans = [
        [{'start': 17, 'end': 29, 'label': 'ACTOR'}],
        [{'start': 19, 'end': 33, 'label': 'ACTOR'}, {'start': 43, 'end': 48, 'label': 'YEAR'}],
        [{'start': 25, 'end': 34, 'label': 'ACTOR'}, {'start': 39, 'end': 52, 'label': 'ACTOR'}],
        [{'start': 39, 'end': 51, 'label': 'ACTOR'}, {'start': 56, 'end': 67, 'label': 'ACTOR'}],
        [],
    ]
    ids = list(range(len(text_inputs)))

    label_vocab = [
        '[PAD]', '[CLS]', '[SEP]', 'O',
        'B-ACTOR', 'I-ACTOR', 'B-YEAR', 'B-TITLE',
        'B-GENRE', 'I-GENRE', 'B-DIRECTOR', 'I-DIRECTOR',
        'B-SONG', 'I-SONG', 'B-PLOT', 'I-PLOT',
        'B-REVIEW', 'B-CHARACTER', 'I-CHARACTER', 'B-RATING',
        'B-RATINGS_AVERAGE', 'I-RATINGS_AVERAGE', 'I-TITLE', 'I-RATING',
        'B-TRAILER', 'I-TRAILER', 'I-REVIEW', 'I-YEAR',
    ]
    dq.set_labels_for_run(label_vocab)
    dq.set_tagging_schema("BIO")

    # Labelled splits are currently disabled; to log them, call e.g.
    # dq.log_data_samples(texts=text_inputs, text_token_indices=tokens, ids=ids, gold_spans=gold_spans, split="training")
    # dq.log_data_samples(texts=text_inputs, text_token_indices=tokens, ids=ids, gold_spans=gold_spans, split="validation")
    dq.log_data_samples(
        texts=text_inputs,
        text_token_indices=tokens,
        ids=ids,
        split="inference",
        inference_name="hi",
    )

def log_outputs():
    """Log randomly generated model outputs for the "hi" inference run.

    Builds one embedding matrix and one logits matrix per sample with
    `np.random.rand` (placeholder values, not real model output) and passes
    them to `dq.log_model_outputs` for ids 0-4 on the "inference" split.
    """
    num_samples = 5
    rows_per_sample = 119  # first dimension of every matrix; presumably a padded token count - confirm
    emb_dim = 768
    num_classes = 28  # same length as the label list given to dq.set_labels_for_run in log_inputs

    # Embeddings are drawn first, then logits.
    embs = [np.random.rand(rows_per_sample, emb_dim) for _ in range(num_samples)]
    logits = [np.random.rand(rows_per_sample, num_classes) for _ in range(num_samples)]
    ids = list(range(num_samples))
    dq.log_model_outputs(
        embs=embs, logits=logits, ids=ids, split="inference", inference_name="hi"
    )
    
def finish():
    """Call `dq.finish()`; its return value is discarded.

    In the recorded run this uploads the logged data and submits a processing job.
    """
    dq.finish()
    
    
def runit():
    """Run the full demo in order: log inputs, log outputs, then finish."""
    for step in (log_inputs, log_outputs, finish):
        step()
    
# runit()
# df_train, df_test, df_val = see_results()

📡 Retrieving run from existing project, test-ner-proj
🛰 Connected to project, test-ner-proj, and run, test-ner-run.




In [5]:
# Register labels and tagging schema, then log the five inference samples.
log_inputs()

Logging 5 samples [########################################] 100.00% elapsed time  :     0.00s =  0.0m =  0.0h
 

In [6]:
# Log the placeholder (random) embeddings and logits for those samples.
log_outputs()

In [7]:
# Upload and submit the job. NOTE(review): the recorded output below stops at
# `pdb.set_trace()` breakpoints inside the installed dataquality package and
# ends in a GalileoException because the backend raises NotImplementedError
# for NER inference.
finish()

☁️ Uploading Data


inference:   0%|          | 0/1 [00:00<?, ?it/s]

Processing data for upload:   0%|          | 0/1 [00:00<?, ?it/s]

> [0;32m/Users/elliottchartock/Code/dataquality/.venv/lib/python3.9/site-packages/dataquality/loggers/data_logger/text_ner.py[0m(676)[0;36msplit_dataframe[0;34m()[0m
[0;32m    674 [0;31m[0;34m[0m[0m
[0m[0;32m    675 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 676 [0;31m        [0mprob[0m [0;34m=[0m [0mdf_copy[0m[0;34m[[0m[0mprob_cols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    677 [0;31m        emb_cols = (
[0m[0;32m    678 [0;31m            [0;34m[[0m[0mNERCols[0m[0;34m.[0m[0mid[0m[0;34m.[0m[0mvalue[0m[0;34m][0m [0;32mif[0m [0mprob_only[0m [0;32melse[0m [0;34m[[0m[0mNERCols[0m[0;34m.[0m[0mid[0m[0;34m.[0m[0mvalue[0m[0;34m,[0m [0mNERCols[0m[0;34m.[0m[0memb[0m[0;34m.[0m[0mvalue[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/elliottchartock/Code/dataquality/.venv/lib/python3.9/site-packages/dataquality/loggers/data_logger/text_ner.py[0m(678)[0;36msplit_dataframe[0;34m()[0m
[0;32m    676 [0;31m        [0mprob[0m [0;34m=[0m [0mdf_copy[0m[0;34m[[0m[0mprob_cols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    677 [0;31m        emb_cols = (
[0m[0;32m--> 678 [0;31m            [0;34m[[0m[0mNERCols[0m[0;34m.[0m[0mid[0m[0;34m.[0m[0mvalue[0m[0;34m][0m [0;32mif[0m [0mprob_only[0m [0;32melse[0m [0;34m[[0m[0mNERCols[0m[0;34m.[0m[0mid[0m[0;34m.[0m[0mvalue[0m[0;34m,[0m [0mNERCols[0m[0;34m.[0m[0memb[0m[0;34m.[0m[0mvalue[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    679 [0;31m        )
[0m[0;32m    680 [0;31m        [0memb[0m [0;34m=[0m [0mdf_copy[0m[0;34m[[0m[0memb_cols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/elliottchartock/Code/dataquality/.venv/lib/python3.9/site-packages/dataquality/loggers/data_logger/text_ner.py[0m(677)[0;36msplit_dataframe[0;34m()[0m
[0;32m    675 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    676 [0;31m        [0mprob[0m [0;34m=[0m [0mdf_copy[0m[0;34m[[0m[0mprob_cols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 677 [0;31m        emb_cols = (
[0m[0;32m    678 [0;31m            [0;34m[[0m[0mNERCols[0m[0;34m.[0m[0mid[0m[0;34m.[0m[0mvalue[0m[0;34m][0m [0;32mif[0m [0mprob_only[0m [0;32melse[0m [0;34m[[0m[0mNERCols[0m[0;34m.[0m[0mid[0m[0;34m.[0m[0mvalue[0m[0;34m,[0m [0mNERCols[0m[0;34m.[0m[0memb[0m[0;34m.[0m[0mvalue[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    679 [0;31m        )
[0m


ipdb>  n


> [0;32m/Users/elliottchartock/Code/dataquality/.venv/lib/python3.9/site-packages/dataquality/loggers/data_logger/text_ner.py[0m(680)[0;36msplit_dataframe[0;34m()[0m
[0;32m    678 [0;31m            [0;34m[[0m[0mNERCols[0m[0;34m.[0m[0mid[0m[0;34m.[0m[0mvalue[0m[0;34m][0m [0;32mif[0m [0mprob_only[0m [0;32melse[0m [0;34m[[0m[0mNERCols[0m[0;34m.[0m[0mid[0m[0;34m.[0m[0mvalue[0m[0;34m,[0m [0mNERCols[0m[0;34m.[0m[0memb[0m[0;34m.[0m[0mvalue[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    679 [0;31m        )
[0m[0;32m--> 680 [0;31m        [0memb[0m [0;34m=[0m [0mdf_copy[0m[0;34m[[0m[0memb_cols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    681 [0;31m        [0;32mreturn[0m [0mprob[0m[0;34m,[0m [0memb[0m[0;34m,[0m [0mdf_copy[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    682 [0;31m[0;34m[0m[0m
[0m


ipdb>  n


> [0;32m/Users/elliottchartock/Code/dataquality/.venv/lib/python3.9/site-packages/dataquality/loggers/data_logger/text_ner.py[0m(681)[0;36msplit_dataframe[0;34m()[0m
[0;32m    679 [0;31m        )
[0m[0;32m    680 [0;31m        [0memb[0m [0;34m=[0m [0mdf_copy[0m[0;34m[[0m[0memb_cols[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 681 [0;31m        [0;32mreturn[0m [0mprob[0m[0;34m,[0m [0memb[0m[0;34m,[0m [0mdf_copy[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    682 [0;31m[0;34m[0m[0m
[0m[0;32m    683 [0;31m    [0;34m@[0m[0mclassmethod[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  emb


#    id    emb
0    0     'array([4.12751441e-02, 2.26012504e-01, 4.893858...
1    1     'array([0.28743579, 0.08962294, 0.61891793, 0.06...
2    2     'array([7.85238202e-01, 5.57790582e-01, 3.697707...
3    3     'array([1.27612893e-01, 6.59336072e-01, 1.998711...
4    4     'array([4.95929053e-01, 3.90360786e-01, 6.374875...
...  ...   ...
15   15    'array([3.83869839e-01, 9.37352625e-01, 4.037045...
16   16    'array([0.46408676, 0.85385739, 0.47763837, 0.01...
17   17    'array([5.63899767e-01, 8.14417589e-01, 9.385170...
18   18    'array([0.71919333, 0.46569847, 0.24211421, 0.29...
19   19    'array([0.100714  , 0.52286334, 0.59620149, 0.70...


ipdb>  prob


#    id    sample_id    split      is_pred    span_start    span_end    pred             inference_name
0    0     0            inference  True       4             5           YEAR             hi
1    1     1            inference  True       1             2           YEAR             hi
2    2     1            inference  True       5             6           REVIEW           hi
3    3     2            inference  True       1             2           RATINGS_AVERAGE  hi
4    4     2            inference  True       2             3           REVIEW           hi
...  ...   ...          ...        ...        ...           ...         ...              ...
15   15    3            inference  True       14            15          REVIEW           hi
16   16    3            inference  True       16            17          GENRE            hi
17   17    4            inference  True       4             5           DIRECTOR         hi
18   18    4            inference  True       5             6      

ipdb>  df_copy


#    data_schema_version    emb                                                  epoch    inference_name    is_pred    pred             sample_id    span_end    span_start    split      id
0    1                      'array([4.12751441e-02, 2.26012504e-01, 4.893858...  nan      hi                True       YEAR             0            5           4             inference  0
1    1                      'array([0.28743579, 0.08962294, 0.61891793, 0.06...  nan      hi                True       YEAR             1            2           1             inference  1
2    1                      'array([7.85238202e-01, 5.57790582e-01, 3.697707...  nan      hi                True       REVIEW           1            6           5             inference  2
3    1                      'array([1.27612893e-01, 6.59336072e-01, 1.998711...  nan      hi                True       RATINGS_AVERAGE  2            2           1             inference  3
4    1                      'array([4.95929053e-01, 3.90360

ipdb>  c


> [0;32m/Users/elliottchartock/Code/dataquality/.venv/lib/python3.9/site-packages/dataquality/loggers/data_logger/base_data_logger.py[0m(221)[0;36mupload_split[0;34m()[0m
[0;32m    219 [0;31m            )
[0m[0;32m    220 [0;31m            [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 221 [0;31m            [0mcls[0m[0;34m.[0m[0mupload_in_out_frames[0m[0;34m([0m[0mobject_store[0m[0;34m,[0m [0min_out_frames[0m[0;34m,[0m [0msplit[0m[0;34m,[0m [0mepoch_or_inf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    222 [0;31m[0;34m[0m[0m
[0m[0;32m    223 [0;31m    [0;34m@[0m[0mclassmethod[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  c


inference (inf_name=hi):   0%|          | 0/3 [00:00<?, ?it/s]

Uploading data to Galileo:   0%|          | 0.00/133k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

> [0;32m/Users/elliottchartock/Code/dataquality/.venv/lib/python3.9/site-packages/dataquality/core/finish.py[0m(61)[0;36mfinish[0;34m()[0m
[0;32m     59 [0;31m        )
[0m[0;32m     60 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 61 [0;31m    res = api_client.make_request(
[0m[0;32m     62 [0;31m        [0mRequestType[0m[0;34m.[0m[0mPOST[0m[0;34m,[0m [0murl[0m[0;34m=[0m[0;34mf"{config.api_url}/{Route.jobs}"[0m[0;34m,[0m [0mbody[0m[0;34m=[0m[0mbody[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     63 [0;31m    )
[0m


ipdb>  c


Job inference successfully submitted. Results will be available soon at http://127.0.0.1:3000/insights?projectId=86426353-ba08-4b02-995a-7c62c77f1de9&runId=7d079a21-c1a9-45aa-b9c8-99ec384726a7&split=training&depHigh=1&depLow=0&taskType=2
Waiting for job...


GalileoException: It seems your run failed with status failed, error "Failed background job: Inference is not yet support for NER\n\nTraceback (most recent call last):\n  File \"/Users/elliottchartock/Code/runners/.venv/lib/python3.9/site-packages/runners/servicers/vaex_runner_servicer.py\", line 145, in _run\n    job_fn()\n  File \"/Users/elliottchartock/Code/runners/.venv/lib/python3.9/site-packages/ddtrace/tracer.py\", line 920, in func_wrapper\n    return f(*args, **kwargs)\n  File \"/Users/elliottchartock/Code/runners/.venv/lib/python3.9/site-packages/rungalileo/services/vaex/base_vaex.py\", line 1368, in run\n    self.inference(job_request=job_request, df_dir=tempdir)\n  File \"/Users/elliottchartock/Code/runners/.venv/lib/python3.9/site-packages/rungalileo/services/vaex/text_ner.py\", line 856, in inference\n    raise NotImplementedError(\"Inference is not yet support for NER\")\nNotImplementedError: Inference is not yet support for NER\n"