# Use cases

See [this issue](https://gitlab.com/recognai-team/biome/biome.explore/-/issues/1) for more info

In [1]:
# First, we will load a fake dataset used for our use cases
# We'll use the biome.text.Dataset for dataset handling

from biome.text import Pipeline, Dataset

# Load the CSV file into a biome.text Dataset; the preview below shows `label` and `text` columns
fake_ds = Dataset.from_csv("datasets/business.cat.valid.csv")
# Display the first rows as the cell output
fake_ds.head()

  from collections import defaultdict, deque, Mapping, Sequence
  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Counter, Iterable
Using custom data configuration default
Reusing dataset csv (/Users/frascuchon/.cache/huggingface/datasets/csv/default-1617f6c5fc944723/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4)


Unnamed: 0,label,text
0,Physiotherapie,Bilitza Physiotherapie Benecke
1,Unternehmensberatungen,Kempke Unternehmensberatung
2,Tiefbau,Jürgen Kremer Sietower Bauunternehmen Gmbh
3,Vereine,Kolping Bildungswerk In Der Diözese Augsburg E...
4,Vereine,Mittendrin Lübeck E.v.
5,Unternehmensberatungen,Future Consulting Gmbh
6,Unternehmensberatungen,Fisseler Consulting
7,Maler,Gerhard Kube Lagoni Malereibetrieb Gmbh
8,Restaurants,"Gaststätten, Restaurants - Restaurant Hermes"
9,Hotels,Hotels - Hotel Sonnenbichl Fam. Fügenschuh


In [8]:
# Then I'll configure a base client for API communication

from rubrix.sdk import Client, AuthenticatedClient
from rubrix.sdk.api.text_classification import bulk_records, search_records
from rubrix.sdk.models import *

import os

# api_url = "https://observe-dev.biome.recogn.ai"
# Connection settings can be overridden through environment variables so that
# real endpoints/credentials never have to be written into the notebook.
# When the variables are unset, the local-development values are used.
api_url = os.environ.get("RUBRIX_API_URL", "http://127.0.0.1:8000")
api_key = os.environ.get("RUBRIX_API_KEY", "ab")

In [9]:
# Authenticated client used for the API calls in this notebook.
# It is built from the `api_url` / `api_key` settings and uses a timeout of 10.
client = AuthenticatedClient(
    base_url=api_url,
    token=api_key,
    timeout=10,
)

In [10]:
from typing import Optional


# Stand-in for any prediction pipeline: it receives a text input and generates
# parallel arrays of classes and probabilities
def predict(text: str):
    """Return a fixed fake prediction; ``text`` is accepted but not used.

    Returns:
        A ``(classes, probabilities)`` tuple of two parallel lists,
        always ``["A", "B"]`` and ``[0.9, 0.1]``.
    """
    classes = ["A", "B"]
    probabilities = [0.9, 0.1]
    return classes, probabilities


def record_from_data(idx: int, data:dict, prediction:Optional[tuple]=None, annotate:bool=True) -> TextClassificationRecord:
    """Build a ``TextClassificationRecord`` from one dataset row.

    Args:
        idx: Value stored as the record ``id``.
        data: Row mapping; its ``"text"`` and ``"label"`` keys are read
            (``"label"`` is always read because it is copied into the metadata).
        prediction: Optional pair of parallel iterables ``(classes, confidences)``;
            when given, one predicted label is emitted per zipped pair.
        annotate: When true, ``data["label"]`` is also attached as the annotation.

    Returns:
        The record produced by ``TextClassificationRecord.from_dict``.
    """
    payload = {
        "id": idx,
        "inputs": {"text": data["text"]},
        "metadata": {"gold": data["label"], "input": data["text"]},
    }

    if annotate:
        # The dataset label is used as the single annotated class
        payload["annotation"] = {
            "agent": "test_ds",
            "labels": [{"class": data["label"]}],
        }

    if prediction is not None:
        # zip(*prediction) pairs each class with its confidence
        payload["prediction"] = {
            "agent": "predict(text:str)",
            "labels": [
                {"class": label, "confidence": score}
                for label, score in zip(*prediction)
            ],
        }

    return TextClassificationRecord.from_dict(payload)

## Model debugging and development

### *I have a trained model and I want to explore its predictions with a test dataset.*

In [11]:
# Upload every row of the test dataset together with its prediction, without annotations
bulk_records.sync_detailed(
    client=client,
    json_body=TextClassificationRecordsBulk(
        name="explore-predictions-test-ds",
        records=[
            record_from_data(row_id, row, predict(row["text"]), annotate=False)
            for row_id, row in enumerate(fake_ds)
        ],
    ),
)

Response(status_code=401, content=b'{"detail":"Could not validate credentials"}', headers=Headers({'date': 'Wed, 03 Mar 2021 13:23:19 GMT', 'server': 'uvicorn', 'www-authenticate': 'Bearer', 'content-length': '43', 'content-type': 'application/json'}), parsed=None)

In [12]:
# search_records.sync(client=client, dataset_id="explore-predictions-test-ds", json_body=TextClassificationQuery())

## Model monitoring and observability

### *I am serving a model and I want to log my prediction into a central place.*

In [13]:
# Log every row with its prediction; `annotate` keeps its default, so the dataset label is attached as annotation too
bulk_records.sync_detailed(
    client=client,
    json_body=TextClassificationRecordsBulk(
        name="serving-fake-model-predictions-logs",
        records=[
            record_from_data(row_id, row, predict(row["text"]))
            for row_id, row in enumerate(fake_ds)
        ],
    ),
)

Response(status_code=200, content=b'{"dataset":"serving-fake-model-predictions-logs","processed":2000,"failed":0}', headers=Headers({'date': 'Tue, 02 Mar 2021 21:09:13 GMT', 'server': 'uvicorn', 'content-length': '77', 'content-type': 'application/json'}), parsed=BulkResponse(dataset='serving-fake-model-predictions-logs', processed=2000, failed=0, additional_properties={}))

In [14]:
# Search the logged predictions for records predicted as "A", sorted by "annotated_as" ascending.
# NOTE(review): the recorded output of this cell is a NameError for the request-body
# class below - confirm that the installed rubrix.sdk.models still exports it.
body = BodySearchRecordsClassificationDatasets_DatasetId__SearchPost.from_dict(
    {
        "query": {"predicted_as": ["A"]},
        "sort": [{"by": "annotated_as", "order": "asc"}],
    }
)
search = search_records.sync(
    client=client,
    dataset_id="serving-fake-model-predictions-logs",
    json_body=body,
)

NameError: name 'BodySearchRecordsClassificationDatasets_DatasetId__SearchPost' is not defined

In [None]:
# Preview at most the first ten records of the search response
search.records[:10]

In [15]:
# Display the whole search response object
search

NameError: name 'search' is not defined

### *I want to manually provide annotations over these predictions to extract metrics (e.g., production accuracy).*

In [16]:
# Fetch the first record from the search results so that an annotation can be added to it

record = search.records[0]
record

NameError: name 'search' is not defined

In [None]:
# Re-sending a record with the same id is meant to update the existing record
# rather than insert a new one

bulk_records.sync(
    client=client,
    json_body=TextClassificationRecordsBulk(
        name="serving-fake-model-predictions-logs",
        records=[
            record_from_data(
                idx=record.id,
                data={**record.inputs.to_dict(), "label": "Health"},
            )
        ],
    ),
)

In [None]:
# Confirm the new annotation by searching for records annotated as "Health"
body = BodySearchRecordsClassificationDatasets_DatasetId__SearchPost(
    query=TextClassificationQuery(annotated_as=["Health"]),
)
search_records.sync(
    client=client,
    dataset_id="serving-fake-model-predictions-logs",
    json_body=body,
)

## Annotation

### *I am starting a model from scratch for a new project and I want to manually label training examples.*

In [14]:
# Upload the raw rows only: no prediction is passed and annotations are disabled
bulk_records.sync_detailed(
    client=client,
    json_body=TextClassificationRecordsBulk(
        name="fake-annotation-session",
        records=[
            record_from_data(row_id, row, annotate=False)
            for row_id, row in enumerate(fake_ds)
        ],
    ),
)

Response(status_code=200, content=b'{"dataset":"fake-annotation-session","processed":2000,"failed":0}', headers=Headers({'date': 'Tue, 02 Mar 2021 21:09:30 GMT', 'server': 'uvicorn', 'content-length': '65', 'content-type': 'application/json'}), parsed=BulkResponse(dataset='fake-annotation-session', processed=2000, failed=0, additional_properties={}))

In [None]:
# Query the annotation dataset with a default-constructed query
# (presumably matches every record - confirm against the API).
search = search_records.sync(
    client=client,
    dataset_id="fake-annotation-session",
    json_body=TextClassificationQuery(),
)

In [None]:
# Show the aggregations attached to the search response, converted with to_dict()
search.aggregations.to_dict()

# Token classification

In [15]:
from biome.text import Dataset

# Load the JSON file into a Dataset; the preview below shows `text`, `labels` and `intent` columns
ner_ds = Dataset.from_json("datasets/token_classifier.valid.json")
# Display the first rows as the cell output
ner_ds.head()

Using custom data configuration default
Reusing dataset json (/Users/frascuchon/.cache/huggingface/datasets/json/default-4bed7bf9261a40a7/0.0.0/fb88b12bd94767cb0cc7eedcd82ea1f402d2162addc03a37e81d4f8dc7313ad9)


Unnamed: 0,text,labels,intent
0,"[play, Fereydoun, Farrokhzad, best, track]","[O, B-artist, I-artist, B-sort, B-music_item]",PlayMusic
1,"[Find, what, movies, are, showing, at, the, ne...","[O, O, B-movie_type, O, O, O, O, B-spatial_rel...",SearchScreeningEvent
2,"[Will, it, be, chillier, in, La, Mesa, ?]","[O, O, O, B-condition_temperature, O, B-city, ...",GetWeather
3,"[add, sam, sparro, to, my, playlist, called, B...","[O, B-artist, I-artist, O, B-playlist_owner, O...",AddToPlaylist
4,"[Can, I, hear, a, Da, Brat, ep, ?]","[O, O, O, O, B-artist, I-artist, B-music_item, O]",PlayMusic
5,"[Rate, Equal, Affections, one, points]","[O, B-object_name, I-object_name, B-rating_val...",RateBook
6,"[What, is, the, Wanda, Group, movie, schedules]","[O, O, O, B-location_name, I-location_name, B-...",SearchScreeningEvent
7,"[Play, some, theme, songs, from, the, fourties]","[O, O, B-music_item, O, O, O, B-year]",PlayMusic
8,"[Include, Sean, Yseult, in, kaitlin's, metal, ...","[O, B-artist, I-artist, O, B-playlist_owner, B...",AddToPlaylist
9,"[Can, you, add, danny, carey, to, my, masters,...","[O, O, O, B-artist, I-artist, O, B-playlist_ow...",AddToPlaylist


In [16]:
# Collect the distinct entity types: every tag other than "O" contributes
# its text after the first two characters (the "B-" / "I-" style prefix).
labels = {
    iob_tag[2:]
    for sentence_tags in ner_ds["labels"]
    for iob_tag in sentence_tags
    if iob_tag != "O"
}
print("number of labels:", len(labels))
labels

number of labels: 39


{'album',
 'artist',
 'best_rating',
 'city',
 'condition_description',
 'condition_temperature',
 'country',
 'cuisine',
 'current_location',
 'entity_name',
 'facility',
 'genre',
 'geographic_poi',
 'location_name',
 'movie_name',
 'movie_type',
 'music_item',
 'object_location_type',
 'object_name',
 'object_part_of_series_type',
 'object_select',
 'object_type',
 'party_size_description',
 'party_size_number',
 'playlist',
 'playlist_owner',
 'poi',
 'rating_unit',
 'rating_value',
 'restaurant_name',
 'restaurant_type',
 'served_dish',
 'service',
 'sort',
 'spatial_relation',
 'state',
 'timeRange',
 'track',
 'year'}

In [12]:
from rubrix.sdk.api.token_classification import bulk_records, search_records
from rubrix.sdk import Client, AuthenticatedClient
from rubrix.sdk.models import *

from spacy.gold import offsets_from_biluo_tags, iob_to_biluo

import spacy

# Load the spaCy pipeline used below to build Doc objects for the annotations.
# NOTE(review): "en" is a shortcut name rather than a full model package name;
# presumably it resolves to an installed English model - confirm for the spaCy version in use.
nlp = spacy.load("en")

In [13]:


# api_url = "https://observe-dev.biome.recogn.ai"
# api_key = "<redacted>"  # a real bearer token was committed here; never store tokens in the notebook - load them from the environment instead

# `api_url` and `api_key` are not assigned in this cell (the lines above are
# commented out); the values already present in the kernel are used.
local_client = Client(base_url="http://localhost:8000")
client = AuthenticatedClient(
    base_url=api_url,
    token=api_key,
    timeout=10,
)
# NOTE(review): the displayed repr includes the token (see the recorded output),
# so clear this output before sharing the notebook when a real key is in use.
client

AuthenticatedClient(base_url='http://127.0.0.1:8000', cookies={}, headers={}, timeout=10, token='ab')

In [14]:
def ner_record_from_data(idx: int, data:dict, annotate:bool=True) -> TokenClassificationRecord:
    """Build a ``TokenClassificationRecord`` from one dataset row.

    Args:
        idx: Value stored as the record ``id``.
        data: Row mapping; its ``"text"`` (token list), ``"labels"`` (one tag
            per token) and ``"intent"`` keys are read.
        annotate: When true, the tags are converted into entity spans and
            attached as the annotation.

    Returns:
        The record produced by ``TokenClassificationRecord.from_dict``.
    """
    payload = {
        "id": idx,
        "tokens": data["text"],
        "metadata": {"intent": data["intent"], "tags": data["labels"]},
    }

    if annotate:
        # NOTE(review): the tokens are joined with spaces and re-tokenized by `nlp`;
        # if that tokenization differs from data["text"], the tags may not line up
        # with the Doc - confirm.
        doc = nlp(" ".join(data["text"]))
        biluo_tags = iob_to_biluo(data["labels"])
        # Each span is unpacked as (start, end, label); presumably character
        # offsets into the joined text - confirm against the spaCy docs.
        spans = offsets_from_biluo_tags(doc, biluo_tags)
        payload["annotation"] = {
            "agent": "test_ds",
            "entities": [
                {"start": span_start, "end": span_end, "label": span_label}
                for span_start, span_end, span_label in spans
            ],
        }

    return TokenClassificationRecord.from_dict(payload)

In [15]:
# Upload every row of the NER dataset as an annotated token-classification record.
# NOTE(review): the recorded output is a NameError for `ner_ds`, so the cell that
# loads `ner_ds` must be run before this one.
bulk_records.sync_detailed(
    client=client,
    json_body=TokenClassificationRecordsBulk(
        name="explore-predictions-ner-ds",
        records=[
            ner_record_from_data(row_id, row)
            for row_id, row in enumerate(ner_ds)
        ],
    ),
)

NameError: name 'ner_ds' is not defined