In [None]:
import dataquality as dq

# Point the dataquality client at the console host, then log in.
GALILEO_CONSOLE_URL = "console.erm.rungalileo.io"
dq.set_console_url(GALILEO_CONSOLE_URL)
dq.login()

## Create unlabeled dataframe

I'm using the `conv_intent` dataset because I don't have unlabeled data on hand, but you can use any DataFrame with a `text` column.

In [None]:
from datasets import load_dataset
from random import choices
import pandas as pd 


# Take the "text" field of the conv_intent training split as the raw samples.
conv_intent = load_dataset("rungalileo/conv_intent")
text_samples = conv_intent["train"]["text"]
n_samples = len(text_samples)

# Draw one placeholder metadata value (with replacement) for every sample.
some_metadata = list("abcdefg")
sampled_metadata = choices(some_metadata, k=n_samples)

# Columns, in order: text, metadata_1, id (id is the 0-based row position).
unlb_df = pd.DataFrame(
    {
        "text": text_samples,
        "metadata_1": sampled_metadata,
        "id": list(range(n_samples)),
    }
)
unlb_df.head(5)

## Create pretrained embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# Embed every row of unlb_df["text"] with a pretrained sentence encoder.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Only enable CUDA mixed precision when a GPU is actually available. Requesting
# CUDA autocast on a CPU-only machine makes torch emit a warning and disable it
# anyway, so gating it here keeps the cell quiet without changing GPU behaviour.
with torch.autocast("cuda", enabled=torch.cuda.is_available()):
    embs = encoder.encode(unlb_df["text"].tolist(), show_progress_bar=True)

# Normalise to float32 regardless of the precision the encoder produced.
embs = embs.astype(np.float32)

## Log data to Galileo

In [None]:
# Start a text-classification run. "unlabeled_data" and "conv_intent" are the
# same identifiers passed to dq.metrics.get_dataframe when fetching the results.
dq.init("text_classification", "unlabeled_data", "conv_intent")

# Log everything below under the "inference" split, with inference name "inf_1".
dq.set_split("inference", "inf_1")

# We need some set of labels, or we can make them up
labels = ["foo", "bar"]
dq.set_labels_for_run(labels)

dq.log_dataset(unlb_df, meta=["metadata_1"])

# The data is unlabeled, so the logits are random placeholders with one row per
# embedding and one column per label. A seeded generator keeps them identical
# across Restart & Run All instead of changing on every execution.
LOGITS_SEED = 0
placeholder_logits = np.random.default_rng(LOGITS_SEED).random((len(embs), len(labels)))

dq.log_model_outputs(
    embs=embs,
    ids=unlb_df["id"].tolist(),
    logits=placeholder_logits,
)

# NOTE(review): create_data_embs=False presumably skips server-side embedding
# generation because embeddings are supplied above — confirm against the dq docs.
dq.finish(create_data_embs=False)

In [None]:
# Optional: download the logged data as a dataframe; it will include the
# cluster_id assigned to each sample.
df = dq.metrics.get_dataframe(
    "unlabeled_data",
    "conv_intent",
    "inference",
    inference_name="inf_1",
)