# Logging an inference run on production data

In this notebook we learn how to log an inference run, demonstrating common flows and errors.
If you are new to the dataquality repo, check out the Dataquality-Client-Demo first!

## Setup
In this demo we use the same setup as the Dataquality-Client-Demo.

In [None]:
import os
# Point the client at a locally running Galileo console. This is set before
# `dataquality` is imported in the next cell (presumably the URL is read at
# import/configure time -- TODO confirm).
os.environ['GALILEO_CONSOLE_URL']="http://localhost:8088"

In [None]:
# If you have cloned the dataquality repo and are running this from the docs folder, you can run this
#!pip install -q ../../../../dataquality
import dataquality as dq

***Create an admin if one doesn't exist. Set the admin credentials as environment variables to log in automatically during `dataquality.init()` below.***

In [None]:
import os

import requests

# Demo-only credentials for a local deployment; do not reuse them anywhere real.
pwd = "MyPassword!123"

data={
  "email": "me@rungalileo.io",
  "first_name": "Me",
  "last_name": "Me",
  "username": "Galileo",
  "auth_method": "email",
  "password": pwd
}

# Creating the admin is best-effort: the request returns a 400 status code if the
# admin is already set, and that case must not stop the notebook.
r = requests.post(f'{dq.config.api_url}/users/admin', json=data)
if r.status_code == 400:
    print("Admin creation returned 400 (expected when an admin is already set); continuing.")
elif not r.ok:
    # Any other error status is unexpected. Report it instead of discarding the
    # response, so a later login failure is easier to diagnose.
    print(f"Unexpected response while creating admin: {r.status_code} {r.text}")

# Export the credentials, then configure the client.
os.environ["GALILEO_USERNAME"]=data["email"]
os.environ["GALILEO_PASSWORD"]=pwd
dq.configure()

We create a few helper functions for creating and logging fake data.

In [None]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd

def create_dataset():
    """Build the demo DataFrame from the 20 newsgroups training subset.

    The documents are fetched with headers, footers and quotes removed.

    Returns:
        A ``(dataset, label_ind)`` tuple. ``dataset`` is a DataFrame with a
        "text" column (the documents) and a "label" column (the target name of
        each document). ``label_ind`` is the ``target_names`` sequence from the
        fetch, indexed by the integer targets.
    """
    bunch = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
    label_ind = bunch.target_names
    dataset = pd.DataFrame(
        {
            "text": bunch.data,
            # Translate every integer target into its human-readable name.
            "label": [label_ind[target] for target in bunch.target],
        }
    )
    return dataset, label_ind

def fetch_dataset(dataset, split, inference_name = None):
    """Return the slice of ``dataset`` reserved for a split / inference run.

    Args:
        dataset: Any sliceable collection (the notebook passes a DataFrame).
        split: "training", "test" or "inference".
        inference_name: Only consulted when ``split`` is "inference"; one of
            "03-14-2022", "03-21-2022" or "all-customers".

    Returns:
        ``dataset`` sliced to the window assigned to the requested split:
        rows 0-99 for training, 100-199 for test, and 200-299 / 300-399 /
        400-499 for the three inference names respectively.

    Raises:
        ValueError: If the split, or the inference name for the inference
            split, is not one of the values above.
    """
    window = None
    if split == "training":
        window = slice(None, 100)
    elif split == "test":
        window = slice(100, 200)
    elif split == "inference":
        if inference_name == "03-14-2022":
            window = slice(200, 300)
        elif inference_name == "03-21-2022":
            window = slice(300, 400)
        elif inference_name == "all-customers":
            window = slice(400, 500)

    # No window was selected: unknown split or unknown inference name.
    if window is None:
        raise ValueError("Uh oh something happened")
    return dataset[window]

# Generate fake model outputs
def log_fake_data(dataset, split):
    """Log randomly generated model outputs for every row of ``dataset``.

    Args:
        dataset: DataFrame whose index values are used as the sample ids.
        split: Split name forwarded to ``dq.log_model_outputs``.

    Random embeddings (800 columns) and probabilities (20 columns) are drawn
    once and logged for a single epoch, epoch 0.
    """
    num_rows = len(dataset)

    # Embeddings are drawn before probabilities, as in the original draw order.
    fake_embs = np.random.rand(num_rows, 800)
    fake_probs = np.random.rand(num_rows, 20)
    row_ids = dataset.index.to_list()

    for epoch in (0,):
        dq.log_model_outputs(
            embs=fake_embs,
            probs=fake_probs,
            split=split,
            epoch=epoch,
            ids=row_ids,
        )

## Start with a train / test run

Inference data will usually be logged after training / test runs. We simulate this flow by populating Minio with training and test data.

In [None]:
# Initialise the project/run that this cell logs to.
dq.init(task_type="text_classification", project_name="gonzaga", run_name="duke")

base_dataset, labels = create_dataset()
train_dataset = fetch_dataset(base_dataset, "training")
test_dataset = fetch_dataset(base_dataset, "test")

# Log the input samples. The ids come from the DataFrame index, which is also what
# log_fake_data sends as ids with the model outputs.
dq.log_data_samples(texts=train_dataset['text'], labels=train_dataset['label'], split="training", ids=train_dataset.index.to_list())
dq.log_data_samples(texts=test_dataset['text'], labels=test_dataset['label'], split="test", ids=test_dataset.index.to_list())

# Log fake model outputs for both splits, then register the labels and finish the run.
log_fake_data(train_dataset, "training")
log_fake_data(test_dataset, "test")
dq.set_labels_for_run(labels)
dq.finish()

## Inference run

Now log an inference run. Notice that when we log inference data, it is appended to what is already in Minio, meaning that existing training / test data is not deleted.

We can log multiple inference runs with different inference names. 

In [None]:
# dq.init is left commented out here: the cells below keep logging without
# re-initialising (presumably to the project/run created above -- verify against
# the config displayed by this cell).
#dq.init(task_type="text_classification", project_name="gonzaga", run_name="duke")
dq.config

In [None]:
# Split name used by every inference logging call below.
split = "inference"
# One entry per inference run logged in this notebook. Each name must be one that
# fetch_dataset recognises, otherwise it raises ValueError.
INFERENCE_NAMES = ["03-14-2022", "03-21-2022", "all-customers"]

In [None]:
# Rebuild the base DataFrame and label list (same call as in the train/test cell;
# presumably repeated so the inference section can be run on its own -- verify).
base_dataset, labels = create_dataset()

In [None]:
# Fetch the slice of the base dataset that fetch_dataset assigns to each inference run.
datasets = {
    run_name: fetch_dataset(base_dataset, split, run_name)
    for run_name in ("03-14-2022", "03-21-2022", "all-customers")
}
week1_dataset = datasets["03-14-2022"]
week2_dataset = datasets["03-21-2022"]
all_dataset = datasets["all-customers"]

# First row id of each run, in the same key order as `datasets`; these mirror the
# slice starts used by fetch_dataset.
starting_indices = dict(zip(datasets, (200, 300, 400)))

In [None]:
for inference_name in INFERENCE_NAMES:
    inference_dataset = datasets[inference_name]
    starting_index = starting_indices[inference_name]
    # Size the id range from the data itself so ids and texts always have the same
    # length, instead of assuming every run has exactly 100 rows.
    ids = list(range(starting_index, starting_index + len(inference_dataset)))
    # Inference doesn't expect labels, but does need an inference name
    dq.log_data_samples(
        texts=inference_dataset["text"],
        split=split,
        inference_name=inference_name,  # could be removed if we only log 1 inference run at a time, would use stringified timestamp
        ids=ids
    )

In [None]:
import numpy as np

def get_model_outputs(data, starting_index):
    """Generate fake model outputs for ``data``.

    Args:
        data: Any sized collection; only its length is used.
        starting_index: Id assigned to the first row; later rows get
            consecutive ids.

    Returns:
        A ``(embs, logits, ids)`` tuple: random embeddings of shape
        ``(len(data), 768)``, random logits of shape ``(len(data), 20)``, and
        a list of ``len(data)`` consecutive ids starting at ``starting_index``.
    """
    num_rows = len(data)
    logits = np.random.rand(num_rows, 20) # fake logits
    embs = np.random.rand(num_rows, 768) # fake embeddings
    # One id per row: the range follows the data length rather than a fixed 100,
    # so ids always line up with the generated outputs.
    ids = list(range(starting_index, starting_index + num_rows))

    return embs, logits, ids

In [None]:
for inference_name in INFERENCE_NAMES:
    # Set split takes in an optional inference name
    dq.set_split(split, inference_name=inference_name)

    # Fake outputs for this run; ids are generated from the same starting index
    # that was used when the input samples were logged.
    embs, logits, ids = get_model_outputs(datasets[inference_name], starting_indices[inference_name])
    # No split / inference name is passed here -- presumably the values from
    # set_split above apply (verify).
    dq.log_model_outputs(embs=embs, logits=logits, ids=ids)

In [None]:
# Show the local log directory tree for the current project id / run id.
!tree ~/.galileo/logs/{dq.config.current_project_id}/{dq.config.current_run_id}

In [None]:
# Finish will kick off a job with name "inference"
# The labels are set again before finishing, as in the train/test cell.
dq.set_labels_for_run(labels)
dq.finish()


## Log a new training run, inference data is wiped

By default, logging a new training or test run wipes all Minio data. We log a new training run and can confirm that all data is wiped in the Minio bucket.

In [None]:
# Re-initialise the same project/run and log a fresh training / test run.
dq.init(task_type="text_classification", project_name="gonzaga", run_name="duke")
base_dataset, labels = create_dataset()

# Log the input samples with explicit ids taken from the DataFrame index, which is
# also what log_fake_data sends as ids with the model outputs.
train_dataset = fetch_dataset(base_dataset, "training")
dq.log_data_samples(texts=train_dataset['text'], labels=train_dataset['label'], split="training", ids=train_dataset.index.to_list())
test_dataset = fetch_dataset(base_dataset, "test")
dq.log_data_samples(texts=test_dataset['text'], labels=test_dataset['label'], split="test", ids=test_dataset.index.to_list())

# log_fake_data takes the dataset itself and the split name, not a row count and
# a number; outputs are logged for both splits whose samples were logged above.
log_fake_data(train_dataset, "training")
log_fake_data(test_dataset, "test")
dq.set_labels_for_run(labels)
dq.finish()

In [None]:
from dataquality.schemas.split import Split
from dataquality.clients.api import ApiClient
import pandas as pd

# Export the training split of a project/run to a local CSV and load it for
# inspection. The project and run names come from get_project_run_name()
# (presumably the current project/run -- verify).
api_client = ApiClient()
pname, rname = api_client.get_project_run_name()
api_client.export_run(pname, rname, Split.training, "training_data.csv")

# The last expression is displayed as the cell output.
pd.read_csv("training_data.csv")