In [1]:
"""
Part 0. Log in to Galileo!
"""

import dataquality

# Interactive login: per the recorded output, this prompts for a login method
# (here "email") and saves it as the preferred method.
dataquality.login()

🔭 Logging you into Galileo

🔐 How would you like to login? 
Enter one of the following: email
email
🤝 Saving preferred login method
🚀 You're logged in to Galileo as anthony@rungalileo.io!


In [2]:
# NOTE(review): the Config repr shown in this cell's output includes the auth
# token and the MinIO access/secret keys in plain text — clear the output
# before committing or sharing the notebook.
dataquality.config

Config(api_url='http://localhost:8000', minio_url='127.0.0.1:9000', minio_access_key='minioadmin', minio_secret_key='minioadmin', minio_region='us-east-1', auth_method=<AuthMethod.email: 'email'>, token='eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhbnRob255QHJ1bmdhbGlsZW8uaW8iLCJleHAiOjE2MzY4MzMyNjR9.fhltRJ9hVj-AiPyaPXrv5KZCUw_KU18KSbd-jUPRSs0', current_user='anthony@rungalileo.io', current_project_id=UUID('f3693488-f956-403f-8f69-cfdd1f1a858f'), current_run_id=UUID('791c49c8-b7ef-4ce6-91f3-42851455c153'), labels=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19'], observed_num_labels=20)

In [3]:
"""
Part 0.1 Create your first project!
"""

# Called without arguments; the recorded output shows a new project and a new
# run being created, each with a name (apparently auto-generated — confirm).
dataquality.init()

✨ Initializing project drab_fuchsia_krill
🏃‍♂️ Starting run occasional_purple_aardwolf
🛰 Created project, drab_fuchsia_krill, and new run, occasional_purple_aardwolf.


In [4]:
# Re-display the config to see the project/run ids that init() just set.
# NOTE(review): this repr also prints the auth token and the MinIO secret key —
# clear the output before committing or sharing the notebook.
dataquality.config

Config(api_url='http://localhost:8000', minio_url='127.0.0.1:9000', minio_access_key='minioadmin', minio_secret_key='minioadmin', minio_region='us-east-1', auth_method=<AuthMethod.email: 'email'>, token='eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhbnRob255QHJ1bmdhbGlsZW8uaW8iLCJleHAiOjE2MzY4MzMyNjR9.fhltRJ9hVj-AiPyaPXrv5KZCUw_KU18KSbd-jUPRSs0', current_user='anthony@rungalileo.io', current_project_id='cb5b1a63-7716-4f7e-9c1b-8e2b07b05e19', current_run_id='c6774ab2-35bd-41d0-be16-abc9c22c65fb', labels=None, observed_num_labels=20)

In [5]:
"""
Part 0.2 Install some dependencies for this workflow exercise.
"""

%pip install -q torch sklearn transformers pandas numpy pytorch_lightning torchmetrics

Note: you may need to restart the kernel to use updated packages.


In [6]:
"""
Part 1.

Log your datasets with Galileo.

Create the Newsgroup dataset class. Using huggingface Bert Tokenizer.

We are introducing some noise to these datasets because 
the newsgroup dataset is already well labeled.
"""

import torch
from sklearn.datasets import fetch_20newsgroups
from transformers import DistilBertTokenizerFast
import pandas as pd
import numpy as np

#
# 🔭 Use the GalileoModelConfig and GalileoDataConfig to keep track of Galileo metrics for logging
#
from dataquality.core.integrations.config import GalileoModelConfig, GalileoDataConfig


def introduce_label_errors(
    df: pd.DataFrame, column: str, shuffle_percent: int, seed: "int | None" = None
) -> pd.DataFrame:
    """Inject label noise by permuting the values of ``column`` among a random subset of rows.

    ``round(len(df) * shuffle_percent / 100)`` row positions are drawn without
    replacement and their values are redistributed among those same positions.
    The multiset of values in the column is therefore unchanged, and a selected
    row may end up keeping its original value.

    Args:
        df: Frame to modify. ``df[column]`` is reassigned in place.
        column: Name of the column whose values are shuffled.
        shuffle_percent: Percentage (0-100) of rows taking part in the shuffle.
        seed: Optional seed for a dedicated NumPy generator. When ``None``
            (the default) the global ``np.random`` state is used, as before.

    Returns:
        The same ``df`` object, with ``column`` replaced by the noisy values.

    Raises:
        ValueError: Raised by NumPy when more rows are requested than exist
            (``shuffle_percent`` > 100) or the requested count is negative.
    """
    # Work on an explicit copy: `.values` may be a view onto the frame (or onto
    # the parent of a slice), and is read-only under pandas copy-on-write.
    values = df[column].to_numpy(copy=True)
    n_rows = values.shape[0]
    n_shuffle = round(n_rows * shuffle_percent / 100)
    positions = np.arange(n_rows)

    if seed is None:
        picked = np.random.choice(positions, n_shuffle, replace=False)
    else:
        picked = np.random.default_rng(seed).choice(positions, n_shuffle, replace=False)

    # `picked` is in random order; writing its values into the sorted positions
    # permutes the labels within the selected subset.
    values[np.sort(picked)] = values[picked]
    df[column] = values
    return df
    

class NewsgroupDataset(torch.utils.data.Dataset):
    """A small, tokenized sample of the 20 Newsgroups corpus.

    ``split == "training"`` loads the sklearn ``"train"`` subset; every other
    split name (e.g. ``"validation"`` or ``"test"``) loads the sklearn
    ``"test"`` subset. Only the first ``max_samples`` rows are kept. For the
    training split, label noise is injected via ``introduce_label_errors``.

    Attributes set by ``__init__``:
        dataset: DataFrame with ``text`` and ``label`` columns.
        gconfig: ``GalileoDataConfig`` built from the text and label columns.
        encodings: Tokenizer output for all texts (truncated and padded).
    """

    def __init__(self, split: str, max_samples: int = 17, noise_percent: int = 11) -> None:
        """Load, truncate, optionally corrupt, and tokenize the data.

        Args:
            split: ``"training"`` selects the train subset; anything else
                selects the test subset.
            max_samples: Number of leading rows to keep (``None`` keeps all).
            noise_percent: Percentage of training rows whose labels are
                shuffled; ignored for non-training splits.
        """
        subset = "train" if split == "training" else "test"
        newsgroups = fetch_20newsgroups(subset=subset,
                                        remove=('headers', 'footers', 'quotes'))

        frame = pd.DataFrame({"text": newsgroups.data, "label": newsgroups.target})
        # Copy the slice so the in-place label shuffle below operates on an
        # independent frame rather than a view of `frame`.
        self.dataset = frame.iloc[:max_samples].copy()

        # Shuffle some percentage of the training dataset
        # to force create mislabeled samples
        if split == "training":
            self.dataset = introduce_label_errors(self.dataset, "label", noise_percent)

        #
        # 🔭 Logging Inputs with Galileo!
        #
        self.gconfig = GalileoDataConfig(text=self.dataset['text'], labels=self.dataset['label'])

        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        self.encodings = tokenizer(self.dataset["text"].tolist(), truncation=True, padding=True)

    def __getitem__(self, idx):
        """Return ``(idx, input_ids, attention_mask, label)`` for row position ``idx``."""
        x = torch.tensor(self.encodings["input_ids"][idx])
        attention_mask = torch.tensor(self.encodings["attention_mask"][idx])
        # `idx` is a position, so index positionally instead of by index label.
        y = self.dataset["label"].iloc[idx]
        return idx, x, attention_mask, y

    def __len__(self):
        """Number of rows kept in the dataset."""
        return len(self.dataset)

In [7]:
"""
Part 2.

Log model outputs with Galileo.

We are using a DistilBERT pytorch lightning class for text classification.
"""

import pytorch_lightning as pl
from transformers import DistilBertForSequenceClassification, AdamW, DistilBertConfig, AutoModel
import torch.nn.functional as F
import torchmetrics


class LightningDistilBERT(pl.LightningModule):
    """DistilBERT sequence classifier that records per-batch outputs for Galileo.

    ``self.model`` is the classifier that produces the logits used for the
    loss. ``self.feature_extractor`` is a separate pretrained DistilBERT whose
    first-token hidden state is recorded as the sample embedding. After every
    forward pass the embeddings, probabilities and sample ids of the batch are
    stored on ``self.g_model_config``.
    """

    def __init__(self, num_labels: int = 20):
        """Build the classifier, the embedding extractor and one accuracy metric per split.

        Args:
            num_labels: Size of the classification head (defaults to 20).
        """
        super().__init__()
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased', config=DistilBertConfig(num_labels=num_labels))
        self.feature_extractor = AutoModel.from_pretrained('distilbert-base-uncased')
        self.train_acc = torchmetrics.Accuracy()
        self.val_acc = torchmetrics.Accuracy()
        self.test_acc = torchmetrics.Accuracy()

    def forward(self, x, attention_mask, x_idxs, epoch, split):
        """Run the classifier and stash the batch outputs for Galileo.

        Args:
            x: Token ids for the batch.
            attention_mask: Padding mask matching ``x``.
            x_idxs: Dataset indices of the samples in the batch.
            epoch: Accepted for interface compatibility; not used here.
            split: Accepted for interface compatibility; not used here.

        Returns:
            Log-probabilities over the classes (log-softmax over dim 1).
        """
        out = self.model(x, attention_mask=attention_mask)
        log_probs = F.log_softmax(out.logits, dim=1)
        probs = F.softmax(out.logits, dim=1)

        # The embeddings are only logged, never back-propagated, so skip
        # building an autograd graph. Pass the attention mask so padding
        # tokens do not influence the recorded embedding.
        with torch.no_grad():
            encoded_layers = self.feature_extractor(
                x, attention_mask=attention_mask, return_dict=False)[0]

        #
        # 🔭 Logging model outputs with Galileo!
        #
        # NOTE(review): presumably read by DataQualityCallback — confirm.
        self.g_model_config = GalileoModelConfig(
            emb=encoded_layers[:, 0].tolist(),  # first-token vector per sample
            probs=probs.tolist(),
            ids=x_idxs.tolist())

        return log_probs

    def _shared_step(self, batch, split, metric, metric_name):
        """Forward one batch, update and log ``metric``, and return the NLL loss."""
        x_idxs, x, attention_mask, y = batch
        log_probs = self(x=x, attention_mask=attention_mask, x_idxs=x_idxs,
                         epoch=self.current_epoch, split=split)
        loss = F.nll_loss(log_probs, y)
        metric(torch.argmax(log_probs, 1), y)
        self.log(metric_name, metric, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Model training step."""
        return self._shared_step(batch, "training", self.train_acc, "train_acc")

    def validation_step(self, batch, batch_idx):
        """Model validation step."""
        return self._shared_step(batch, "validation", self.val_acc, "val_acc")

    def test_step(self, batch, batch_idx):
        """Model test step."""
        return self._shared_step(batch, "test", self.test_acc, "test_acc")

    def configure_optimizers(self):
        """Model optimizers: AdamW (lr=1e-5) over all parameters that require gradients."""
        return torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=1e-5)

In [8]:
"""
Part 3.

Instantiate a model and train it with PyTorch Lightning.
"""

# Use the PyTorch Lightning Callback to log data to Galileo
from dataquality.core.integrations.lightning import DataQualityCallback

# Seed Python, NumPy and torch up front so the injected label noise, the newly
# initialised classifier head and the batch order are reproducible across runs.
pl.seed_everything(42)

model = LightningDistilBERT()

# Only the training loader needs shuffling; evaluation order does not affect
# the metrics, and Lightning warns about shuffled validation/test loaders.
# NOTE: NewsgroupDataset maps every split other than "training" to the sklearn
# "test" subset, so the validation and test loaders wrap the same rows.
train_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("training"), batch_size=8, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("validation"), batch_size=8, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("test"), batch_size=8, shuffle=False)

trainer = pl.Trainer(max_epochs=2, num_sanity_val_steps=0, callbacks=[DataQualityCallback()])

trainer.fit(model, train_dataloader, validation_dataloader)
trainer.test(model, test_dataloader)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

  member_class = getattr(cls, attr)


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.0}
--------------------------------------------------------------------------------


[{'test_acc': 0.0}]

In [9]:
# Register the run's label names: the stringified class indices of the classifier head.
dataquality.set_labels_for_run([str(label_id) for label_id in range(model.model.num_labels)])

In [10]:
# Per the recorded output, this uploads the logged data, cleans up, and submits a job.
# NOTE(review): the returned dict (displayed as the cell result) echoes the
# MinIO access/secret keys under `job_env_vars` — clear the output before sharing.
dataquality.finish()

☁️ Uploading Data
🧹 Cleaning up
Job default successfully submitted.


{'project_id': 'cb5b1a63-7716-4f7e-9c1b-8e2b07b05e19',
 'run_id': 'c6774ab2-35bd-41d0-be16-abc9c22c65fb',
 'job_name': 'default',
 'job_env_vars': {'GALILEO_LABELS': ['0',
   '1',
   '2',
   '3',
   '4',
   '5',
   '6',
   '7',
   '8',
   '9',
   '10',
   '11',
   '12',
   '13',
   '14',
   '15',
   '16',
   '17',
   '18',
   '19'],
  'GALILEO_PROJECT_ID': 'cb5b1a63-7716-4f7e-9c1b-8e2b07b05e19',
  'GALILEO_RUN_ID': 'c6774ab2-35bd-41d0-be16-abc9c22c65fb',
  'GALILEO_ROOT_BUCKET_NAME': 'galileo-project-runs',
  'GALILEO_ROOT_BUCKET_NAME_RESULTS': 'galileo-project-runs-results',
  'GALILEO_MINIO_REGION': 'us-east-1',
  'GALILEO_MINIO_K8S_SVC_ADDR': 'http://minio.galileo:9000',
  'GALILEO_MINIO_ACCESS_KEY_ID': 'minioadmin',
  'GALILEO_MINIO_SECRET_ACCESS_KEY': 'minioadmin',
  'GALILEO_MINIO_DISABLE_SSL': True}}