In [None]:
"""
Part 0. Log in to Galileo!
"""

import dataquality

# First dataquality call in the notebook: the "log in to Galileo" step named
# in this cell's header. It is called with no arguments.
# NOTE(review): how credentials are obtained (prompt, env vars, ...) is not
# visible here — confirm against the dataquality docs.
dataquality.login()

In [None]:
# Bare expression: the notebook renders the current `dataquality.config`
# object as this cell's output so it can be inspected right after login.
dataquality.config

In [None]:
"""
Part 0.1 Create your first project!
"""

# Called with no arguments.
# NOTE(review): presumably the client then picks default project/run
# settings — confirm against the dataquality docs.
dataquality.init()

In [None]:
# Display `dataquality.config` again, this time after `dataquality.init()`,
# so the output can be compared with the earlier display.
dataquality.config

In [None]:
"""
Part 0.2 Install some dependencies for this workflow exercise.
"""

%pip install torch sklearn transformers pandas numpy pytorch_lightning torchmetrics

In [None]:
"""
Part 1.

Log your datasets with Galileo.

Create the Newsgroup dataset class, tokenized with the Hugging Face DistilBERT tokenizer.

We introduce some label noise into the training split because
the newsgroup dataset is already well labeled.
"""

import torch
from sklearn.datasets import fetch_20newsgroups
from transformers import DistilBertTokenizerFast
import pandas as pd
import numpy as np


def introduce_label_errors(
    df: pd.DataFrame,
    column: str,
    shuffle_percent: int,
    rng: "np.random.Generator | None" = None,
) -> pd.DataFrame:
    """Inject label noise by permuting a random subset of the values in ``column``.

    A random ``shuffle_percent`` percent of the rows is selected and the values
    of ``column`` are permuted among those rows only, so the overall multiset
    of values is unchanged while some rows end up with another row's value.

    Args:
        df: Frame to modify. Its ``column`` is overwritten in place.
        column: Name of the column whose values are shuffled.
        shuffle_percent: Percentage (0-100) of rows taking part in the shuffle.
        rng: Optional NumPy ``Generator`` for reproducible noise. When omitted
            the global ``np.random`` state is used, as before.

    Returns:
        The same ``df`` object, with ``column`` updated.

    Raises:
        ValueError: If ``shuffle_percent`` is outside the range 0-100.
    """
    if not 0 <= shuffle_percent <= 100:
        raise ValueError(
            f"shuffle_percent must be between 0 and 100, got {shuffle_percent}"
        )

    # Work on a private copy: under pandas copy-on-write ``Series.values`` is a
    # read-only view, and writing into it would raise.
    arr = df[column].to_numpy(copy=True)
    n_rows = arr.shape[0]
    n_shuffled = round(n_rows * shuffle_percent / 100)
    if n_shuffled == 0:
        # Nothing to permute (also covers an empty frame).
        return df

    sampler = np.random if rng is None else rng
    shuffle = sampler.choice(n_rows, size=n_shuffled, replace=False)
    # `shuffle` is in random order; writing its values back into the same
    # positions taken in sorted order permutes the values among those rows.
    arr[np.sort(shuffle)] = arr[shuffle]
    df[column] = arr
    return df
    

class NewsgroupDataset(torch.utils.data.Dataset):
    """Small 20 Newsgroups text-classification dataset that logs its inputs to Galileo.

    On construction the requested subset is fetched, truncated to
    ``max_samples`` rows, optionally corrupted with label noise (training
    split only), logged row by row through ``dataquality.log_input_data`` and
    tokenized with the DistilBERT tokenizer.

    Each item is the tuple ``(idx, input_ids, attention_mask, label)``; ``idx``
    is the same id that was logged with the input.

    Args:
        split: Split name logged to Galileo. ``"training"`` loads the
            ``"train"`` subset; any other value loads the ``"test"`` subset.
        max_samples: Number of leading rows to keep (default 23, the original
            hard-coded size). ``None`` keeps the whole subset.
    """

    def __init__(self, split: str, max_samples: "int | None" = 23) -> None:
        # NOTE(review): every split other than "training" (including
        # "validation") reads the same "test" subset, so validation and test
        # data are identical here.
        subset = "train" if split == "training" else "test"
        newsgroups = fetch_20newsgroups(subset=subset,
                                        remove=('headers', 'footers', 'quotes'))

        full = pd.DataFrame({"text": newsgroups.data, "label": newsgroups.target})
        # Take an explicit copy with a fresh 0..n-1 index: assigning a column
        # on a plain slice (as the label-noise step does) would trigger
        # pandas' SettingWithCopyWarning.
        self.dataset = full.iloc[:max_samples].reset_index(drop=True).copy()

        # Shuffle some percentage of the training dataset
        # to force create mislabeled samples
        if split == "training":
            self.dataset = introduce_label_errors(self.dataset, "label", 11)

        #
        # 🔭 Logging Inputs with Galileo!
        #
        rows = zip(self.dataset["text"], self.dataset["label"])
        for i, (text, label) in enumerate(rows):
            dataquality.log_input_data({
                "id": i,
                "text": text,
                "gold": str(label),
                "split": split})

        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        self.encodings = tokenizer(self.dataset["text"].tolist(), truncation=True, padding=True)

    def __getitem__(self, idx):
        """Return ``(idx, input_ids, attention_mask, label)`` for row ``idx``."""
        x = torch.tensor(self.encodings["input_ids"][idx])
        attention_mask = torch.tensor(self.encodings["attention_mask"][idx])
        # Positional lookup, matching the positional ids logged in __init__.
        y = self.dataset["label"].iloc[idx]
        return idx, x, attention_mask, y

    def __len__(self):
        """Number of rows kept for this split."""
        return len(self.dataset)

In [None]:
"""
Part 2.

Log model outputs with Galileo.

We are using a DistilBERT pytorch lightning class for text classification.
"""

import pytorch_lightning as pl
from transformers import DistilBertForSequenceClassification, AdamW, DistilBertConfig, AutoModel
import torch.nn.functional as F
import torchmetrics


class LightningDistilBERT(pl.LightningModule):
    """DistilBERT 20-class text classifier that logs per-sample outputs to Galileo.

    ``self.model`` is the classifier being trained. ``self.feature_extractor``
    is a second, separately loaded pretrained encoder used only to produce the
    embedding that is logged; its output never enters the loss.
    """

    def __init__(self):
        super().__init__()
        self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=DistilBertConfig(num_labels=20))
        self.feature_extractor = AutoModel.from_pretrained('distilbert-base-uncased')
        # NOTE(review): recent torchmetrics releases appear to require a
        # `task=` argument for `Accuracy` — confirm against the installed version.
        self.train_acc = torchmetrics.Accuracy()
        self.val_acc = torchmetrics.Accuracy()
        self.test_acc = torchmetrics.Accuracy()

    def forward(self, x, attention_mask, x_idxs, epoch, split):
        """Classify a batch and, when ids are given, log each sample to Galileo.

        Args:
            x: Batch of token ids passed to the classifier.
            attention_mask: Mask passed alongside ``x`` to both models.
            x_idxs: Per-sample ids to log under, or ``None`` to skip logging.
            epoch: Epoch number recorded with each logged output.
            split: Split name recorded with each logged output.

        Returns:
            Log-probabilities over the classes (log-softmax of the logits).
        """
        out = self.model(x, attention_mask=attention_mask)
        log_probs = F.log_softmax(out.logits, dim=1)
        if x_idxs is not None:
            # Logging needs no gradients, so skip building an autograd graph.
            with torch.no_grad():
                probs = F.softmax(out.logits, dim=1).detach().cpu().numpy()
                # Pass the attention mask so padding tokens do not leak into
                # the embedding.
                hidden = self.feature_extractor(
                    x, attention_mask=attention_mask, return_dict=False)[0]
                # Position 0 of every sequence is the embedding that is logged.
                embeddings = hidden[:, 0].detach().cpu().numpy()
            for row, sample_id in enumerate(x_idxs):
                prob = probs[row].tolist()
                #
                # 🔭 Logging outputs with Galileo!
                #
                dataquality.log_model_output({
                    "id": int(sample_id),
                    "epoch": epoch,
                    "split": split,
                    "emb": embeddings[row].tolist(),
                    "prob": prob,
                    "pred": str(int(np.argmax(prob)))})
        return log_probs

    def _shared_step(self, batch, split, metric, metric_name):
        """Run one batch for ``split``: forward + log, NLL loss, accuracy update."""
        x_idxs, x, attention_mask, y = batch
        log_probs = self(x=x, attention_mask=attention_mask, x_idxs=x_idxs, epoch=self.current_epoch, split=split)
        loss = F.nll_loss(log_probs, y)
        metric(torch.argmax(log_probs, 1), y)
        self.log(metric_name, metric, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Model training step."""
        return self._shared_step(batch, "training", self.train_acc, "train_acc")

    def validation_step(self, batch, batch_idx):
        """Model validation step."""
        return self._shared_step(batch, "validation", self.val_acc, "val_acc")

    def test_step(self, batch, batch_idx):
        """Model test step."""
        return self._shared_step(batch, "test", self.test_acc, "test_acc")

    def configure_optimizers(self):
        """Model optimizers: AdamW (lr=1e-5) over all trainable parameters."""
        return torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=1e-5)

In [None]:
"""
Part 3.

Instantiate a model and train it with PyTorch Lightning.
"""

# Seed the Python, NumPy and torch RNGs so the injected label noise, the
# classifier initialisation and the batch order repeat across runs.
SEED = 42
BATCH_SIZE = 8
pl.seed_everything(SEED)

model = LightningDistilBERT()

# Only the training loader is shuffled. Lightning warns when validation/test
# loaders shuffle, and a fixed order keeps evaluation runs comparable.
train_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("training"), batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("validation"), batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("test"), batch_size=BATCH_SIZE, shuffle=False)

# num_sanity_val_steps=0 turns off Lightning's pre-training validation pass.
# NOTE(review): presumably so that no validation outputs are logged to Galileo
# before training starts — confirm.
trainer = pl.Trainer(max_epochs=2, num_sanity_val_steps=0)

trainer.fit(model, train_dataloader, validation_dataloader)
trainer.test(model, test_dataloader)

In [None]:
# Last dataquality call in the notebook, made after training and testing.
# NOTE(review): presumably this uploads/processes everything logged above —
# confirm against the dataquality docs.
dataquality.finish()