# New Galileo manual logging
## Improvements
* Log in batch
* Log NumPy arrays and PyTorch tensors
* Threaded logging execution

In [None]:
import os
# Select e-mail authentication for Galileo. Set before `dataquality` is
# imported in the next cell (presumably read by `dataquality.login()` --
# confirm against the dataquality docs).
os.environ.update(GALILEO_AUTH_METHOD='email')

In [None]:
import dataquality

# Authenticate against Galileo (presumably using the method selected through
# the GALILEO_AUTH_METHOD environment variable -- confirm).
dataquality.login()

# Initialise the dataquality client before any logging call is made
# (NOTE(review): looks like this starts a new project/run -- confirm).
dataquality.init()

In [None]:
"""
Create the Newsgroup dataset class. Uses the Hugging Face DistilBERT fast tokenizer.
"""

from sklearn.datasets import fetch_20newsgroups
import torch
from transformers import DistilBertTokenizerFast
import pandas as pd
import numpy as np

# Use the GalileoModelConfig and GalileoDataConfig to keep track of Galileo metrics for logging
from dataquality.core.integrations.config import GalileoModelConfig, GalileoDataConfig

def introduce_label_errors(df, column, shuffle_percent):
    """Shuffle roughly ``shuffle_percent`` percent of a DataFrame column.

    A random subset of row positions is drawn without replacement and the
    values at those positions are permuted among themselves. The multiset of
    values in the column is therefore preserved, while some of the selected
    rows end up holding a value that belonged to another row (a selected row
    may also keep its own value by chance).

    Args:
        df: pandas DataFrame to modify; ``df[column]`` is reassigned on it.
        column: Name of the column whose values are shuffled.
        shuffle_percent: Percentage (0-100) of the rows taking part in the
            shuffle.

    Returns:
        The same ``df`` object, with ``column`` replaced by the shuffled
        values.
    """
    # Work on a private copy: ``.values`` may be a view of the frame's own
    # data (and is read-only under pandas Copy-on-Write), so writing into it
    # directly is either an error or a hidden mutation of the caller's frame.
    values = df[column].to_numpy(copy=True)
    n_rows = values.shape[0]
    n_shuffled = round(n_rows * shuffle_percent / 100)
    picked = np.random.choice(np.arange(n_rows), n_shuffled, replace=False)

    # ``picked`` is in random order; writing those values back at the same
    # positions taken in sorted order permutes them among the chosen rows.
    values[np.sort(picked)] = values[picked]
    df[column] = values
    return df

class NewsgroupDataset(torch.utils.data.Dataset):
    """Torch dataset over a (truncated) split of the 20 Newsgroups corpus.

    On construction the split is fetched, truncated, optionally corrupted
    with label noise (training split only), logged to Galileo and tokenized
    with the ``distilbert-base-uncased`` fast tokenizer.

    Args:
        split: Subset passed to ``fetch_20newsgroups`` (e.g. ``"train"`` or
            ``"test"``). ``"test"`` is logged to Galileo as ``"validation"``.
        max_samples: Number of leading rows to keep. Defaults to 23, which
            keeps the demo small; ``None`` keeps the whole split.
    """

    def __init__(self, split: str, max_samples=23):
        newsgroups = fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes'))

        frame = pd.DataFrame({"text": newsgroups.data, "label": newsgroups.target})
        # Take an explicit copy of the truncated frame: it is assigned into
        # below, and writing to a bare slice triggers pandas'
        # SettingWithCopy machinery.
        self.dataset = frame.iloc[:max_samples].copy()

        # Shuffle 10% of the training labels to force-create mislabeled samples
        if split == "train":
            self.dataset = introduce_label_errors(self.dataset, "label", 10)

        # Galileo logging
        # Name the test split 'validation' for consistency with the model validation step
        _split = 'validation' if split == 'test' else split
        data_config = GalileoDataConfig(text=self.dataset['text'], labels=self.dataset['label'], split=_split)
        dataquality.log_batch_input_data(data_config)

        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        # Tokenize every text once up front; padding=True pads to the longest
        # sequence in the dataset, truncation=True clips to the model maximum.
        self.encodings = tokenizer(self.dataset["text"].tolist(), truncation=True, padding=True)

    def __getitem__(self, idx):
        """Return ``(idx, input_ids, attention_mask, label)`` for row ``idx``."""
        x = torch.tensor(self.encodings["input_ids"][idx])
        attention_mask = torch.tensor(self.encodings["attention_mask"][idx])
        # ``idx`` is a position (it also indexes the encodings lists), so use
        # positional access rather than index-label lookup on the Series.
        y = self.dataset["label"].iloc[idx]
        return idx, x, attention_mask, y

    def __len__(self):
        """Return the number of samples kept in the dataset."""
        return len(self.dataset)


In [None]:
## Step 2: Create a `DistilBERT` model with an overridden `forward()`

"""DistilBERT pytorch class for text classification
"""
import torch
from transformers import DistilBertForSequenceClassification, AdamW, DistilBertConfig, AutoModel
import torch.nn.functional as F

class DistilBERT(torch.nn.Module):
    """DistilBERT text classifier that logs its outputs to Galileo.

    ``self.model`` is a ``distilbert-base-uncased`` sequence classifier with
    20 labels and produces the predictions. ``self.feature_extractor`` is a
    separate ``distilbert-base-uncased`` encoder that is only used to compute
    the per-sample embedding sent to Galileo.
    """

    def __init__(self):
        """Load the pretrained classifier and the embedding extractor."""
        super().__init__()

        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased', config=DistilBertConfig(num_labels=20)
        )
        self.feature_extractor = AutoModel.from_pretrained('distilbert-base-uncased')

    def forward(self, x, attention_mask, x_idxs=None, epoch=None, split=None):
        """Classify a batch and log embeddings/probabilities to Galileo.

        Args:
            x: Batch of token ids, as produced by the tokenizer.
            attention_mask: Mask matching ``x`` that marks the non-padding
                tokens.
            x_idxs: Sample ids of the batch, forwarded to Galileo as ``ids``.
            epoch: Current epoch, forwarded to Galileo.
            split: Split name (e.g. ``'training'``), forwarded to Galileo.

        Returns:
            Log-probabilities over the labels (log-softmax of the logits
            along dim 1).
        """
        out = self.model(x, attention_mask=attention_mask)
        log_probs = F.log_softmax(out.logits, dim=1)

        #
        # 🌌 🔭 🤩 Galileo logs output data during forward calls 🌌 🔭 🤩
        #
        # The logged tensors never take part in the loss, so compute them
        # without building an autograd graph.
        with torch.no_grad():
            probs = F.softmax(out.logits, dim=1)
            # Pass the attention mask so padding tokens do not influence the
            # embedding (without it the result depends on the padding length).
            encoded_layers = self.feature_extractor(
                x, attention_mask=attention_mask, return_dict=False
            )[0]
            # Keep only the hidden state of the first token of every sequence.
            emb = encoded_layers[:, 0].contiguous()

        model_outputs = GalileoModelConfig(emb=emb, probs=probs, ids=x_idxs, epoch=epoch, split=split)
        dataquality.log_model_outputs(model_outputs)

        return log_probs


In [None]:
from tqdm import tqdm
import torchmetrics

# Number of passes over the training data. The loop below is driven by this
# value (it previously ignored it and hard-coded 2 epochs).
num_epochs = 2
# Print the running training loss every `log_every` mini-batches.
log_every = 400
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = DistilBERT()
model.to(device)
train_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("train"), batch_size=8, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("test"), batch_size=8, shuffle=True)

# Another option for logging in pytorch is the following.
# With this, you don't need to call dataquality.log_batch_input_data in your
# Dataset class:
#
#     from dataquality.core.integrations.torch import log_input_data
#     log_input_data(train_dataloader, 'training')
#     log_input_data(val_dataloader, 'validation')

optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5
)

# NOTE(review): recent torchmetrics releases require an explicit `task`
# argument for Accuracy -- confirm against the installed version.
train_acc = torchmetrics.Accuracy()
val_acc = torchmetrics.Accuracy()


for epoch in range(num_epochs):
    # ---- training ----
    model.train()
    train_acc.reset()  # report per-epoch accuracy, not a cross-epoch running value
    running_loss = 0.0
    batches_since_log = 0
    for i, data in enumerate(tqdm(train_dataloader)):
        # data is a batch of (sample ids, input ids, attention mask, labels)
        # TODO(@Nikita): x_idxs is sometimes None here -- investigate why.
        x_idxs, x, attention_mask, y = data
        x = x.to(device)
        attention_mask = attention_mask.to(device)
        # The default collate function already yields a tensor; as_tensor
        # moves it without the copy-construct warning of torch.tensor().
        y = torch.as_tensor(y, device=device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        log_probs = model(x, attention_mask, x_idxs=x_idxs, epoch=epoch, split='training')
        loss = F.nll_loss(log_probs, y)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        batches_since_log += 1
        train_acc(torch.argmax(log_probs.detach().to("cpu"), 1), y.to("cpu"))
        if i % log_every == 0:
            # Average over the batches actually accumulated since the last
            # print (the first print happens after a single batch).
            print(f'[epoch {epoch + 1}, iteration {i + 1:5d}] '
                  f'loss: {running_loss / batches_since_log:.3f}')
            running_loss = 0.0
            batches_since_log = 0
            print(f"Train accuracy: {train_acc.compute()}")

    # ---- validation ----
    model.eval()  # disable dropout while evaluating
    val_acc.reset()
    validation_loss = 0.0
    num_val_batches = 0
    with torch.no_grad():
        for data in tqdm(val_dataloader):
            x_idxs, x, attention_mask, y = data

            x = x.to(device)
            attention_mask = attention_mask.to(device)
            y = torch.as_tensor(y, device=device)

            log_probs = model(x, attention_mask, epoch=epoch, split='validation', x_idxs=x_idxs)
            loss = F.nll_loss(log_probs, y)

            # Accumulate separately from the training loss.
            validation_loss += loss.item()
            num_val_batches += 1
            val_acc(torch.argmax(log_probs.to("cpu"), 1), y.to("cpu"))

    # Guard against an empty validation loader and divide by the number of
    # batches rather than by the last batch index.
    if num_val_batches:
        print(f'[epoch {epoch + 1}] Validation loss: '
              f'{validation_loss / num_val_batches:.3f}')
        print(f"Val accuracy: {val_acc.compute()}")

print('Finished Training')


In [None]:
# Register one integer label id per output class of the wrapped classifier
# (0 .. num_labels - 1) with Galileo for this run.
labels = [*range(model.model.num_labels)]
dataquality.set_labels_for_run(labels)

In [None]:
# Close out the Galileo run (NOTE(review): presumably flushes/uploads the
# logged data -- confirm against the dataquality docs).
dataquality.finish()

# Last expression of the cell, so the notebook displays the current config.
dataquality.config