In [1]:
!pip install -q transformers torchmetrics

In [2]:
import dataquality

dataquality.login()

🔭 Logging you into Galileo

🔐 How would you like to login? 
Enter one of the following: email
email
🚀 You're logged in to Galileo as ben@rungalileo.io!


In [3]:
dataquality.init()

✨ Initializing project inc_scarlet_ptarmigan
🏃‍♂️ Starting run appropriate_black_crawdad
🛰 Created project, inc_scarlet_ptarmigan, and new run, appropriate_black_crawdad.


## Step 1: Inject Galileo Logger into the DataLoader

In [4]:
"""
Create the Newsgroup dataset class. Uses huggingface Bert Tokenizer. 
"""

from sklearn.datasets import fetch_20newsgroups
import torch
from transformers import DistilBertTokenizerFast
import pandas as pd
import numpy as np

# Use the GalileoModelConfig and GalileoDataConfig to keep track of Galileo metrics for logging
from dataquality.core.integrations.config import GalileoModelConfig, GalileoDataConfig

def introduce_label_errors(df, column, shuffle_percent):
    """
    Shuffle X percent of a column in a pandas DataFrame
    """

    arr = df[column].values
    shuffle = np.random.choice(np.arange(arr.shape[0]),
                              round(arr.shape[0] * shuffle_percent / 100),
                              replace=False)
    arr[np.sort(shuffle)] = arr[shuffle]
    df[column] = arr
    return df

class NewsgroupDataset(torch.utils.data.Dataset):
    def __init__(self, split: str):

        newsgroups = fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes'))

        self.dataset = pd.DataFrame()
        self.dataset["text"] = newsgroups.data
        self.dataset["label"] = newsgroups.target
        self.dataset = self.dataset[:23]

        # Shuffle X% of the training dataset to force create mislabeled samples
        if split == "train":
            self.dataset = introduce_label_errors(self.dataset, "label", 10)
            
        # Galileo logging
        self.gconfig = GalileoDataConfig(text=self.dataset['text'], labels=self.dataset['label'])
            
#         # Galileo logging
#         for i in range(len(self.dataset)):
#             payload = {
#                 "id": i,
#                 "text": self.dataset["text"][i],
#                 "gold": str(self.dataset["label"][i]),
#             }
#             g.logger.log_input(payload,
#                                logger_mode="training" if split == "train" else "test")

#         if split == "train":
#             g.logger.log_labels(newsgroups.target_names)

        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        # string encodings
        self.encodings = tokenizer(self.dataset["text"].tolist(), truncation=True, padding=True)

    def __getitem__(self, idx):
        x = torch.tensor(self.encodings["input_ids"][idx])
        attention_mask = torch.tensor(self.encodings["attention_mask"][idx])
        y = self.dataset["label"][idx]
        return idx, x, attention_mask, y

    def __len__(self):
        return len(self.dataset)

## Step 2: Create a `DistilBERT` model with overloaded `forward()`

In [5]:
"""DistilBERT pytorch class for text classification
"""
import torch
from transformers import DistilBertForSequenceClassification, AdamW, DistilBertConfig, AutoModel
import torch.nn.functional as F

class DistilBERT(torch.nn.Module):
    def __init__(self):
        """
        """
        super(DistilBERT, self).__init__()
        
        self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=DistilBertConfig(num_labels=20))
        self.feature_extractor = AutoModel.from_pretrained('distilbert-base-uncased')


    def forward(self, x, attention_mask, x_idxs=None, epoch=None, split=None):
        """

        """
        out = self.model(x, attention_mask=attention_mask)
        log_probs = F.log_softmax(out.logits, dim=1)

        #
        # 🌌 🔭 🤩 Galileo logs output data during forward calls 🌌 🔭 🤩
        #
        probs = F.softmax(out.logits, dim=1)
        encoded_layers = self.feature_extractor(x, return_dict=False)[0]

        self.g_model_config = GalileoModelConfig(emb=[i[0] for i in encoded_layers.tolist()], probs=probs.tolist(), 
                                                 ids=x_idxs.tolist(), epoch=epoch, split=split)
        
#         if logging and x_idxs is not None and epoch is not None:
#             for i in range(len(x_idxs)):
#                 index = int(x_idxs[i])
#                 prob = probs[i].detach().cpu().numpy().tolist()
#                 self.g.logger.log_output(
#                     {
#                         "epoch": epoch,
#                         "id": index,
#                         "emb": [0.0], # coming soon!
#                         "prob": prob,
#                     }
#                 )
        return log_probs

In [6]:
from tqdm import tqdm
import torchmetrics
from dataquality.core.integrations.torch import watch, log_input_data

num_epochs = 5
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = DistilBERT()
model.to(device)
train_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("train"), batch_size=8, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(NewsgroupDataset("test"), batch_size=8, shuffle=True)

log_input_data(train_dataloader, 'training')
log_input_data(val_dataloader, 'validation')

optimizer = torch.optim.AdamW(
            filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5
        )

train_acc = torchmetrics.Accuracy()
val_acc = torchmetrics.Accuracy()

watch(model)

for epoch in range(2):  
    model.train()
    running_loss = 0.0
    for i, data in enumerate(tqdm(train_dataloader)):
        # get the inputs; data is a list of [inputs, labels]
        # sometimes x_idxs is being set to None... why? @Nikita
        x_idxs, x, attention_mask, y = data
        x = x.to(device)
        attention_mask = attention_mask.to(device)
        y = torch.tensor(y, device=device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        log_probs = model(x, attention_mask, x_idxs=x_idxs, epoch=epoch, split='training')
        loss = F.nll_loss(log_probs, y)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        train_acc(torch.argmax(log_probs.to("cpu"), 1), y.to("cpu"))
        if i % 400 == 0:    # print every 400 mini-batches
            print('[epoch %d, iteration %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 400))
            running_loss = 0.0
            print(f"Train accuracy: {train_acc.compute()}")

    
    with torch.no_grad():
        validation_loss = 0.0
        for i, data in enumerate(tqdm(val_dataloader)):
            x_idxs, x, attention_mask, y = data

            x = x.to(device)
            attention_mask = attention_mask.to(device)
            y = torch.tensor(y, device=device)
            
            log_probs = model(x, attention_mask, epoch=epoch, split='validation', x_idxs=x_idxs)
            loss = F.nll_loss(log_probs, y)

            running_loss += loss.item()
            val_acc(torch.argmax(log_probs.to("cpu"), 1), y.to("cpu"))

        print('[epoch %d] Validation loss: %.3f' %
                  (epoch + 1, running_loss / i))
        print(f"Val accuracy: {val_acc.compute()}")

print('Finished Training')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  y = torch.tensor(y, device=device)
 33%|███████████████████████████████████████████████████████████████████▋                                                                                                                                       | 1/3 [00:20<00:40, 20.33s/it]

[epoch 1, iteration     1] loss: 0.007
Train accuracy: 0.125


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:57<00:00, 19.14s/it]
  y = torch.tensor(y, device=device)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00,  9.29s/it]


[epoch 1] Validation loss: 7.455
Val accuracy: 0.1304347813129425


 33%|███████████████████████████████████████████████████████████████████▋                                                                                                                                       | 1/3 [00:19<00:39, 19.59s/it]

[epoch 2, iteration     1] loss: 0.007
Train accuracy: 0.06451612710952759


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:56<00:00, 18.83s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00,  9.09s/it]

[epoch 2] Validation loss: 7.406
Val accuracy: 0.17391304671764374
Finished Training





In [7]:
dataquality.finish()

☁️ Uploading Data
🧹 Cleaning up


In [8]:
dataquality.config


Config(api_url='http://localhost:8000', minio_url='127.0.0.1:9000', minio_access_key='minioadmin', minio_secret_key='minioadmin', auth_method=<AuthMethod.email: 'email'>, token='eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJiZW5AcnVuZ2FsaWxlby5pbyIsImV4cCI6MTYzMzcyMTIyMH0.oY37cpqnQ4igpCgtyWIKDYqkhWYhZ-AF3JK0IfedW_w', current_user='ben@rungalileo.io', current_project_id='3b4ef80c-64c0-4970-b5d7-ce34355a4ec8', current_run_id='75a1f0ba-e409-46bc-b910-a2ff0b1bac65')