# Multi-Label Text Classification using PyTorch and 🔭 Galileo

In this tutorial, we'll train a model with PyTorch and explore the results in Galileo.

**Make sure to select GPU in your Runtime! (Runtime -> Change Runtime type)**

In [None]:
#@title Install `dataquality`
# Probe for `dataquality`: if the import succeeds, nothing below runs.
try:
    import dataquality as dq
except ImportError:
    # Upgrade pip
    !pip install -U pip &> /dev/null

    # Install all dependencies (output is discarded via the /dev/null redirect)
    !pip install -U dataquality torch torchmetrics datasets transformers &> /dev/null
    
    print('👋 Installed necessary libraries and restarting runtime! This should only need to happen once.')
    print('🙏 Continue with the rest of the notebook or hit "Run All" again!')

    # Restart the runtime
    import os, time
    time.sleep(1) # gives the print statements time to flush
    os._exit(0) # exits without allowing the next cell to run

# 1. Login to Galileo

In [None]:
import dataquality as dq

# Log in to Galileo; the cells below call further `dq` functions.
dq.login()

# 2. Load Data

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can find more datasets [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:text-classification&task_ids=task_ids:multi-label-classification&sort=downloads).

dataset_name = 'lex_glue' #@param ["go_emotions", "lex_glue"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

from IPython.utils import io
from datasets import load_dataset, get_dataset_config_names

# Try to load the data. If a config (subset) is needed, pick one
try:
  # Output produced while loading is captured (and never displayed) to keep the cell quiet.
  with io.capture_output() as captured:
    data = load_dataset(dataset_name)
except ValueError as e:
  # Only the "Config name is missing" error is handled below; any other
  # ValueError is propagated unchanged.
  if "Config name is missing" not in repr(e):
    raise e

  # Ask the user which subset to use; an empty answer selects the first one listed.
  configs = get_dataset_config_names(dataset_name)
  print(f"The dataset {dataset_name} has multiple subsets {configs}.")
  config = input(f"🖖 Enter the name of the subset to pick (or leave blank for any): ")
  if config:
    assert config in configs, f"{config} is not a valid subset"
  else:
    config = configs[0]
  with io.capture_output() as captured:
    data = load_dataset(dataset_name, name=config)


# Check that the dataset has at least train and either of validation/test
# (the later conversion to DataFrames reads "train" and then "validation" or "test").
assert "train" in data and {"validation", "test"}.intersection(data), \
f"💾 The dataset {dataset_name} has either no train, or no validation or test splits, select another one."

print(f"\n🏆 Dataset {dataset_name} loaded succesfully")



In [None]:
#@markdown Convert HF dataset to Pandas dataframes 
import pandas as pd
import numpy as np

def load_pandas_df(data):
  """Convert a multi-label dataset into train/test pandas DataFrames.

  Each returned frame holds the split's original columns, one 0/1 column per
  class name, and an integer `id` column equal to the row index.

  Args:
    data: Mapping of split name to split. It must contain "train" and at least
      one of "validation" / "test" ("validation" wins when both exist). Each
      split exposes `.features`, supports column access via `split[column]`
      and is convertible with `pd.DataFrame.from_dict`.

  Returns:
    A `(train_df, test_df, labels_cols)` tuple, where `labels_cols` is the list
    of class names used as the one-hot column names.

  Raises:
    ValueError: If the label column is not a sequence of class labels, i.e. the
      selected dataset/subset is not a multi-label classification dataset.
    AssertionError: If the interactively entered column name does not exist.
  """
  train_features = data['train'].features
  col_names = list(train_features)

  # Find the name of the ground truth column; ask the user when it is ambiguous
  good_col_names = [name for name in col_names if "label" in name]
  if len(good_col_names) == 1:
    label_col = good_col_names[0]
  else:
    print(f"The name of the columns are {col_names}.")
    label_col = input(f"🏅 Please enter the name of the column containing the labels: ")
    assert label_col in col_names, f"{label_col} is not an existing column"

  # A multi-label column is a sequence feature wrapping a class-label feature.
  # Fail with an actionable message instead of an AttributeError otherwise.
  class_feature = getattr(train_features[label_col], "feature", None)
  class_names = getattr(class_feature, "names", None)
  if class_names is None:
    raise ValueError(
      f"Column '{label_col}' is not a sequence of class labels, so this does not "
      "look like a multi-label dataset. Please select another dataset or subset."
    )
  labels_cols = [str(name) for name in class_names]

  def binarize_label_indices(label_idxs):
    """Turn a list of active class indices into a 0/1 vector over all classes."""
    a = np.zeros(len(labels_cols), dtype=int)
    a[label_idxs] = 1
    return a

  def split_to_frame(split_name):
    """Build the DataFrame (original columns + one-hot labels + id) for one split."""
    split = data[split_name]
    frame = pd.DataFrame.from_dict(split)
    one_hot = pd.DataFrame(
      list(map(binarize_label_indices, split[label_col])), columns=labels_cols
    )
    frame = pd.concat([frame, one_hot], axis=1)
    # The row position doubles as the sample id
    frame['id'] = frame.index
    return frame

  train_df = split_to_frame("train")
  # Evaluate on "validation" when available, otherwise fall back to "test"
  test_df = split_to_frame("validation" if "validation" in data else "test")

  return train_df, test_df, labels_cols

# Convert the dataset loaded above into train/test DataFrames plus the list of label names
train_df, test_df, labels_cols = load_pandas_df(data)


# 3. Initialize Galileo

In [None]:
# 🔭🌕 Galileo logging
# 🔭🌕 Galileo logging
# The run name embeds the dataset name, with any "/" replaced by "-".
dq.init(task_type="text_multi_label", 
        project_name="multi_label_text_classification_pytorch", 
        run_name=f"example_run_{dataset_name.replace('/', '-')}")

# 4. Log input data with Galileo
Input data can be logged via `log_data_samples` (or `log_dataset` for logging iterables). This step logs the input samples, gold labels, data split, and the list of all labels. You can achieve this by adding one line of code to a standard PyTorch `Dataset` class.

In [None]:
import numpy as np
import torch
from transformers import AutoTokenizer
from typing import List

class MultiLabelDataset(torch.utils.data.Dataset):
    """Tokenized multi-label text dataset that logs its samples to Galileo.

    Args:
        dataset: Frame with a "text" column, an "id" column and one 0/1 column
            per entry of `list_of_labels`.
        split: Split name forwarded to `dq.log_data_samples`.
        list_of_labels: Names of the label columns in `dataset`.

    Each item is the tuple `(input_ids, attention_mask, id, labels)`.
    """

    def __init__(self, dataset: pd.DataFrame, split: str, list_of_labels: List[str] = None):

        self.dataset = dataset
        self.num_samples = len(self.dataset)

        self.labels = list_of_labels
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        self.encodings = tokenizer(
            self.dataset["text"].tolist(), truncation=True, padding=True
        )

        # Materialize labels and ids once: selecting the label columns from the
        # DataFrame inside __getitem__ would repeat that work for every sample.
        label_frame = self.dataset[self.labels]
        self._label_tensor = torch.tensor(label_frame.to_numpy())
        self._ids = self.dataset["id"].to_numpy()

        # 🔭🌕 Galileo logging
        dq.log_data_samples(
            texts=self.dataset["text"],
            # For every row, the names of the label columns whose value is 1
            task_labels=label_frame.apply(lambda row: list(row[row == 1].index.values), axis=1),
            ids=self.dataset["id"],
            split=split,
        )

    def __getitem__(self, idx):
        """Return `(input_ids, attention_mask, id, labels)` for the sample at position `idx`."""
        x = torch.tensor(self.encodings["input_ids"][idx])
        attention_mask = torch.tensor(self.encodings["attention_mask"][idx])
        y = self._label_tensor[idx]
        return x, attention_mask, self._ids[idx], y

    def __len__(self):
        """Number of samples, as recorded at construction time."""
        return self.num_samples

# 🔭🌕 Galileo logging
# Pass the label names to Galileo before any samples are logged below.
dq.set_tasks_for_run(labels_cols, binary=True)

# Constructing a MultiLabelDataset tokenizes the texts and logs the split's
# samples through `dq.log_data_samples` (see its __init__).
train_dataset = MultiLabelDataset(
    train_df, 
    split="training", 
    list_of_labels = labels_cols, 
)

# `test_df` (the validation or test split) is logged under the "validation" split.
test_dataset = MultiLabelDataset(
    test_df, 
    split="validation",
    list_of_labels = labels_cols,
)


# 5. Log model data with Galileo

Model data can be logged via `log_model_outputs`. This step logs the model logits and embeddings. You can achieve this by adding one line of code to a standard PyTorch model.

We log the [CLS]-token embedding from the final layer, but you can log any custom layer as the embedding.

In [None]:
import torch
import torch.nn.functional as F
from torch.nn import Linear
from transformers import AutoModel

class TextMultiLabelClassificationModel(torch.nn.Module):
    """Pretrained "distilbert-base-uncased" encoder with a two-layer head.

    The head emits one logit per task; every forward pass also reports the
    first-token embedding and the logits to Galileo.
    """

    def __init__(self, num_tasks):
        """Load the pretrained encoder and create the head for `num_tasks` outputs."""
        super().__init__()
        backbone = AutoModel.from_pretrained("distilbert-base-uncased")
        width = backbone.config.hidden_size
        self.feature_extractor = backbone

        # Head: hidden -> hidden -> one logit per task
        self.pre_classifier = Linear(width, width)
        self.classifier = Linear(width, num_tasks)

        self.dropout = torch.nn.Dropout(p=0.1)
        self.relu = torch.nn.ReLU()

    def forward(self, x, attention_mask, ids):
        """Return the logits for a batch; `ids` is only forwarded to Galileo."""
        encoder_output = self.feature_extractor(
            input_ids=x, attention_mask=attention_mask
        )
        # Position 0 of every sequence holds the [CLS] token
        cls_embedding = encoder_output.last_hidden_state[:, 0]

        head_features = self.dropout(self.relu(self.pre_classifier(cls_embedding)))
        logits = self.classifier(head_features)

        # 🔭🌕 Galileo logging
        dq.log_model_outputs(embs=cls_embedding, logits=logits, ids=ids)

        return logits

# 6. Putting into Action: Training a Model

We complete the training pipeline by using a standard PyTorch training setup. While training, we log the current `epoch` and `split`. To complete logging, we call `dq.finish()` after training.

In [None]:
import numpy as np
import random
import torch
import torch.nn.functional as F
import torchmetrics
from tqdm.notebook import tqdm
from torchmetrics import HammingDistance

BATCH_SIZE = 32
NUM_EPOCHS = 5

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

train_dataloader = torch.utils.data.DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
)
val_dataloader = torch.utils.data.DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False,
)

num_tasks = len(train_dataset.labels)
model = TextMultiLabelClassificationModel(num_tasks=num_tasks)
model.to(device)

optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5
)


def _make_hamming_distance(num_labels):
    """Create a multilabel Hamming-distance metric for `num_labels` labels.

    Current torchmetrics releases require the `task` argument. The fallback
    keeps the notebook usable on releases that presumably reject it
    (NOTE(review): confirm against the pinned torchmetrics version, if any).
    """
    try:
        return HammingDistance(task="multilabel", num_labels=num_labels)
    except (TypeError, ValueError):
        return HammingDistance()


train_hamming_distance = _make_hamming_distance(num_tasks)
val_hamming_distance = _make_hamming_distance(num_tasks)

for epoch in range(NUM_EPOCHS):
    # 🔭🌕 Galileo logging
    dq.set_epoch(epoch)

    model.train()
    running_loss = 0.0

    # 🔭🌕 Galileo logging
    dq.set_split("training")

    # NOTE: the batch is unpacked directly in the loop header so that no loop
    # variable overwrites the notebook-level `data` (the loaded dataset).
    for x, attention_mask, x_idxs, y in tqdm(train_dataloader, total=len(train_dataloader)):
        x = x.to(device)
        attention_mask = attention_mask.to(device)
        y = y.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        logits = model(x, attention_mask, x_idxs)
        loss = F.binary_cross_entropy_with_logits(logits, y.float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Track the hamming distance for EVERY batch (not just the last one).
        # Probabilities are passed explicitly so the metric's thresholding does
        # not depend on the value range of the raw logits.
        train_hamming_distance.update(
            torch.sigmoid(logits.detach()).to("cpu"), y.to("cpu")
        )

    print("[epoch %d]" % (epoch + 1))
    # Score = 1 - distance, matching what is reported for validation below
    print({'loss': running_loss / max(len(train_dataloader), 1),
           "hamming_score": "%.3f" % float(1 - train_hamming_distance.compute())})

    model.eval()
    with torch.no_grad():
        # 🔭🌕 Galileo logging
        dq.set_split("validation")

        validation_loss = 0.0
        for x, attention_mask, x_idxs, y in val_dataloader:
            x = x.to(device)
            attention_mask = attention_mask.to(device)
            y = y.to(device)

            logits = model(x, attention_mask, x_idxs)
            loss = F.binary_cross_entropy_with_logits(logits, y.float())

            validation_loss += loss.item()
            val_hamming_distance.update(torch.sigmoid(logits).to("cpu"), y.to("cpu"))

        print("[epoch %d] Validation loss: %.3f" % (
            epoch + 1, validation_loss / max(len(val_dataloader), 1)))
        print(f"Val hamming score: {1 - val_hamming_distance.compute()}")

    # Start every epoch with fresh metric state
    train_hamming_distance.reset()
    val_hamming_distance.reset()

print("Finished Training")

# 🔭🌕 Galileo logging
dq.finish()

# General Help and Docs
- To get help with your task's requirements, call `dq.get_data_logger().doc()`
- To see more general data and model logging docs, run `dq.docs()`

In [None]:
# Show the logging requirements for the current task
dq.get_data_logger().doc()
# Show the built-in help for `dq.log_dataset`
help(dq.log_dataset)