# Natural Language Inference using PyTorch and 🔭 Galileo

In this tutorial, we'll train a model with PyTorch and explore the results in Galileo.

**Make sure to select GPU in your Runtime! (Runtime -> Change Runtime type)**

In [None]:
#@title Install `dataquality`

# Upgrade pip
!pip install -U pip &> /dev/null

# Install all dependecies
!pip install -U dataquality torch torchmetrics==0.10.0 datasets transformers &> /dev/null

print('👋 Installed necessary libraries.')


# 1. Initialize Galileo

In [None]:
import dataquality as dq
# 🔭🌕 Galileo logging
# Start a text-classification run under the given project and run names;
# the later dq.* calls in this notebook rely on this having been executed.
dq.init(
    task_type="text_classification",
    project_name="natural_language_inference_pytorch",
    run_name="example_run_nli_torch_1",
)

# 2. Load Data

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can select any dataset from [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:text-classification&task_ids=task_ids:natural-language-inference&sort=downloads) which contains the columns `premise` and `hypothesis`.

dataset_name = 'hans' #@param ["snli", "sem_eval_2014_task_1", "hans"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

from IPython.utils import io
from datasets import load_dataset, get_dataset_config_names

# Try to load the data with its output silenced. If loading fails with the
# "Config name is missing" ValueError, the dataset needs a config (subset),
# so ask the user which one to use.
try:
  with io.capture_output():
    data = load_dataset(dataset_name)
except ValueError as e:
  if "Config name is missing" not in repr(e):
    # Any other ValueError is unrelated: re-raise it untouched.
    raise

  configs = get_dataset_config_names(dataset_name)
  print(f"The dataset {dataset_name} has multiple subsets {configs}.")
  config = input("🖖 Enter the name of the subset to pick (or leave blank for any): ")
  if config:
    assert config in configs, f"{config} is not a valid subset"
  else:
    # Blank answer: fall back to the first listed subset.
    config = configs[0]
  with io.capture_output():
    data = load_dataset(dataset_name, name=config)

# Check that the dataset has at least train and either of validation/test
assert "train" in data and {"validation", "test"}.intersection(data), \
f"💾 The dataset {dataset_name} has either no train, or no validation or test splits, select another one."

print(f"\n🏆 Dataset {dataset_name} loaded successfully")

# A small function for minimizing the dataset for testing
import os

def _minimize_for_ci() -> bool:
    """Tell whether the MINIMIZE_FOR_CI environment variable is exactly "true".

    An unset variable is treated as "false"; the comparison is case-sensitive.
    """
    flag = os.environ.get("MINIMIZE_FOR_CI", "false")
    return flag == "true"

In [None]:
#@markdown Convert HF dataset to Pandas dataframes 
import pandas as pd

def _split_to_frame(split, label_col, id_to_label):
  """Turn one dataset split into a frame with `label`, `id` and `text` columns."""
  frame = pd.DataFrame.from_dict(split)
  frame['label'] = frame[label_col].map(id_to_label)

  # Class ids that are missing from the mapping (e.g. -1 used by some datasets
  # for unlabeled examples) map to NaN. Drop those rows so NaN never ends up
  # in the list of labels or in the label-index lookup later on.
  unlabeled = frame['label'].isna()
  if unlabeled.any():
    print(f"Dropping {int(unlabeled.sum())} rows without a valid label.")
    frame = frame[~unlabeled].reset_index(drop=True)

  frame['id'] = frame.index
  frame['text'] = frame['premise'] + "  <>  " + frame['hypothesis']
  return frame


def load_pandas_df(data):
  """Convert the train and evaluation splits of `data` into pandas frames.

  The label column is the single column whose name contains "label"; when
  there is no such column, or more than one, the user is asked for the name.
  The evaluation split is "validation" when present, otherwise "test".

  Args:
    data: mapping of split name to split. The "train" split must expose
      `.features`, and the chosen label feature must expose `.names`.

  Returns:
    (train_df, test_df). Each frame keeps the original columns and gains
    `label` (the class name), `id` (the row index) and `text` (premise and
    hypothesis joined by "  <>  "). Rows without a valid label are removed.

  Raises:
    AssertionError: if the column name typed by the user does not exist.
  """
  # Find the name of the ground truth column
  col_names = list(data['train'].features)
  label_candidates = [name for name in col_names if "label" in name]
  if len(label_candidates) == 1:
    label_col = label_candidates[0]
  else:
    print(f"The name of the columns are {col_names}.")
    label_col = input("🏅 Please enter the name of the column containing the labels: ")
    assert label_col in col_names, f"{label_col} is not an existing column"

  # Map each class id (position in `.names`) to its class name
  id_to_label = dict(enumerate(data['train'].features[label_col].names))

  test_split_name = "validation" if "validation" in data else "test"
  train_df = _split_to_frame(data["train"], label_col, id_to_label)
  test_df = _split_to_frame(data[test_split_name], label_col, id_to_label)

  return train_df, test_df

# Build both frames and record the label names that occur in training.
train_df, test_df = load_pandas_df(data)
labels = train_df["label"].unique().tolist()

# When minimizing for CI, keep only the first 10 rows of each split.
if _minimize_for_ci():
  train_df = train_df.iloc[:10]
  test_df = test_df.iloc[:10]

# 3. Log input data with Galileo
Input data can be logged via `log_data_samples` (or `log_dataset` for logging iterables). This step will log input samples, gold labels, data split, and list of all labels. You can achieve this by adding one line of code to the standard PyTorch `Dataset` class.

In [None]:
import numpy as np
import torch
from transformers import AutoTokenizer
from typing import List

# Label names in order of first appearance in the training frame.
list_of_labels = train_df["label"].drop_duplicates().tolist()

# 🔭🌕 Galileo logging
dq.set_labels_for_run(list_of_labels)

class TextDataset(torch.utils.data.Dataset):
    """Torch dataset over a frame with `text`, `label` and `id` columns.

    On construction the frame is logged to Galileo, every text is tokenized
    with the "distilbert-base-uncased" tokenizer, and each label is encoded
    as its position in `list_of_labels`.
    """

    def __init__(
        self, dataset: pd.DataFrame, split: str, list_of_labels: List[str] = None
    ):
        """Log, tokenize and label-encode `dataset`.

        Args:
            dataset: frame holding the `text`, `label` and `id` columns.
            split: split name forwarded to `dq.log_dataset`.
            list_of_labels: label names defining the index of each class.
                When empty or None, the unique labels of `dataset` are used.
        """
        self.dataset = dataset

        # 🔭🌕 Galileo logging
        # Note: this works seamlessly because self.dataset has text, label, and
        # id columns. See `help(dq.log_dataset)` for more info
        dq.log_dataset(self.dataset, split=split)

        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        texts = self.dataset["text"].tolist()
        self.encodings = tokenizer(texts, truncation=True, padding=True)

        if list_of_labels:
            self.list_of_labels = list_of_labels
        else:
            self.list_of_labels = self.dataset["label"].unique().tolist()

        # Encode every label as its position in list_of_labels.
        position_of = self.list_of_labels.index
        self.labels = np.array(list(map(position_of, self.dataset["label"])))

    def __getitem__(self, idx):
        """Return `(sample id, input ids, attention mask, label index)` for row `idx`."""
        input_ids = torch.tensor(self.encodings["input_ids"][idx])
        mask = torch.tensor(self.encodings["attention_mask"][idx])
        return self.dataset.id.iloc[idx], input_ids, mask, self.labels[idx]

    def __len__(self):
        """Number of rows in the underlying frame."""
        return self.dataset.shape[0]



# Build the training split first: the validation split reuses its label list
# so that label indices agree across both datasets.
train_dataset = TextDataset(dataset=train_df, split="training")
test_dataset = TextDataset(
    dataset=test_df,
    split="validation",
    list_of_labels=train_dataset.list_of_labels,
)



# 4. Log model data with Galileo

Model data can be logged via `log_model_outputs`. This step will log the model logits and embeddings. You can achieve this by adding one line of code to a standard PyTorch model.

We log the [CLS]-token embedding from the final layer, but you can log the output of any custom layer as embeddings.

In [None]:
import torch
import torch.nn.functional as F
from torch.nn import Linear
from transformers import AutoModel


class TextClassificationModel(torch.nn.Module):
    """Pretrained "distilbert-base-uncased" encoder with a linear classifier on top."""

    def __init__(self, num_labels: int):
        """Load the pretrained encoder and create a `num_labels`-way linear head."""
        super().__init__()
        encoder = AutoModel.from_pretrained("distilbert-base-uncased")
        self.feature_extractor = encoder
        self.classifier = Linear(encoder.config.hidden_size, num_labels)

    def forward(self, x, attention_mask, ids):
        """Encode a batch, log it to Galileo and return the classifier outputs.

        Args:
            x: token ids, passed to the encoder as `input_ids`.
            attention_mask: attention mask passed to the encoder.
            ids: sample ids, only forwarded to `dq.log_model_outputs`.

        Returns:
            The raw outputs of the linear classifier (no softmax is applied).
        """
        encoder_output = self.feature_extractor(
            input_ids=x, attention_mask=attention_mask
        )
        # Keep only the first token position of the final hidden states.
        cls_embedding = encoder_output.last_hidden_state[:, 0]
        logits = self.classifier(cls_embedding)

        # 🔭🌕 Galileo logging
        dq.log_model_outputs(embs=cls_embedding, logits=logits, ids=ids)

        return logits

# 5. Putting into Action: Training a Model

We complete the training pipeline by using a standard PyTorch training setup. While training, we log the current `epoch` and `split`. To complete logging, we call `dq.finish()` after training.

In [None]:
import numpy as np
import random
import torch
import torch.nn.functional as F
import torchmetrics
from tqdm.notebook import tqdm

BATCH_SIZE = 32
NUM_EPOCHS = 3
SEED = 42

# Seed the Python, NumPy and PyTorch RNGs so that classifier initialisation and
# batch shuffling repeat across runs (GPU kernels may still be nondeterministic).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

model = TextClassificationModel(num_labels=len(train_dataset.list_of_labels))
model.to(device)

optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5
)

train_acc = torchmetrics.Accuracy()
val_acc = torchmetrics.Accuracy()

for epoch in range(NUM_EPOCHS):
    # 🔭🌕 Galileo logging
    dq.set_epoch(epoch)

    model.train()
    running_loss = 0.0
    # Start from a clean metric so the reported accuracy covers this epoch only.
    train_acc.reset()

    # 🔭🌕 Galileo logging
    dq.set_split("training")

    # `batch` (not `data`) so the loaded HF dataset global is not overwritten.
    for batch in tqdm(train_dataloader):
        x_idxs, x, attention_mask, y = batch
        x = x.to(device)
        attention_mask = attention_mask.to(device)
        # The DataLoader already collated the labels into a tensor; move it
        # instead of re-wrapping it, and make sure it is int64 for the loss.
        y = y.to(device=device, dtype=torch.long)

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        logits = model(x, attention_mask, x_idxs)
        # The model returns raw logits, so use cross_entropy (log_softmax +
        # nll_loss); nll_loss alone expects log-probabilities.
        loss = F.cross_entropy(logits, y)
        loss.backward()
        optimizer.step()

        # accumulate statistics
        running_loss += loss.item()
        train_acc(torch.argmax(logits.to("cpu"), 1), y.to("cpu"))

    print("[epoch %d] Training loss: %.3f" % (epoch + 1, running_loss))
    print(f"Train accuracy: {train_acc.compute()}")

    model.eval()
    with torch.no_grad():
        # 🔭🌕 Galileo logging
        dq.set_split("validation")

        validation_loss = 0.0
        # Reset so earlier epochs do not leak into this epoch's accuracy.
        val_acc.reset()
        for batch in tqdm(val_dataloader):
            x_idxs, x, attention_mask, y = batch

            x = x.to(device)
            attention_mask = attention_mask.to(device)
            y = y.to(device=device, dtype=torch.long)

            logits = model(x, attention_mask, x_idxs)
            loss = F.cross_entropy(logits, y)

            validation_loss += loss.item()
            val_acc(torch.argmax(logits.to("cpu"), 1), y.to("cpu"))

        print("[epoch %d] Validation loss: %.3f" % (epoch + 1, validation_loss))
        print(f"Val accuracy: {val_acc.compute()}")

print("Finished Training")

# 🔭🌕 Galileo logging
dq.finish()

# General Help and Docs
- To get help with your task's requirements, call `dq.get_data_logger().doc()`
- To see more general data and model logging docs, run `dq.docs()`

In [None]:
# Show the documentation of the current data logger.
data_logger = dq.get_data_logger()
data_logger.doc()

# Show the built-in help for the dataset-logging function.
help(dq.log_dataset)