# Text Classification using PyTorch and 🔭 Galileo

In this tutorial, we'll train a model with PyTorch and explore the results in Galileo.

**Make sure to select GPU in your Runtime! (Runtime -> Change Runtime type)**

In [None]:
#@title Install `dataquality` with `pip install dataquality`
# The install only runs when `dataquality` cannot be imported, so re-running
# this cell on an already prepared runtime skips the pip calls.
try:
    import dataquality as dq
except ImportError:
    # Upgrade pip and install dependencies.
    # The first command discards stdout and stderr, the second only stdout.
    !pip install -U pip &> /dev/null
    !pip install dataquality transformers datasets torchmetrics==0.10.0 --upgrade 1> /dev/null

    print('👋 Installed necessary libraries.')
    print('🙏 Continue with the rest of the notebook or hit "Run All" again!')


# 1. Load Data

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can find more datasets [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:text-classification&task_ids=task_ids:multi-class-classification&sort=downloads).

dataset_name = "generalization/newsgroups_Full-p_1" #@param ["generalization/newsgroups_Full-p_1", "banking77", "emotion", "rungalileo/conv_intent"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

from IPython.utils import io
from datasets import load_dataset, get_dataset_config_names

# First attempt: load without naming a config (subset). Output produced while
# loading is captured instead of being shown in the cell.
try:
  with io.capture_output() as captured:
    data = load_dataset(dataset_name)
except ValueError as e:
  # Only the "missing config" error is handled here; anything else propagates.
  if "Config name is missing" not in repr(e):
    raise e

  configs = get_dataset_config_names(dataset_name)
  print(f"The dataset {dataset_name} has multiple subsets {configs}.")
  # A blank answer falls back to the first listed subset.
  config = input("🖖 Enter the name of the subset to pick (or leave blank for any): ") or configs[0]
  assert config in configs, f"{config} is not a valid subset"
  with io.capture_output() as captured:
    data = load_dataset(dataset_name, name=config)

# Check that the dataset has at least train and either of validation/test
assert "train" in data and {"validation", "test"}.intersection(data), \
f"💾 The dataset {dataset_name} has either no train, or no validation or test splits, select another one."

# The rest of the notebook evaluates on data["test"]. A dataset that passed the
# check above with only a validation split would otherwise fail later with a
# KeyError, so the validation split is reused as the evaluation split.
if "test" not in data:
  data["test"] = data["validation"]

print(f"\n🏆 Dataset {dataset_name} loaded successfully")
# A small helper for minimizing the dataset for testing purposes
import os

def _minimize_for_ci() -> bool:
    """Return True when the MINIMIZE_FOR_CI environment variable equals "true"."""
    return os.getenv("MINIMIZE_FOR_CI", "false") == "true"

if _minimize_for_ci():
  # This is for github testing on cpu: keep at most 10 rows of the splits the
  # notebook trains and evaluates on. Splits that are absent are skipped and
  # splits with fewer than 10 rows are kept whole instead of raising.
  for split_name in ("train", "test"):
    if split_name in data:
      split = data[split_name]
      data[split_name] = split.select(range(min(10, len(split))))



In [None]:
#@title 2. Preprocess dataset and prepare training
#@markdown Most PyTorch training is done with the Dataloader.
#@markdown After preprocessing the data (tokenizing) we pass it on to it.
#@markdown Galileo relies on PyTorch's DataLoader.
#@markdown
#@markdown ```python
#@markdown train_dataloader = DataLoader(train_ds, batch_size=32, shuffle=True)
#@markdown test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE)
#@markdown ```

import torch
import torch.nn.functional as F
from torch.nn import Linear
from transformers import AutoModel
import torchmetrics
from tqdm import tqdm
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
BATCH_SIZE = 32

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess(row):
  """Tokenize the "text" entry of `row`, padding to max length and truncating."""
  texts = row["text"]
  return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split, then expose only the columns the training code reads
encoded_data = data.map(preprocess, batched=True)
dataloader_columns = ["input_ids","attention_mask","label"]
train_ds = encoded_data["train"].with_format("torch", dataloader_columns)
test_ds = encoded_data["test"].with_format("torch", dataloader_columns)

# Only the training data is shuffled; the evaluation metrics computed in this
# notebook do not depend on batch order, so the test loader keeps dataset order.
train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE)



class TextClassificationModel(torch.nn.Module):
    """Text classifier: a pretrained "distilbert-base-uncased" encoder plus a linear head.

    Args:
        num_labels: Number of output classes produced by the linear head.
    """

    def __init__(self, num_labels: int):
        super().__init__()
        encoder = AutoModel.from_pretrained("distilbert-base-uncased")
        head = Linear(encoder.config.hidden_size, num_labels)
        self.feature_extractor = encoder
        self.classifier = head

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return the classifier output for each input."""
        encoder_output = self.feature_extractor(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # The hidden state at index 0 of the second axis is used as the
        # embedding that feeds the classification head.
        first_position_state = encoder_output.last_hidden_state[:, 0]
        logits = self.classifier(first_position_state)
        return logits

NUM_EPOCHS = 5

# The number of classes is read from the label feature of the training split
num_classes = len(data['train'].features["label"].names)
model = TextClassificationModel(num_classes)

# Only parameters that require gradients are handed to the optimizer
trainable_params = (p for p in model.parameters() if p.requires_grad)
optimizer = torch.optim.AdamW(trainable_params, lr=1e-5)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
val_acc = torchmetrics.Accuracy().to(device)
train_acc = torchmetrics.Accuracy().to(device)
loss_fn = F.cross_entropy

model.to(device)

def extract_batch(batch):
  """Pop inputs, labels and attention mask from `batch` and move them to `device`.

  The three keys are removed from `batch` as a side effect.

  Returns:
    Tuple `(input_ids, label, attention_mask)`, each moved to `device`.
  """
  input_ids, labels, mask = (
      batch.pop(key).to(device) for key in ("input_ids", "label", "attention_mask")
  )
  return input_ids, labels, mask

def train_loop(model,dataloader,optimizer,loss_fn):
  """Train `model` for one pass over `dataloader` and print epoch statistics.

  Args:
    model: Called as `model(input_ids, attention_mask)`; its output is passed
      to `loss_fn` and its argmax over axis 1 to the accuracy metric.
    dataloader: Iterable of batches accepted by `extract_batch`.
    optimizer: Optimizer whose `zero_grad`/`step` are called once per batch.
    loss_fn: Callable `loss_fn(logits, labels)` returning the loss to backpropagate.
  """
  model.train()
  running_loss = 0.0
  num_batches = 0
  for batch in tqdm(dataloader):
      x, y, attention_mask = extract_batch(batch)
      # zero the parameter gradients
      optimizer.zero_grad()
      # forward + backward + optimize
      logits = model(x, attention_mask)
      loss = loss_fn(logits, y)
      loss.backward()
      optimizer.step()
      train_acc(logits.argmax(1), y)
      running_loss += loss.item()
      num_batches += 1

  # print statistics (skipped for an empty dataloader, where there is nothing to average)
  if num_batches:
    mean_loss = running_loss / num_batches
    print(f"Train loss: {mean_loss:.3f} | Train accuracy: {float(train_acc.compute()):.3f}")
    # Reset the module-level metric so the next epoch does not include this epoch's batches
    train_acc.reset()

def test_loop(model, dataloader, loss_fn, epoch):
  """Evaluate `model` on `dataloader` and report validation loss and accuracy.

  Args:
    model: Called as `model(input_ids, attention_mask)`; returns logits.
    dataloader: Iterable of batches accepted by `extract_batch`.
    loss_fn: Callable `loss_fn(logits, labels)` returning the batch loss.
    epoch: Zero-based epoch index; shown one-based in the progress bar.
  """
  model.eval()
  validation_loss = 0.0
  num_batches = 0
  pbar = tqdm(dataloader)
  for batch in pbar:
    x, y, attention_mask = extract_batch(batch)
    with torch.no_grad():
      logits = model(x, attention_mask)
      loss = loss_fn(logits, y)
    val_acc(logits.argmax(1), y)
    validation_loss += loss.item()
    num_batches += 1
    # Show the mean loss over the batches seen so far rather than the running
    # sum, so the value does not grow with the number of batches.
    pbar.set_description("[epoch %d] Validation loss: %.3f" % (epoch + 1, validation_loss / num_batches))

  if num_batches:
    # The progress bar is already closed once iteration has finished, so a
    # description set at this point is not rendered; print the accuracy instead.
    print(f"Val accuracy: {val_acc.compute()}")
    # Reset the module-level metric so each epoch reports its own accuracy
    val_acc.reset()

print("Preview of dataset:")
# Show the first training rows with integer class ids replaced by label names
label_names = data["train"].features["label"].names
example_df = data["train"].to_pandas().head()
example_df["label"] = example_df["label"].map(dict(enumerate(label_names)))
example_df

## 3. Monitor with Galileo

After logging the original dataset with the following columns:
```id,text,label```

we can hook the dataquality client into our model and dataloaders.

```python
import dataquality as dq
from dataquality.integrations.torch import watch

dq.init(...)
dq.log_dataset(...)
dq.set_labels_for_run(...)
watch(model, [train_dataloader, test_dataloader])
```



In [None]:
import dataquality as dq
from dataquality.integrations.torch import watch

# 🔭🌕 Initialize the project
dq.init(
    'text_classification',
    "text-classification-demo",
    f"run_{dataset_name.replace('/', '-')}",
)

# The columns logged are id,text,label, so every row first receives its
# index within the split as "id".
def _add_row_id(row, idx):
  """Return the extra "id" column value for the row at position `idx`."""
  return {"id": idx}

train_ds_with_ids = data["train"].map(_add_row_id, with_indices=True)
test_ds_with_ids = data["test"].map(_add_row_id, with_indices=True)

# 🔭🌕 Logging the datasets with Galileo (the test split is logged as "validation")
for split_name, ds_with_ids in (("train", train_ds_with_ids), ("validation", test_ds_with_ids)):
  dq.log_dataset(ds_with_ids, split=split_name)

# 🔭🌕 Logging the labels in order for Galileo
dq.set_labels_for_run(data["train"].features["label"].names)

# 🔭🌕 Monitor the model with Galileo
watch(model, [train_dataloader, test_dataloader])

# 4. Training

After hooking into the model, we set the current epoch and split during the training process.
```python
dq.set_split("train")
dq.set_epoch(epoch)
```

When training is complete,
```dq.finish()```
must be called.

In [None]:
from tqdm import tqdm
# Run NUM_EPOCHS rounds of training followed by evaluation. Before each loop
# the current epoch and split are set on the dataquality client.
for epoch in range(NUM_EPOCHS):
    dq.set_epoch(epoch) # 🔭🌕 Setting the epoch
    dq.set_split("training") # 🔭🌕 Setting split to training
    train_loop(model,train_dataloader,optimizer,loss_fn)
    dq.set_split("validation") # 🔭🌕 Setting split to validation
    # test_dataloader is built from the "test" split, which was logged under split="validation"
    test_loop(model, test_dataloader, loss_fn, epoch)
print("Finished Training")

dq.finish() # 🔭🌕 Finishing the run (called once, after all epochs)

# General Help and Docs
- To get help with your task's requirements, call `dq.get_data_logger().doc()`
- To see more general data and model logging docs, run `dq.docs()`

In [None]:
# Show the documentation of the data logger returned by dq.get_data_logger()
dq.get_data_logger().doc()
# Print the built-in help text of dq.log_dataset
help(dq.log_dataset)

In [None]:
# Get insights to the metrics: request the "train" split for the same project
# and run names that were passed to dq.init, and preview the first rows.
# NOTE(review): presumably returns a DataFrame-like object since .head() is called — confirm.
dq.metrics.get_dataframe(
        "text-classification-demo",
        f"run_{dataset_name.replace('/', '-')}",
        "train").head()