# Named Entity Recognition using PyTorch and 🔭 Galileo

In this tutorial, we'll train a model with PyTorch and explore the results in Galileo.

**Make sure to select GPU in your Runtime! (Runtime -> Change Runtime type)**

In [None]:
#@title Install `dataquality` with `pip install dataquality`
# Only install when `dataquality` cannot be imported, so re-running this cell
# in an already prepared environment does nothing.
try:
    import dataquality as dq
except ImportError:
    # Upgrade pip and install dependencies.
    # `&> /dev/null` hides all output of the pip upgrade; `1> /dev/null` hides
    # only stdout, so errors from the package install are still shown.
    !pip install -U pip &> /dev/null
    !pip install dataquality transformers datasets torchmetrics==0.10.0 --upgrade 1> /dev/null

    print('👋 Installed necessary libraries.')
    print('🙏 Continue with the rest of the notebook or hit "Run All" again!')


# 1. Load Data

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can select any dataset from [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:token-classification&task_ids=task_ids:named-entity-recognition&sort=downloads) which contains train/test splits and an `ner_tags` column.

dataset_name = 'conllpp' #@param ["wnut_17", "conllpp", "wikiann"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

from IPython.utils import io
from datasets import load_dataset, get_dataset_config_names

# Try to load the data with its output captured. If the load fails because a
# config (subset) name is required, ask the user for one and load again.
try:
  with io.capture_output() as captured:
    data = load_dataset(dataset_name)
except ValueError as e:
  # Any other ValueError is not ours to handle: re-raise it untouched.
  if "Config name is missing" not in repr(e):
    raise

  configs = get_dataset_config_names(dataset_name)
  print(f"The dataset {dataset_name} has multiple subsets {configs}.")
  # Strip the answer so stray spaces around a valid name (or a whitespace-only
  # answer meant as "blank") do not trip the validity check below.
  config = input("🖖 Enter the name of the subset to pick (or leave blank for any): ").strip()
  if config:
    assert config in configs, f"{config} is not a valid subset"
  else:
    # Blank answer: fall back to the first subset listed.
    config = configs[0]
  with io.capture_output() as captured:
    data = load_dataset(dataset_name, name=config)

# A small function for minimizing the dataset for testing
import os

def _minimize_for_ci() -> bool:
    """Tell whether the ``MINIMIZE_FOR_CI`` environment variable is exactly ``"true"``.

    An unset variable is treated as ``"false"``; the comparison is case-sensitive.
    """
    flag = os.environ.get("MINIMIZE_FOR_CI", "false")
    return flag == "true"

# Check that the dataset has at least train and test splits *before* touching
# them, so a missing split produces this message rather than a lookup error.
assert {"train", "test"}.issubset(data), (
  f"💾 The dataset {dataset_name} does not have train/test splits, please pick another one."
)

if _minimize_for_ci():
  # Keep at most the first 1000 rows of each split. The count is capped at the
  # split size because selecting row indices past the end of a split fails.
  data["train"] = data["train"].select(range(min(1000, len(data["train"]))))
  data["test"] = data["test"].select(range(min(1000, len(data["test"]))))

print(f"\n🏆 Dataset {dataset_name} loaded successfully")

In [None]:
#@title 2. Create model and prepare training

import torch
from torch.nn import Linear
from torch.nn import CrossEntropyLoss
from transformers import AutoModel
import torchmetrics
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification

# Hyperparameters: BATCH_SIZE is handed to the dataloaders as their batch size,
# NUM_EPOCHS is the number of passes of the training loop.
BATCH_SIZE = 32
NUM_EPOCHS = 5

# Tokenizer for the "distilbert-base-uncased" checkpoint (the same checkpoint
# name the model below loads), and the collator that turns tokenized examples
# into batches for the dataloaders.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer)


class TokenClassificationModel(torch.nn.Module):
  """Token classification model: a pretrained encoder with a linear head.

  The encoder is loaded with ``AutoModel.from_pretrained`` and the linear
  layer is applied to its ``last_hidden_state``, giving one score per label.

  Args:
    num_labels: Number of output labels of the linear head.
    model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
      Defaults to ``"distilbert-base-uncased"``. The inputs fed to the model
      should be produced by the tokenizer of the same checkpoint.
  """

  def __init__(self, num_labels: int, model_name: str = "distilbert-base-uncased"):
    super().__init__()
    self.feature_extractor = AutoModel.from_pretrained(model_name)
    # The head's input width follows the encoder's configured hidden size.
    self.classifier = Linear(self.feature_extractor.config.hidden_size, num_labels)

  def forward(self, x, attention_mask):
    """Run the encoder on ``x`` and return the classifier logits.

    Args:
      x: Passed to the encoder as ``input_ids``.
      attention_mask: Passed to the encoder as ``attention_mask``.

    Returns:
      The output of the linear head applied to the encoder's
      ``last_hidden_state``.
    """
    encoded_layers = self.feature_extractor(
        input_ids=x, attention_mask=attention_mask
    ).last_hidden_state
    logits = self.classifier(encoded_layers)
    return logits

# One output class per name listed in the dataset's `ner_tags` feature.
num_classes = len(data['train'].features['ner_tags'].feature.names)

model = TokenClassificationModel(num_classes)

# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.AdamW(
    (param for param in model.parameters() if param.requires_grad), lr=1e-5
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Accuracy metrics for the validation and training phases, kept on `device`.
val_acc = torchmetrics.Accuracy().to(device)
train_acc = torchmetrics.Accuracy().to(device)

# Targets equal to -100 are skipped by the loss
# (presumably the label padding value used by the collator — confirm).
loss_fn = CrossEntropyLoss(ignore_index=-100)

model.to(device)

def extract_batch(batch):
  """Pop the model inputs out of ``batch`` and move them to ``device``.

  The keys ``"input_ids"``, ``"labels"`` and ``"attention_mask"`` are removed
  from ``batch`` (in that order); any other entries are left in place.

  Returns:
    A tuple ``(x, y, attention_mask)`` of the popped values on ``device``.
  """
  wanted_keys = ("input_ids", "labels", "attention_mask")
  x, y, attention_mask = [batch.pop(key).to(device) for key in wanted_keys]
  return x, y, attention_mask

def train_loop(model, dataloader, optimizer, loss_fn):
  """Run one training pass over ``dataloader``, updating ``model`` in place.

  Args:
    model: Module called as ``model(x, attention_mask)``.
    dataloader: Iterable of batches accepted by ``extract_batch``.
    optimizer: Optimizer stepped once per batch.
    loss_fn: Callable taking the transposed logits and the targets.
  """
  model.train()
  for batch in tqdm(dataloader):
    input_ids, targets, mask = extract_batch(batch)
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()
    # Dimensions 1 and 2 of the logits are swapped before the loss
    # (assumes logits are (batch, sequence, labels) and the loss wants the
    # label dimension second — confirm).
    batch_loss = loss_fn(model(input_ids, mask).transpose(1, 2), targets)
    batch_loss.backward()
    optimizer.step()
   
def test_loop(model, dataloader, loss_fn, epoch):
  model.eval()
  test_loss, num_test_batches = 0.0, 0
  pbar = tqdm(dataloader)
  for batch in pbar:
    x, y, attention_mask = extract_batch(batch)
    with torch.no_grad():
      logits = model(x, attention_mask)
      loss = loss_fn(logits.transpose(1, 2), y)
    test_loss += loss.item()
    num_test_batches += 1
    pbar.set_description("[epoch %d] Validation loss: %.3f" % (epoch + 1, test_loss / num_test_batches))


## 3. Monitor with Galileo

After simply logging the original dataset, we can hook the dataquality client into our model and dataloaders.

```python
import dataquality as dq
from dataquality.integrations import hf
from dataquality.integrations.torch import watch

dq.init(...)
dq.log_dataset(...)
dq.set_labels_for_run(...)
watch(model, [train_dataloader, test_dataloader])
```



In [None]:
import dataquality as dq
from dataquality.integrations import hf
from dataquality.integrations.torch import watch

# 🔭🌕 Initialize the project
# (the three positional arguments are presumably task type, project name and
# run name — confirm against the dataquality docs)
dq.init('text_ner',
        "named-entity-recognition-demo",
        f"run_{dataset_name}"
        )


# 🔭🌕 Galileo tokenizes the HuggingFace DatasetDict and logs the dataset(s) present in it
tokenized_datasets = hf.tokenize_and_log_dataset(data, tokenizer)
# Label names are read from the `ner_tags` feature of the tokenized train split.
labels = tokenized_datasets['train'].features['ner_tags'].feature.names  

# Both dataloaders share the collator and batch size; only the training one shuffles.
train_dataloader = hf.get_dataloader(tokenized_datasets["train"], collate_fn=data_collator, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = hf.get_dataloader(tokenized_datasets["test"], collate_fn=data_collator, batch_size=BATCH_SIZE, shuffle=False)

# 🔭🌕 Logging the labels in order for Galileo
dq.set_labels_for_run(labels)

# 🔭🌕 Monitor the model with Galileo
watch(model, [train_dataloader, test_dataloader])

# 4. Training

After hooking into the model, we log the epoch and split during the training process.
```python
dq.set_split("train")
dq.set_epoch(epoch)
```

When training is complete, `dq.finish()` must be called.

In [None]:
from tqdm import tqdm
# Each epoch: tell Galileo the epoch and split, train on the training
# dataloader, then switch the split and evaluate on the test dataloader.
for epoch in range(NUM_EPOCHS):
    dq.set_epoch(epoch) # 🔭🌕 Setting the epoch
    dq.set_split("training") # 🔭🌕 Setting split to training
    train_loop(model,train_dataloader,optimizer,loss_fn)
    dq.set_split("test") # 🔭🌕 Setting split to test
    test_loop(model, test_dataloader, loss_fn, epoch)
print("Finished Training")

dq.finish() # 🔭🌕 Finishing the run

# General Help and Docs
- To get help with your task's requirements, call `dq.get_data_logger().doc()`
- To see more general data and model logging docs, run `dq.docs()`

In [None]:
# Call the data logger's `doc()` helper, then show Python's built-in help
# text for `dq.log_dataset`.
dq.get_data_logger().doc()
help(dq.log_dataset)

In [None]:
# Get insights into the metrics: preview the first rows of the dataframe
# returned for the same two names that were passed to `dq.init` above.
# The last argument is presumably the split to fetch — confirm.
dq.metrics.get_dataframe(
        "named-entity-recognition-demo",
        f"run_{dataset_name}",
        "train").head()