# Text Classification using PyTorch and 🔭 Galileo

In this tutorial, we'll train a model with PyTorch and explore the results in Galileo.

**Make sure to select GPU in your Runtime! (Runtime -> Change Runtime type)**

In [1]:
#@title Install `dataquality` with `pip install dataquality`
from IPython.display import clear_output
!pip install transformers datasets dataquality torchmetrics==0.8.3a0 torchmetrics vaex-ui -q
clear_output()
print("Installation done!")

Installation done!


## 1. Initialize Galileo
Start your project with two lines of code:


```python
import dataquality as dq
dq.init('text_classification',
        'test_project',
        'example_run')
```

In [None]:
import dataquality as dq
import datetime
# Tell the dataquality client which Galileo console to talk to.
dq.set_console_url("https://console.dev.rungalileo.io")

# Second-resolution timestamp with '-' instead of ':' in the time part, so
# each execution of this cell produces a distinct run name.
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")

# 🔭🌕 Initialize the project
project_name = 'test_project'
run_name = f'example_run_{current_time}'
dq.init('text_classification', project_name, run_name)

# 2. Load Data

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can find more datasets [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:text-classification&task_ids=task_ids:multi-class-classification&sort=downloads).

# Name of the dataset passed to `load_dataset` below. The trailing `#@param`
# annotation looks like Colab form syntax (presumably rendering this as an
# editable dropdown) - keep it on the same line as the assignment.
dataset_name = 'emotion' #@param ["rungalileo/conv_intent", "banking77", "emotion", "tweet_eval"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

from IPython.utils import io
from datasets import load_dataset, get_dataset_config_names

# Try to load the data. If the dataset needs a config (subset), ask for one.
try:
  with io.capture_output() as captured:
    data = load_dataset(dataset_name)
except ValueError as e:
  # Only the "config missing" failure is handled here; anything else is
  # re-raised unchanged (bare `raise` keeps the original traceback).
  if "Config name is missing" not in repr(e):
    raise

  configs = get_dataset_config_names(dataset_name)
  print(f"The dataset {dataset_name} has multiple subsets {configs}.")
  # Strip surrounding whitespace so an answer such as " name " (or a lone
  # space) is not rejected / mistaken for a subset name.
  config = input("🖖 Enter the name of the subset to pick (or leave blank for any): ").strip()
  if config:
    assert config in configs, f"{config} is not a valid subset"
  else:
    # Blank answer: fall back to the first listed subset.
    config = configs[0]
  with io.capture_output() as captured:
    data = load_dataset(dataset_name, name=config)

# Check that the dataset has at least train and either of validation/test
assert "train" in data and {"validation", "test"}.intersection(data), \
f"💾 The dataset {dataset_name} has either no train, or no validation or test splits, select another one."

print(f"\n🏆 Dataset {dataset_name} loaded successfully")
# A small function for minimizing the dataset for testing purposes
import os

def _minimize_for_ci() -> bool:
    """Tell whether the notebook should shrink its data for testing.

    Returns:
        True only when the MINIMIZE_FOR_CI environment variable is exactly
        the string "true"; False when it is unset or has any other value.
    """
    flag = os.environ.get("MINIMIZE_FOR_CI", "false")
    return flag == "true"

## 2. Prepare dataset

Galileo relies on PyTorch's `DataLoader`, so wrap each split in one:
```python
train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True)
```



In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
BATCH_SIZE = 32
# Upper bounds on how many rows of each split the tutorial works with.
MAX_TRAIN_SAMPLES = 6000
MAX_EVAL_SAMPLES = 1000

# The loading cell accepts datasets that only ship a "validation" split; fall
# back to it so the rest of the notebook can keep reading data["test"].
eval_split = "test" if "test" in data else "validation"

# Clamp the subset sizes to the split length: select() fails on indices past
# the end of a split, which the fixed 6000/1000 hit on smaller datasets.
data["train"] = data["train"].select(range(min(MAX_TRAIN_SAMPLES, len(data["train"]))))
data["test"] = data[eval_split].select(range(min(MAX_EVAL_SAMPLES, len(data[eval_split]))))

# Add an "id" column holding each row's index within its own split.
data = data.map(lambda x,idx : {"id":idx}, with_indices=True)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess(row):
  """Tokenize the "text" field of `row` with the notebook's tokenizer.

  Sequences are padded with padding="max_length" and truncated when too long,
  so every encoded example has the same length. Returns whatever the
  tokenizer returns for the given text.
  """
  tokenizer_options = {"padding": "max_length", "truncation": True}
  return tokenizer(row["text"], **tokenizer_options)

# Tokenize every split; batched=True hands `preprocess` many rows at a time.
encoded_data = data.map(preprocess, batched=True)

# Only these columns are returned (as torch tensors) by the formatted datasets.
dataloader_columns = ["input_ids","attention_mask","label"]
train_ds = encoded_data["train"].with_format(type="torch", columns=dataloader_columns)
test_ds = encoded_data["test"].with_format(type="torch", columns=dataloader_columns)

# Both loaders share the same batching/shuffling settings.
loader_options = {"batch_size": BATCH_SIZE, "shuffle": True}
train_dataloader = DataLoader(train_ds, **loader_options)
test_dataloader = DataLoader(test_ds, **loader_options)

clear_output()
print("Preview of dataset:")
# Map integer class ids to their names so the preview is human readable.
label_id_to_name = dict(enumerate(data["train"].features["label"].names))
example_df = data["train"].to_pandas().head()
example_df["label"] = example_df["label"].map(label_id_to_name)
example_df

In [None]:
#@markdown Convert HF dataset to Pandas dataframes 
import pandas as pd

def load_pandas_df(data):
  """Turn the train split and one evaluation split of `data` into DataFrames.

  The ground-truth column is the single feature whose name contains "label";
  when there is not exactly one such feature the user is prompted for the
  column name. In both returned frames the "label" column holds the class
  names (looked up through the feature's `names`) and "id" holds the frame
  index.

  Args:
    data: mapping of split name to dataset; must contain "train" and either
      "validation" (preferred) or "test".

  Returns:
    A `(train_df, test_df)` tuple of pandas DataFrames.

  Raises:
    AssertionError: if the column name typed by the user does not exist.
  """
  # Find the name of the ground truth column
  feature_names = list(data['train'].features)
  candidates = [col for col in feature_names if "label" in col]
  if len(candidates) != 1:
    print(f"The name of the columns are {feature_names}.")
    label_col = input(f"🏅 Please enter the name of the column containing the labels: ")
    assert label_col in feature_names, f"{label_col} is not an existing column"
  else:
    label_col = candidates[0]

  # Class id -> class name lookup
  id_to_name = dict(enumerate(data['train'].features[label_col].names))

  def _to_frame(split):
    # Materialise one split and attach readable labels plus row ids.
    frame = pd.DataFrame.from_dict(split)
    frame['label'] = frame[label_col].map(id_to_name)
    frame['id'] = frame.index
    return frame

  train_df = _to_frame(data["train"])

  # Prefer the validation split for evaluation, falling back to test
  eval_split_name = "validation" if "validation" in data else "test"
  test_df = _to_frame(data[eval_split_name])

  return train_df, test_df

train_df, test_df = load_pandas_df(data)

# For CI runs keep at most 10 rows per frame; evaluation rows are limited to
# labels that still occur in the reduced training frame.
if _minimize_for_ci():
  train_df = train_df.head(10)
  kept_labels = train_df.label.unique()
  test_df = test_df.loc[test_df.label.isin(kept_labels)].head(10)

# 3. Prepare Training

In [None]:
import torch
import torch.nn.functional as F
from torch.nn import Linear
from transformers import AutoModel
import torchmetrics


class TextClassificationModel(torch.nn.Module):
    """Pretrained "distilbert-base-uncased" encoder followed by one linear layer."""

    def __init__(self, num_labels: int):
        """Load the pretrained encoder and build a head with `num_labels` outputs."""
        super().__init__()
        self.feature_extractor = AutoModel.from_pretrained("distilbert-base-uncased")
        encoder_width = self.feature_extractor.config.hidden_size
        self.classifier = Linear(encoder_width, num_labels)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and return the classifier output (logits)."""
        encoder_output = self.feature_extractor(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # Keep only index 0 of the second axis - presumably the first ([CLS])
        # token of each sequence; confirm against the encoder's output layout.
        first_position_state = encoder_output.last_hidden_state[:, 0]
        return self.classifier(first_position_state)

NUM_EPOCHS = 2

# One output per class name declared on the dataset's "label" feature.
num_labels = len(data['train'].features["label"].names)
model = TextClassificationModel(num_labels)

# Hand the optimizer only the parameters that require gradients.
trainable_params = (p for p in model.parameters() if p.requires_grad)
optimizer = torch.optim.AdamW(trainable_params, lr=1e-5)

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
val_acc = torchmetrics.Accuracy().to(device)
train_acc = torchmetrics.Accuracy().to(device)
loss_fn = F.cross_entropy
model.to(device)

def train_loop(model,dataloader,optimizer,loss_fn):
  """Run one training pass of `model` over `dataloader`.

  Relies on the module-level `device`, `train_acc` and `tqdm`.

  Args:
    model: module called as `model(input_ids, attention_mask)`.
    dataloader: yields dict batches with "input_ids", "label" and
      "attention_mask" entries (the entries are popped from each batch).
    optimizer: optimizer stepped once per batch.
    loss_fn: callable taking `(logits, labels)` and returning the loss.

  Returns:
    The mean batch loss of the pass (0.0 when the dataloader is empty).
    Previously the accumulated loss was computed and then discarded.
  """
  model.train()
  # Start the metric from scratch so it describes this pass only instead of
  # accumulating over every epoch.
  train_acc.reset()
  running_loss = 0.0
  num_batches = 0
  pbar = tqdm(dataloader)
  for batch in pbar:
    x = batch.pop("input_ids").to(device)
    y = batch.pop("label").to(device)
    attention_mask = batch.pop("attention_mask").to(device)
    # zero the parameter gradients
    optimizer.zero_grad()
    # forward + backward + optimize
    logits = model(x, attention_mask)
    loss = loss_fn(logits, y)
    loss.backward()
    optimizer.step()
    train_acc(logits.argmax(1), y)
    running_loss += loss.item()
    num_batches += 1
    # Surface the running mean loss on the progress bar.
    pbar.set_description("Training loss: %.3f" % (running_loss / num_batches))
  return running_loss / max(num_batches, 1)

def test_loop(model, dataloader, loss_fn, epoch=None):
  """Evaluate `model` on `dataloader` and print the accuracy of this pass.

  Relies on the module-level `device`, `val_acc` and `tqdm`.

  Args:
    model: module called as `model(input_ids, attention_mask)`.
    dataloader: yields dict batches with "input_ids", "label" and
      "attention_mask" entries (the entries are popped from each batch).
    loss_fn: callable taking `(logits, labels)` and returning the loss.
    epoch: zero-based epoch index, used only in the progress-bar text. When
      omitted, a module-level `epoch` variable is used if one exists (this is
      what existing callers rely on), otherwise 0.
  """
  if epoch is None:
    # Backward compatible with callers that leave `epoch` as a global, but no
    # longer a NameError when the function is called outside such a loop.
    epoch = globals().get("epoch", 0)
  model.eval()
  # Without a reset the metric keeps the counts of earlier passes, so the
  # accuracy reported for epoch N would mix in epochs 0..N-1.
  val_acc.reset()
  # Sum (not mean) of the batch losses seen so far.
  validation_loss = 0.0
  pbar = tqdm(dataloader)
  for batch in pbar:
    x = batch.pop("input_ids").to(device)
    y = batch.pop("label").to(device)
    attention_mask = batch.pop("attention_mask").to(device)
    with torch.no_grad():
      logits = model(x, attention_mask)
      loss = loss_fn(logits, y)
    val_acc(logits.argmax(1), y)
    validation_loss += loss.item()
    pbar.set_description("[epoch %d] Validation loss: %.3f" % (epoch + 1, validation_loss))
  # Printed rather than set on the bar: the bar is already exhausted here, and
  # a description set on a finished tqdm bar may never be rendered.
  print(f"Val accuracy: {val_acc.compute()}")

## 4. Monitor with Galileo

After simply logging the original dataset with the following columns:
```id,text,label```

We can hook the dataquality client in our model and dataloaders.

```python
from dataquality.integrations.torch import watch
watch(model, [train_dataloader, test_dataloader])
```



In [None]:
import dataquality as dq
from dataquality.integrations.torch import watch
# 🔭🌕 Logging the datasets with Galileo
# The rows of data["test"] are logged under the "validation" split.
for split_name, split_data in (("train", data["train"]), ("validation", data["test"])):
    dq.log_dataset(split_data, split=split_name)

# 🔭🌕 Setting the label names for the run
dq.set_labels_for_run(data["train"].features["label"].names)
# 🔭🌕 Monitor the model with Galileo
watch(model, [train_dataloader, test_dataloader])

# 5. Training

After hooking into the model we start the training process.
At the start of each epoch we call `dq.set_epoch(epoch)`, and before each pass over a split we call `dq.set_split(...)`, e.g. `dq.set_split("training")`.

In [None]:
from tqdm import tqdm
# Per epoch: one training pass followed by one evaluation pass, telling
# Galileo the current epoch and split before each pass.
# NOTE: keep the loop variable named `epoch` - test_loop reads it as a global
# when building its progress-bar description.
for epoch in range(NUM_EPOCHS):
    dq.set_epoch(epoch) # 🔭🌕 Setting the epoch
    dq.set_split("training") # 🔭🌕 Setting split to training
    train_loop(model,train_dataloader,optimizer,loss_fn)
    dq.set_split("validation") # 🔭🌕 Setting split to validation
    test_loop(model, test_dataloader, loss_fn)
print("Finished Training")

## 5. Finish and upload data

Upload your training data with:
```python
dq.finish()
```

In [None]:
# 🔭🌕 Finish the run; the returned mapping carries the console link.
finished_run = dq.finish()
clear_output()
run_link = finished_run['link']
print(f"Project link: {run_link}")
print("Preview of misclassified data")
# NOTE(review): the frame shown is the head of the whole "train" split, with
# no filtering to misclassified rows - confirm the heading above.
train_results = dq.metrics.get_dataframe(project_name, run_name, "train")
train_results.head()

# General Help and Docs
- To get help with your task's requirements, call `dq.get_data_logger().doc()`
- To see more general data and model logging docs, run `dq.docs()`

In [None]:
# Show the documentation of the current data logger.
data_logger = dq.get_data_logger()
data_logger.doc()
# Show the signature and docstring of dq.log_dataset.
help(dq.log_dataset)