# 🔭 Text Classification using Huggingface Trainer  🔭

In this tutorial, we'll train a model with Huggingface's transformers and explore the results in Galileo.

**Make sure to select GPU in your Runtime! (Runtime -> Change Runtime type)**

In [None]:
#@title Install `dataquality`
# Only install when `dataquality` cannot be imported yet, so re-running the
# cell after the restart below is a no-op.
try:
    import dataquality as dq
except ImportError:
    # Upgrade pip (output is discarded)
    !pip install -U pip &> /dev/null
    # Install/upgrade dataquality together with the Hugging Face datasets,
    # evaluate and transformers packages used in this notebook (output is discarded)
    !pip install -U dataquality datasets evaluate transformers &> /dev/null
    
    print('👋 Installed necessary libraries and restarting runtime! This should only need to happen once.')
    print('🙏 Continue with the rest of the notebook or hit "Run All" again!')

    # Restart the runtime so the freshly installed packages are picked up
    import os, time
    time.sleep(1) # gives the print statements time to flush
    os._exit(0) # exits without allowing the next cell to run

# 1. Login to Galileo

In [None]:
# Log in to Galileo; `dq` is the module imported in the install cell.
dq.login()

# 2. Load Data

In [None]:
#@title 🤗 HuggingFace Dataset
#@markdown You can select any dataset from [here](https://huggingface.co/datasets?language=language:en&task_categories=task_categories:text-classification&task_ids=task_ids:multi-class-classification&sort=downloads).

from datasets import load_dataset
from IPython.utils import io

# Colab form field: choose one of the listed datasets or type another name.
dataset_name = 'emotion' #@param ["banking77", "emotion", "ag_news"] {allow-input: true}
print(f"You selected the {dataset_name} dataset")

# Load every split of the chosen dataset while capturing the cell output
# produced by the loader, so it does not clutter the notebook.
with io.capture_output() as captured:
    ds = load_dataset(dataset_name)

# 3. Initialize Galileo

In [None]:
# 🔭🌕 Galileo logging
# Initialise Galileo for a text-classification task; the run name embeds the
# dataset selected above.
dq.init(
    task_type="text_classification",
    project_name="text_classification_huggingface",
    run_name=f"example_run_{dataset_name}_01",
)

# 4. Log Input Data with Galileo
The input data needs an `id` column, which we add with the `datasets` `map` function. Afterwards, we log each split and the label names.

In [None]:
# 🔭🌕 Galileo preprocessing
# Add an "id" column to every split, holding each row's index within its split.
ds = ds.map(lambda x, idx: {"id": idx}, with_indices=True)

# We will train on a subset of the dataset to improve speed.
# The subset sizes are capped at the split length: the dataset name is a
# free-form field, and `select` fails on indices beyond the end of a split.
train_dataset = ds["train"].select(range(min(4000, len(ds["train"]))))
test_dataset = ds["test"].select(range(min(2000, len(ds["test"]))))

# 🔭🌕 Galileo logging
# The test subset is logged under the "validation" split.
dq.log_dataset(train_dataset, split="train")
dq.log_dataset(test_dataset, split="validation")
dq.set_labels_for_run(train_dataset.features['label'].names)


# 5. Putting into Action: Training a Model

We complete the training pipeline with the Hugging Face `Trainer`. Instead of logging the current `epoch` and `split` by hand, we pass the trainer to Galileo's `watch(trainer)` integration before training starts. To complete logging, we call `dq.finish()` after training.

In [None]:
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
import evaluate

# NOTE: `datasets.load_metric` is deliberately not imported. It was unused here
# and has been removed from recent `datasets` releases, so importing it would
# fail after the `pip install -U datasets` in the install cell. The metric is
# loaded through `evaluate` below.

# One output class per label name declared by the dataset.
num_labels = len(ds["train"].features["label"].names)

# Checkpoint used for both the tokenizer and the classification model.
model_checkpoint = "microsoft/xtremedistil-l6-h256-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Metric object used by `compute_metrics`.
metric = evaluate.load("accuracy")


# Taken from the docs of the trainer module:
#https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py#L434
def preprocess_function(input_data, tokenizer, max_length=201):
    """Tokenize the ``"text"`` entry of ``input_data`` to a fixed length.

    Args:
        input_data: Mapping with a ``"text"`` entry (a single example or a
            batch, depending on how the caller maps over the dataset).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., max_length=..., truncation=...)``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 201, the value previously hard-coded here.

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        input_data["text"],
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max (along axis 1) of
        the predictions against the labels.
    """
    scores, references = eval_pred
    # Collapse the per-class scores to the index of the highest-scoring class.
    predicted_classes = scores.argmax(axis=1)
    return metric.compute(predictions=predicted_classes, references=references)


def _tokenize_batch(batch):
    """Tokenize one batch with the notebook-level ``tokenizer``."""
    return preprocess_function(batch, tokenizer)


# Tokenize both subsets batch-wise with the same helper.
encoded_train_dataset = train_dataset.map(_tokenize_batch, batched=True)
encoded_test_dataset = test_dataset.map(_tokenize_batch, batched=True)

# Load the checkpoint with a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

# Training arguments and training part
batch_size = 64
metric_name = "accuracy"

# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the best `metric_name` is reloaded when training ends.
args = TrainingArguments(
    output_dir="finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    report_to="all",
    # Fixed seeds for reproducible runs.
    seed=42,
    data_seed=42,
)

# Wire the model, arguments, tokenized subsets and metric function together.
# The tokenized test subset serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# 🔭🌕 Galileo logging
from dataquality.integrations.transformers_trainer import watch

# Hand the trainer to the Galileo integration before training is started.
watch(trainer)

In [None]:
# Run training with the arguments configured above (2 epochs).
trainer.train()


In [None]:
# 🔭🌕 Galileo logging: called after training to complete logging for this run
dq.finish()

# General Help and Docs
- To get help with your task's requirements, call `dq.get_data_logger().doc()`
- To see more general data and model logging docs, run `dq.docs()`

In [None]:
# Show help on the current task's logging requirements.
dq.get_data_logger().doc()
# Show Python's built-in help for `dq.log_dataset`.
help(dq.log_dataset)