<a href="https://colab.research.google.com/github/roldanjorge/posts/blob/main/hf_bert_seq_class/sequence_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
This script demonstrates the pipeline for sequence classification using Huggingface transformers.
"""
import os
import torch
from transformers import AutoTokenizer, BertForSequenceClassification


def get_model_tokenizer(checkpoint: str, output_dir: str) -> (AutoTokenizer, BertForSequenceClassification):
    """ Download or load from local and return the model and its tokenizer

    If ``output_dir`` does not exist, the model and tokenizer are downloaded
    from ``checkpoint`` and saved into ``output_dir``. If it already exists,
    both are loaded from the files stored in ``output_dir``.

    Args:
        checkpoint: Huggingface checkpoint
        output_dir: Directory to store model and tokenizer file

    Returns:
        tokenizer: Tokenizer object
        model: Model object
    """
    if not os.path.exists(output_dir):
        print(f"Model directory {output_dir} does not exist. It will be downloaded from Huggingface")

        # Download first: if this fails, no empty directory is left behind
        # that a later run would mistake for a valid local copy.
        model = BertForSequenceClassification.from_pretrained(checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)

        # Store model and tokenizer in output_dir
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
    else:
        print(f"Model {output_dir} stored locally. This local version will be loaded")
        # Load from the local directory rather than going back to the checkpoint.
        model = BertForSequenceClassification.from_pretrained(output_dir)
        tokenizer = AutoTokenizer.from_pretrained(output_dir)

    return tokenizer, model


def run_pipeline(utterance: str, tokenizer, model: BertForSequenceClassification):
    """ Run the pipeline for the sequence classification task

    Prints the intermediate result of each of the three stages
    (tokenization, model forward pass, softmax over the logits) and
    returns nothing.

        Args:
            utterance: Input text
            tokenizer: Tokenizer object
            model: Model object
    """
    banner = 50 * '='
    rule = 50 * '-'
    print(f"\n{banner}\nRunning pipeline: \"{utterance}\"\n{banner}")

    # Stage 1: Preprocessing
    print(f"{rule}\nStage 1: Preprocessing \n{rule}")
    inputs = tokenizer(utterance, return_tensors="pt")
    for _input, value in inputs.items():
        print(f"{_input:<15}: \n\t{value}")

    # Stage 2: Model inference
    print(f"\n{rule}\nStage 2: Model inference \n{rule}")
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits
    print(f"logits: \n\t{logits}")

    # Stage 3: Post-processing
    print(f"\n{rule}\nStage 3: Post-processing \n{rule}")
    # Softmax over the last dimension turns the logits into probabilities.
    predictions = torch.nn.functional.softmax(logits, dim=-1)
    print(f"probabilities: \n\t{predictions}")
    print(f"id2label: \n\t{model.config.id2label}")
    print("predictions:")
    # Only row 0 is reported; assumes a single utterance was passed in.
    for _id, label in model.config.id2label.items():
        print(f"\t{label:<7}:\t{round(float(predictions[0][_id]), 3)}")


def main():
    """ Load the sentiment model and run the pipeline on two sample reviews """
    # Setup tokenizer and model
    tokenizer, model = get_model_tokenizer(
        checkpoint="nlptown/bert-base-multilingual-uncased-sentiment",
        output_dir='hf_bert_seq_class/model',
    )

    sample_reviews = (
        "I really loved that movie",  # Positive review
        "I hate very cold, and cloudy winter days",  # Negative review
    )
    for review in sample_reviews:
        run_pipeline(utterance=review, tokenizer=tokenizer, model=model)

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


Model directory hf_bert_seq_class/model does not exist. It will be downloaded from Huggingface


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]


Running pipeline: "I really loved that movie"
--------------------------------------------------
Stage 1: Preprocessing 
--------------------------------------------------
input_ids      : 
	tensor([[  101,   151, 25165, 46747, 10203, 13113,   102]])
token_type_ids : 
	tensor([[0, 0, 0, 0, 0, 0, 0]])
attention_mask : 
	tensor([[1, 1, 1, 1, 1, 1, 1]])

--------------------------------------------------
Stage 2: Model inference 
--------------------------------------------------
logits: 
	tensor([[-2.3669, -2.2634, -0.4449,  1.5619,  2.7230]])

--------------------------------------------------
Stage 3: Preprocessing 
--------------------------------------------------
probabilities: 
	tensor([[0.0045, 0.0050, 0.0308, 0.2289, 0.7309]])
id2label: 
	{0: '1 star', 1: '2 stars', 2: '3 stars', 3: '4 stars', 4: '5 stars'}
predictions:
	1 star :	0.005
	2 stars:	0.005
	3 stars:	0.031
	4 stars:	0.229
	5 stars:	0.731

Running pipeline: "I hate very cold, and cloudy winter days"
-------------------