In [16]:
from huggingface_hub import notebook_login



In [17]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
from datasets import load_dataset

In [19]:
imdb = load_dataset("imdb")

In [20]:
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [21]:
from transformers import AutoTokenizer


In [22]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")



In [23]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [24]:
tokenized_imdb = imdb.map(preprocess_function, batched=True)

In [25]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
import evaluate

accuracy = evaluate.load("accuracy")

In [27]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [28]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}


In [29]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the GPU
model = model.to(device)

training_args = TrainingArguments(
    output_dir="imdb_fine_tuned_text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/3126 [00:00<?, ?it/s]

{'loss': 0.3201, 'grad_norm': 13.472294807434082, 'learning_rate': 1.6801023672424827e-05, 'epoch': 0.32}
{'loss': 0.2513, 'grad_norm': 6.4144287109375, 'learning_rate': 1.3602047344849649e-05, 'epoch': 0.64}
{'loss': 0.2253, 'grad_norm': 14.290349006652832, 'learning_rate': 1.0403071017274472e-05, 'epoch': 0.96}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.20295968651771545, 'eval_accuracy': 0.92216, 'eval_runtime': 246.333, 'eval_samples_per_second': 101.489, 'eval_steps_per_second': 6.345, 'epoch': 1.0}
{'loss': 0.1644, 'grad_norm': 11.528451919555664, 'learning_rate': 7.204094689699297e-06, 'epoch': 1.28}
{'loss': 0.1402, 'grad_norm': 16.22940444946289, 'learning_rate': 4.005118362124121e-06, 'epoch': 1.6}
{'loss': 0.146, 'grad_norm': 13.24781608581543, 'learning_rate': 8.061420345489445e-07, 'epoch': 1.92}


  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.23405487835407257, 'eval_accuracy': 0.93112, 'eval_runtime': 256.13, 'eval_samples_per_second': 97.607, 'eval_steps_per_second': 6.102, 'epoch': 2.0}
{'train_runtime': 1951.9579, 'train_samples_per_second': 25.615, 'train_steps_per_second': 1.601, 'train_loss': 0.2054647334630262, 'epoch': 2.0}


TrainOutput(global_step=3126, training_loss=0.2054647334630262, metrics={'train_runtime': 1951.9579, 'train_samples_per_second': 25.615, 'train_steps_per_second': 1.601, 'total_flos': 6556904415524352.0, 'train_loss': 0.2054647334630262, 'epoch': 2.0})

In [70]:
text = "best best best best best best best worst"

In [71]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model="./imdb_fine_tuned_text_classification_model")
classifier(text)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'NEGATIVE', 'score': 0.5027973651885986}]