# Text Classification: Fine-tuning DistilBERT

#### Fine-tune DistilBERT on the IMDb dataset to determine whether a movie review is positive or negative.


In [4]:
# Load the IMDb movie-review dataset through 🤗 Datasets
from datasets import load_dataset

imdb = load_dataset(path="imdb")

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
# Peek at a single raw record (index 55) of the training split
# to see which fields each example carries.
imdb["train"][55]

{'text': 'If the term itself were not geographically and semantically meaningless, one might well refer to "Ned Kelly" as an "Australian Western." For the people Down Under, Ned Kelly was, apparently, a folk hero bandit akin to Robin Hood, Jesse James, Bonnie and Clyde, and Butch Cassidy and the Sundance Kid. The descendant of Irish immigrants, Kelly became a fugitive and an outlaw after he was falsely accused of shooting an Australian law officer, a crime for which his equally innocent mother was put into prison. To get back at the government for this mistreatment, Kelly, his brother Dan, and two other companions, became notorious bank robbers, winning over the hearts of many people in the countryside while striking a blow for justice in a land where Irish immigrants were often treated with disrespect and disdain by those who ran the country.<br /><br />Perhaps because we\'ve encountered this "gentleman bandit" scenario so many times in the past, "Ned Kelly" feels awfully familiar and

In [6]:
# Preprocess
# Load the DistilBERT tokenizer that will be used to preprocess the review text

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased",
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Create a preprocessing function to tokenize text and
# truncate sequences to be no longer than DistilBERT’s maximum input length:

def preprocess_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with truncation enabled.

    Returns whatever the module-level ``tokenizer`` produces for that text.
    """
    review_texts = examples['text']
    encoded = tokenizer(review_texts, truncation=True)
    return encoded

In [9]:
# Apply the preprocessing function to every split of the dataset with the
# 🤗 Datasets `map` method; `batched=True` hands it batches of examples
# instead of one example at a time.

tokenized_imdb = imdb.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [18]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; it is handed to the Trainer to assemble
# (and pad) batches of tokenized examples.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


### Evaluate

In [12]:
import evaluate

# Metric object whose `.compute(...)` is called by the metric function below.
accuracy = evaluate.load(path='accuracy')

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
# Metric function: turns the model's raw predictions into class ids and
# passes them, together with the labels, to `accuracy.compute`:

import numpy as np

def compute_matrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked into a ``(scores, labels)`` pair. The index of
    the largest score along axis 1 is taken as the predicted class, and the
    result of ``accuracy.compute`` for those predictions against the labels
    is returned.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_classes, references=gold_labels)
    return result

In [14]:
# Train
# Class-id -> label-name mapping handed to the model config, and its inverse.
id2label = {
    0: "NEGATIVE",
    1: "POSITIVE",
}
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# DistilBERT checkpoint wrapped for sequence classification with two labels;
# the id/label maps are passed along so they are stored with the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Fine-tuning hyperparameters. Evaluation and checkpoint saving both happen
# once per epoch, which `load_best_model_at_end=True` relies on to restore the
# best checkpoint when training finishes.
# NOTE(review): no `metric_for_best_model` is set, so "best" falls back to the
# library default (evaluation loss, per the TrainingArguments docs) rather than
# the accuracy computed by `compute_matrics` — confirm this is intended.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# NOTE(review): the IMDb "test" split is used as the evaluation set, so the
# checkpoint selection above is made on test data; the reported accuracy is
# therefore not a fully held-out estimate.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    # `tokenizer=` is deprecated in `Trainer.__init__`;
    # `processing_class=` is its replacement and takes the same tokenizer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_matrics,
)


# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2187,0.194509,0.924
2,0.1424,0.235622,0.93084


TrainOutput(global_step=3126, training_loss=0.20482150362762822, metrics={'train_runtime': 2405.0018, 'train_samples_per_second': 20.79, 'train_steps_per_second': 1.3, 'total_flos': 6556904415524352.0, 'train_loss': 0.20482150362762822, 'epoch': 2.0})

In [20]:
# Publish the trainer's model to the Hugging Face Hub; the returned commit
# info is displayed as the cell result.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/Proxiii/my_awesome_model/commit/ed7d92854f507e1f200485f853ec1af00de12745', commit_message='End of training', commit_description='', oid='ed7d92854f507e1f200485f853ec1af00de12745', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Proxiii/my_awesome_model', endpoint='https://huggingface.co', repo_type='model', repo_id='Proxiii/my_awesome_model'), pr_revision=None, pr_num=None)

### Inference

In [25]:
# Sample review text used as the inference input.
text = "This was the shittist movie, It was just okish."

In [26]:
from transformers import pipeline

# Build a sentiment-analysis pipeline from the model repo on the Hub,
# then classify the sample text (the prediction is the cell's output).
classifier = pipeline(
    task="sentiment-analysis",
    model="Proxiii/my_awesome_model",
)
classifier(text)

Device set to use cuda:0


[{'label': 'NEGATIVE', 'score': 0.9916760921478271}]