In [6]:
!pip install -q datasets

# Load and prepare dataset

In [7]:
from datasets import load_dataset
dataset = load_dataset("imdb")

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [10]:
dataset = {
    split: ds.shuffle(seed=42).select(range(int(0.25 * len(ds))))
    for split, ds in dataset.items()
}
dataset

{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 6250
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 6250
 }),
 'unsupervised': Dataset({
     features: ['text', 'label'],
     num_rows: 12500
 })}

In [11]:
dataset["train"][0]

{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...',
 'label': 1}

## Step 3: Preprocess the Data
Tokenize the dataset using the tokenizer associated with the pre-trained model.

In [13]:
from transformers import AutoTokenizer
from datasets import DatasetDict

model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(example):
  return tokenizer(example['text'],padding = 'max_length', truncation = True)

dataset = DatasetDict(dataset)
tokenize_dataset = dataset.map(tokenize_fn, batched = True)

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

In [14]:
# tokenize_dataset['train'][0]

## Step 4: Set Up the Training Arguments
Specify the hyperparameters and training settings.

In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    eval_strategy ="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=1,              # Number of training epochs
    weight_decay=0.01,               # Strength of weight decay
)

## Step 5: Initialize the Model
Load the pre-trained model and define the training procedure.

In [16]:
from transformers import AutoModelForSequenceClassification, Trainer

#initialize model
model  = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",num_labels = 2)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenize_dataset['train'],
    eval_dataset = tokenize_dataset['test'],
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step 6: Train the Model
Fine-tune the pre-trained model on your specific dataset.

In [17]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msinghsomendra[0m ([33msinghsomendra-google[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.214197


TrainOutput(global_step=391, training_loss=0.3116119462815697, metrics={'train_runtime': 854.395, 'train_samples_per_second': 7.315, 'train_steps_per_second': 0.458, 'total_flos': 1644444096000000.0, 'train_loss': 0.3116119462815697, 'epoch': 1.0})

# Step 7: Evaluate the Model

In [18]:
result = trainer.evaluate()
result

{'eval_loss': 0.21419695019721985,
 'eval_runtime': 198.9482,
 'eval_samples_per_second': 31.415,
 'eval_steps_per_second': 1.965,
 'epoch': 1.0}

In [19]:
# Save the model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-tokenizer')

('./fine-tuned-tokenizer/tokenizer_config.json',
 './fine-tuned-tokenizer/special_tokens_map.json',
 './fine-tuned-tokenizer/vocab.txt',
 './fine-tuned-tokenizer/added_tokens.json',
 './fine-tuned-tokenizer/tokenizer.json')