# Fine Tuning in Torch


In [43]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModel, TrainingArguments, AutoModelForSequenceClassification, Trainer
from datasets import load_dataset
import torch 

In [2]:
# Load the GLUE/MRPC dataset and build the tokenizer, padding collator,
# training arguments and classification model from one checkpoint name.

checkpoint = 'bert-base-uncased'
dataset = load_dataset(path='glue', name='mrpc')

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collector_padding = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(output_dir="test-trainer")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

print(dataset)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [3]:
# Pull out the training split, report its size and show the first 3 rows.
train_dataset = dataset['train']
print(train_dataset.num_rows)
train_dataset[:3]

3668


{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .'],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale ."],
 'label': [1, 0, 1],
 'idx': [0, 1, 2]}

In [4]:
# Tokenization helper meant to be passed to `Dataset.map`.

def tockeinzer_df (dataset_to_tockenize):
    """Tokenize the `sentence1` / `sentence2` fields as a sentence pair.

    Passes both fields to the notebook-level `tokenizer` with
    `truncation=True` and returns the tokenizer's result unchanged.
    """
    first_sentences = dataset_to_tockenize['sentence1']
    second_sentences = dataset_to_tockenize['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [5]:
# batched=True passes slices of rows to the tokenizer in one call instead of
# one row at a time; the resulting columns are the same, it is only faster.
# This matches the `map(..., batched=True)` calls used later in the notebook.
tain_data_tockenized = train_dataset.map(tockeinzer_df, batched=True)

In [6]:
# List the columns available on a slice of the tokenized training set.
tain_data_tockenized[:3].keys()

dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])

In [7]:
# Tokenize the validation split and show it before and after.
valid_dataset = dataset['validation']
print(valid_dataset)
# batched=True tokenizes slices of rows per call rather than row by row;
# the produced columns are unchanged.
valid_dataset_tok = valid_dataset.map(tockeinzer_df, batched=True)
print(valid_dataset_tok)
valid_dataset_tok.column_names

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 408
})
Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 408
})


['sentence1',
 'sentence2',
 'label',
 'idx',
 'input_ids',
 'token_type_ids',
 'attention_mask']

In [8]:
# Assemble the Trainer from the model, arguments, tokenized splits,
# padding collator and tokenizer created in the cells above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collector_padding,
    train_dataset=tain_data_tockenized,
    eval_dataset=valid_dataset_tok,
    tokenizer=tokenizer,
)

In [23]:

# Run fine-tuning with the Trainer configured above; the returned
# TrainOutput is displayed as the cell result.
trainer.train()



  0%|          | 0/1377 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.6332, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}
{'loss': 0.4943, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}
{'train_runtime': 821.5063, 'train_samples_per_second': 13.395, 'train_steps_per_second': 1.676, 'train_loss': 0.5141200756109012, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.5141200756109012, metrics={'train_runtime': 821.5063, 'train_samples_per_second': 13.395, 'train_steps_per_second': 1.676, 'train_loss': 0.5141200756109012, 'epoch': 3.0})

## Evaluation on Validation set

In [24]:
# Run the fine-tuned model over the tokenized validation split and report
# the shape of the returned prediction array.
prediction = trainer.predict(test_dataset=valid_dataset_tok)
print(prediction.predictions.shape)

  0%|          | 0/51 [00:00<?, ?it/s]

(408, 2)


In [30]:

# Labels returned alongside the predictions; used below as the references
# when computing the metric.
prediction.label_ids

array([1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,

In [31]:
import numpy as np

# Pick, for every row, the index of the largest value along the last axis
# of the prediction array; that index is the predicted class.
preds = prediction.predictions.argmax(axis=-1)
preds

array([1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,

In [35]:
import evaluate

# Load the GLUE/MRPC metric and score the argmax predictions against the
# labels returned by `trainer.predict`.
metric = evaluate.load("glue", "mrpc")
metric.compute(
    references=prediction.label_ids,
    predictions=preds,
)

{'accuracy': 0.8235294117647058, 'f1': 0.8754325259515572}

In [9]:
import evaluate
import numpy as np

# Load the GLUE/MRPC metric once; `evaluate_def` runs at every evaluation
# pass, so loading inside the function would repeat the work each time.
mrpc_metric = evaluate.load('glue', 'mrpc')


def evaluate_def(eval_prediction):
    """Compute the GLUE/MRPC metric for one evaluation pass.

    Args:
        eval_prediction: object that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``mrpc_metric.compute`` returns for the argmax of the
        logits (taken over the last axis) against ``labels``.
    """
    logits, labels = eval_prediction
    predicted_labels = np.argmax(logits, axis=-1)
    return mrpc_metric.compute(predictions=predicted_labels, references=labels)


training_args = TrainingArguments('test-trainer', evaluation_strategy='epoch')
model = AutoModelForSequenceClassification.from_pretrained (checkpoint, num_labels=2)

# Use the device the training arguments resolve to instead of a hard-coded
# 'cuda', so the cell also runs on a machine without a GPU.
model.to(training_args.device)


trainer = Trainer (
    model,
    training_args,
    train_dataset = tain_data_tockenized,
    eval_dataset  = valid_dataset_tok,
    data_collator=data_collector_padding,
    tokenizer=tokenizer,
    compute_metrics=evaluate_def
)

trainer.train()
# Move the model back to the CPU after training; the returned module is
# displayed as the cell result.
model.to (torch.device('cpu'))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1377 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.3995857238769531, 'eval_accuracy': 0.8455882352941176, 'eval_f1': 0.8852459016393444, 'eval_runtime': 18.0861, 'eval_samples_per_second': 22.559, 'eval_steps_per_second': 2.82, 'epoch': 1.0}
{'loss': 0.5114, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.5006976127624512, 'eval_accuracy': 0.8504901960784313, 'eval_f1': 0.8981636060100167, 'eval_runtime': 13.5169, 'eval_samples_per_second': 30.184, 'eval_steps_per_second': 3.773, 'epoch': 2.0}
{'loss': 0.3037, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.591132402420044, 'eval_accuracy': 0.8602941176470589, 'eval_f1': 0.9012131715771231, 'eval_runtime': 15.2928, 'eval_samples_per_second': 26.679, 'eval_steps_per_second': 3.335, 'epoch': 3.0}
{'train_runtime': 735.9119, 'train_samples_per_second': 14.953, 'train_steps_per_second': 1.871, 'train_loss': 0.33517682941910487, 'epoch': 3.0}


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [10]:
# An earlier cell moved the model to the CPU, so put it back on the device
# the Trainer uses. `training_args.device` replaces the hard-coded 'cuda'
# so this also works without a GPU.
model.to(training_args.device)

# NOTE: this trains the same `model` object again, i.e. it continues from
# the already fine-tuned weights rather than from the original checkpoint.
trainer.train()

  0%|          | 0/1377 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.8026039004325867, 'eval_accuracy': 0.8480392156862745, 'eval_f1': 0.8904593639575972, 'eval_runtime': 18.6593, 'eval_samples_per_second': 21.866, 'eval_steps_per_second': 2.733, 'epoch': 1.0}
{'loss': 0.1647, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.8136206865310669, 'eval_accuracy': 0.8578431372549019, 'eval_f1': 0.8986013986013986, 'eval_runtime': 38.9784, 'eval_samples_per_second': 10.467, 'eval_steps_per_second': 1.308, 'epoch': 2.0}
{'loss': 0.0837, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


  0%|          | 0/51 [00:00<?, ?it/s]

{'eval_loss': 0.9537803530693054, 'eval_accuracy': 0.8504901960784313, 'eval_f1': 0.8924162257495591, 'eval_runtime': 11.251, 'eval_samples_per_second': 36.263, 'eval_steps_per_second': 4.533, 'epoch': 3.0}
{'train_runtime': 730.3431, 'train_samples_per_second': 15.067, 'train_steps_per_second': 1.885, 'train_loss': 0.0963304640161186, 'epoch': 3.0}


TrainOutput(global_step=1377, training_loss=0.0963304640161186, metrics={'train_runtime': 730.3431, 'train_samples_per_second': 15.067, 'train_steps_per_second': 1.885, 'train_loss': 0.0963304640161186, 'epoch': 3.0})

# Training Without the Trainer API

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_scheduler
import torch

# Checkpoint name shared by the tokenizer and the model below.
checkpoint = 'bert-base-uncased'

# Raw (untokenized) GLUE/MRPC splits.
raw_dataset = load_dataset(path='glue', name='mrpc')

In [2]:
# Raw splits used for manual training and validation.
train_dataset = raw_dataset['train']
valid_dataset = raw_dataset['validation']

# Tokenizer and padding collator built from the shared checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collector_padding = DataCollatorWithPadding(tokenizer=tokenizer)

# Two-label sequence-classification model on top of the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# NOTE: this is the bound method itself (it is not called), so the cell
# displays the method's repr, which includes the whole module structure.
model.parameters

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<bound method Module.parameters of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

In [3]:
# Show the first three raw training examples.
train_dataset[:3]

{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
  "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
  'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .'],
 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
  "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
  "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale ."],
 'label': [1, 0, 1],
 'idx': [0, 1, 2]}

In [3]:
def to_tokenize (dataset):
    """Tokenize the `sentence1` / `sentence2` fields as a sentence pair.

    Both fields are handed to the notebook-level `tokenizer` with
    `truncation=True`; the tokenizer's result is returned as-is.
    """
    first_sentences = dataset['sentence1']
    second_sentences = dataset['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [4]:
# Tokenize both splits in batches, then display the tokenized training split.
train_tockenized = train_dataset.map(function=to_tokenize, batched=True)
valid_tockenized = valid_dataset.map(function=to_tokenize, batched=True)
train_tockenized


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [5]:
# Remove the extra columns from the train and validation datasets and
# rename `label` to `labels`.

train_tock_final = (
    train_tockenized
    .remove_columns(['sentence1', 'sentence2', 'idx'])
    .rename_column('label', 'labels')
)
valid_tock_fianl = (
    valid_tockenized
    .remove_columns(['sentence1', 'sentence2', 'idx'])
    .rename_column('label', 'labels')
)
train_tock_final

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

Let's try to tokenize and rename the columns of all splits at once, starting from the raw dataset.

In [7]:
# Apply the batched tokenizer to every split of the raw DatasetDict.
token_dataset = raw_dataset.map(function=to_tokenize, batched=True)
token_dataset

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [12]:
# Drop the text/index columns, rename `label` to `labels`, and switch the
# output format to torch for every split.
fianl_dataset = token_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])
fianl_dataset = fianl_dataset.rename_column('label', 'labels')
fianl_dataset.set_format('torch')
fianl_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [6]:
# Wrap the cleaned splits in DataLoaders from torch.utils.
# NOTE: `train_dataset` / `valid_dataset` are rebound here from the raw
# dataset splits to DataLoader objects.
from torch.utils.data import DataLoader

train_dataset = DataLoader(
    train_tock_final,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collector_padding,
)

valid_dataset = DataLoader(
    valid_tock_fianl,
    batch_size=8,
    collate_fn=data_collector_padding,
)

# Number of batches per epoch.
len(train_dataset)


459

In [15]:
# Grab one batch from the training loader and inspect its keys and the
# shape of each entry.
batch = next(iter(train_dataset))

print(batch.keys())

dict((name, tensor.shape) for name, tensor in batch.items())

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 72]),
 'token_type_ids': torch.Size([8, 72]),
 'attention_mask': torch.Size([8, 72])}

In [16]:
# Forward the sample batch through the model on the CPU and display the
# resulting output object.
model.to('cpu')
output = model(**batch)
output

SequenceClassifierOutput(loss=tensor(0.6648, grad_fn=<NllLossBackward0>), logits=tensor([[-0.6965, -0.0207],
        [-0.7268, -0.0159],
        [-0.7273, -0.0391],
        [-0.7150, -0.0420],
        [-0.7209, -0.0421],
        [-0.7176, -0.0350],
        [-0.7223, -0.0416],
        [-0.7038, -0.0153]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [7]:
# AdamW over all model parameters with a 5e-5 learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)

In [18]:
# Total optimizer steps = epochs x batches per epoch.
epochs = 3
training_steps = len(train_dataset) * epochs

# 'linear' schedule with no warmup steps, spanning all training steps.
scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_training_steps=training_steps,
    num_warmup_steps=0,
)

scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x1d13f704550>

In [19]:
# Use CUDA when it is available, otherwise fall back to the CPU, and move
# the model there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [None]:
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(training_steps))

# Put the model on the same `device` the batches are sent to below. The
# previous CPU -> hard-coded 'cuda' round trip was redundant and failed on
# machines without a GPU.
model.to(device)

model.train()
for epoch in range(epochs):
    for batch in train_dataset:
        # Move every tensor of the batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear gradients
        # before the next batch.
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# Move the model back to the CPU; the returned module is displayed as the
# cell result.
model.to(torch.device('cpu'))

  0%|          | 0/1377 [00:00<?, ?it/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [9]:
from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_scheduler
from tqdm.auto import tqdm

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# `transformers.AdamW` is deprecated and no longer importable in recent
# transformers releases, so use the PyTorch optimizer, as the manual loop
# earlier in this notebook does. NOTE: torch's AdamW defaults
# (e.g. weight_decay) are not identical to the old transformers class.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# Let Accelerate wrap the loaders, model and optimizer; the loop below then
# needs no explicit `.to(device)` calls.
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataset, valid_dataset, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator instead of loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1377 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
