<a href="https://colab.research.google.com/github/srabinarayan/LLM---Finetuning/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



### Fine tuning Language Modelling


In [84]:
!pip install transformers datasets



[Auto Model API](https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#automodel)


*   transformer -> model -> pretrain -> BERT -> question answering , text classification
*   transformer -> transformer -> TFAutomodel ->low level authority




In [85]:
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_dataset
from transformers import TrainingArguments, Trainer

### Loading Pre Trained Model

In [88]:
squad = load_dataset("squad")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Inferencing without Tuning

In [78]:
text = "Which NFL team represented the AFC at Super Bowl 50?"
context = "Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the 'golden anniversary' with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as 'Super Bowl L'), so that the logo could prominently feature the Arabic numerals 50."
token_encoding = tokenizer(text,context,return_tensors="pt")
input_ids = token_encoding['input_ids']
attention_mask = token_encoding['attention_mask']
token_encoding

{'input_ids': tensor([[  101,  2029,  5088,  2136,  3421,  1996, 10511,  2012,  3565,  4605,
          2753,  1029,   102,  3565,  4605,  2753,  2001,  2019,  2137,  2374,
          2208,  2000,  5646,  1996,  3410,  1997,  1996,  2120,  2374,  2223,
          1006,  5088,  1007,  2005,  1996,  2325,  2161,  1012,  1996,  2137,
          2374,  3034,  1006, 10511,  1007,  3410,  7573, 14169,  3249,  1996,
          2120,  2374,  3034,  1006, 22309,  1007,  3410,  3792, 12915,  2484,
          1516,  2184,  2000,  7796,  2037,  2353,  3565,  4605,  2516,  1012,
          1996,  2208,  2001,  2209,  2006,  2337,  1021,  1010,  2355,  1010,
          2012, 11902,  1005,  1055,  3346,  1999,  1996,  2624,  3799,  3016,
          2181,  2012,  4203, 10254,  1010,  2662,  1012,  2004,  2023,  2001,
          1996, 12951,  3565,  4605,  1010,  1996,  2223, 13155,  1996,  1005,
          3585,  5315,  1005,  2007,  2536,  2751,  1011, 11773, 11107,  1010,
          2004,  2092,  2004,  8184, 2

In [79]:
## Inferencing
with torch.no_grad(): # Means gradient calculation will not be done
  output = model(input_ids,attention_mask)
answer_start_index = output.start_logits.argmax()
answer_end_index = output.end_logits.argmax()
predict_answer_tokens = token_encoding.input_ids[0,answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

's stadium in the san francisco bay area at santa'

### Fine tuning

In [75]:
squad['validation'][0]

{'id': '56be4db0acb8001400a502ec',
 'title': 'Super_Bowl_50',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'question': 'Which NFL team represented the AFC at Super Bowl 50?',
 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],


In [None]:
# Tokenize the data
questions = squad['validation'][0]["question"].strip()
inputs = tokenizer(
        questions,
        squad['validation'][0]["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        padding="max_length",
  )
offset_mapping = inputs
answers = squad['validation'][0]["answers"]
start_positions = []
end_positions = []
questions,inputs,answers

In [87]:
def preprocess_function(examples):
    # questions = [q.strip() for q in examples["question"]]
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        max_length=384,
        stride = 128,
        truncation="only_second", # truncate the context not the question
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop('overflow_to_sample_mapping')
    offset_mapping = tokenized_examples.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):

      input_ids = tokenized_examples['input_ids'][i]
      cls_index = input_ids.index(tokenizer.cls_token_id)
      sequence_ids = tokenized_examples.sequence_ids(i)

      sample_index = sample_mapping[i]
      answer = examples["answers"][sample_index]

      if len(answer["answer_start"]) == 0:
        start_positions.append(cls_index)
        end_positions.append(cls_index)
      else:
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])


        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        if not (offset[token_start_index][0] <= start_char and offset[token_end_index][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
          while token_start_index < len(offset) and offset[token_start_index][0] <= start_char:
            token_start_index += 1
          start_positions.append(token_start_index - 1)
          while offset[token_end_index][1] >= end_char:
            token_end_index -= 1
          end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions


    return tokenized_examples

In [89]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [98]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator() # create a batch of examples

# Initialise Model configuration
trainig_args = TrainingArguments(
    output_dir='./finetune-bert-uncased',          # output directory
    eval_strategy="epoch",
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    report_to="none"
)
# Initialise Model trainer object
trainer = Trainer(
    model = model,
    args = trainig_args,
    train_dataset = tokenized_squad['train'].select(range(100)),
    eval_dataset = tokenized_squad['validation'].select(range(100)),
    data_collator = data_collator,
    # tokenizer = tokenizer
)

In [99]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,3.930314


TrainOutput(global_step=13, training_loss=4.535487835223858, metrics={'train_runtime': 627.516, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.021, 'total_flos': 19597256755200.0, 'train_loss': 4.535487835223858, 'epoch': 1.0})