<a href="https://colab.research.google.com/github/SpencerPao/Natural-Language-Processing/blob/main/Question%20Answering%20Modeling/Question_Answering_Modeling_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers
!pip install datasets
!pip install transformers[torch] accelerate -U
!pip install evaluate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00

In [4]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Why is model conversion important?',
    'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
}
res = nlp(QA_input)

model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


# RoBERTa

In [5]:
from datasets import load_dataset
squad = load_dataset("squad")

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [6]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_dataset

sampled_train_dataset = squad["train"].shuffle(seed=42).select(range(10000))
train_dataset, val_dataset = train_test_split(sampled_train_dataset, test_size=0.2, random_state=42)

train_dataset = Dataset.from_dict(train_dataset)
val_dataset = Dataset.from_dict(val_dataset)

tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

In [17]:
from transformers import RobertaConfig
config = RobertaConfig.from_pretrained(model_name)
config.hidden_dropout_prob = 0.1 
config.attention_probs_dropout_prob = 0.1  


# GPU TRAINING


In [18]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_name, config=config)

In [10]:
import numpy as np

# PERBANDINGAN 1 (PAKAI LOAD_METRICS)

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_dataset, load_metric
from transformers import AdamW, get_scheduler, TrainingArguments, Trainer
import numpy as np
import evaluate

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

def compute_metrics(p):
    start_logits, end_logits = p.predictions
    start_pred = np.argmax(start_logits, axis=1)
    end_pred = np.argmax(end_logits, axis=1)

    true_start, true_end = p.label_ids
    start_accuracy = accuracy_metric.compute(predictions=start_pred, references=true_start)
    end_accuracy = accuracy_metric.compute(predictions=end_pred, references=true_end)

    start_precision = precision_metric.compute(predictions=start_pred, references=true_start, average='macro')
    end_precision = precision_metric.compute(predictions=end_pred, references=true_end, average='macro')

    start_recall = recall_metric.compute(predictions=start_pred, references=true_start, average='macro')
    end_recall = recall_metric.compute(predictions=end_pred, references=true_end, average='macro')

    start_f1 = f1_metric.compute(predictions=start_pred, references=true_start, average='macro')
    end_f1 = f1_metric.compute(predictions=end_pred, references=true_end, average='macro')

    combined_accuracy = (start_accuracy['accuracy'] + end_accuracy['accuracy']) / 2
    combined_precision = (start_precision['precision'] + end_precision['precision']) / 2
    combined_recall = (start_recall['recall'] + end_recall['recall']) / 2
    combined_f1 = (start_f1['f1'] + end_f1['f1']) / 2

    return {
        "accuracy": combined_accuracy,
        "precision": combined_precision,
        "recall": combined_recall,
        "f1": combined_f1,
    }

optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(tokenized_train) * training_args.num_train_epochs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1645,1.379646,0.74375,0.722973,0.719699,0.707256
2,0.2144,1.161762,0.7515,0.720218,0.71821,0.70407
3,0.2527,1.570169,0.74075,0.720861,0.71816,0.703419
4,0.1941,1.404456,0.7475,0.721226,0.718407,0.704522
5,0.1674,1.616174,0.74,0.712178,0.706691,0.695015


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

TrainOutput(global_step=2500, training_loss=0.19861561889648438, metrics={'train_runtime': 3190.1145, 'train_samples_per_second': 12.539, 'train_steps_per_second': 0.784, 'total_flos': 7838902702080000.0, 'train_loss': 0.19861561889648438, 'epoch': 5.0})

# PERBANDINGAN 2 (PAKAI SKLEARN.METRICS)

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_dataset, load_metric
from transformers import AdamW, get_scheduler, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

accuracy_metric = load_metric("accuracy")

def compute_metrics(p):
    start_logits, end_logits = p.predictions
    start_pred = np.argmax(start_logits, axis=1)
    end_pred = np.argmax(end_logits, axis=1)

    true_start, true_end = p.label_ids
    start_accuracy = accuracy_metric.compute(predictions=start_pred, references=true_start)
    end_accuracy = accuracy_metric.compute(predictions=end_pred, references=true_end)

    start_precision, start_recall, start_f1, _ = precision_recall_fscore_support(true_start, start_pred, average='macro', zero_division=1)
    end_precision, end_recall, end_f1, _ = precision_recall_fscore_support(true_end, end_pred, average='macro', zero_division=1)

    combined_accuracy = (start_accuracy['accuracy'] + end_accuracy['accuracy']) / 2
    combined_precision = (start_precision + end_precision) / 2
    combined_recall = (start_recall + end_recall) / 2
    combined_f1 = (start_f1 + end_f1) / 2

    return {
        "accuracy": combined_accuracy,
        "precision": combined_precision,
        "recall": combined_recall,
        "f1": combined_f1,
    }

optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(tokenized_train) * training_args.num_train_epochs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0903,2.240967,0.7305,0.748975,0.746486,0.705343
2,0.1147,2.616011,0.7345,0.748834,0.74343,0.708569
3,0.1244,2.675136,0.7175,0.709403,0.718617,0.675813


TrainOutput(global_step=1500, training_loss=0.10980621337890625, metrics={'train_runtime': 1984.5861, 'train_samples_per_second': 12.093, 'train_steps_per_second': 0.756, 'total_flos': 4703341621248000.0, 'train_loss': 0.10980621337890625, 'epoch': 3.0})

# PERBANDINGAN 3 BATCH SIZE 8, ADDING GRADIENT ACCUMULATION STEPS 2, LOWER LEARNING RATE

In [12]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_dataset, load_metric
from transformers import AdamW, get_scheduler, TrainingArguments, Trainer, EarlyStoppingCallback, BertForQuestionAnswering
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=2,

)

accuracy_metric = load_metric("accuracy")

def compute_metrics(p):
    start_logits, end_logits = p.predictions
    start_pred = np.argmax(start_logits, axis=1)
    end_pred = np.argmax(end_logits, axis=1)

    true_start, true_end = p.label_ids
    start_accuracy = accuracy_metric.compute(predictions=start_pred, references=true_start)
    end_accuracy = accuracy_metric.compute(predictions=end_pred, references=true_end)

    start_precision, start_recall, start_f1, _ = precision_recall_fscore_support(true_start, start_pred, average='macro', zero_division=1)
    end_precision, end_recall, end_f1, _ = precision_recall_fscore_support(true_end, end_pred, average='macro', zero_division=1)

    combined_accuracy = (start_accuracy['accuracy'] + end_accuracy['accuracy']) / 2
    combined_precision = (start_precision + end_precision) / 2
    combined_recall = (start_recall + end_recall) / 2
    combined_f1 = (start_f1 + end_f1) / 2

    return {
        "accuracy": combined_accuracy,
        "precision": combined_precision,
        "recall": combined_recall,
        "f1": combined_f1,
    }

optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(tokenized_train) * training_args.num_train_epochs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7859,0.653536,0.79625,0.803802,0.805461,0.778986
2,0.5225,0.736596,0.78875,0.791792,0.79153,0.756508
3,0.3553,0.906031,0.7695,0.777048,0.770958,0.73345


TrainOutput(global_step=1500, training_loss=0.5545942891438802, metrics={'train_runtime': 1948.9066, 'train_samples_per_second': 12.315, 'train_steps_per_second': 0.77, 'total_flos': 4703341621248000.0, 'train_loss': 0.5545942891438802, 'epoch': 3.0})

# PERBANDINGAN 4: EVALUATION STRATEGY = STEPS, EVAL STEPS 500, WARMUP STEPS 500

In [13]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_dataset, load_metric
from transformers import AdamW, get_scheduler, TrainingArguments, Trainer, EarlyStoppingCallback, BertForQuestionAnswering
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",   
    eval_steps=500,               
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    warmup_steps=500,             
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.1619,1.224885,0.7705,0.775615,0.772343,0.741632
1000,0.1112,1.352888,0.76825,0.77985,0.771593,0.73445
1500,0.0931,1.739122,0.765,0.775004,0.760971,0.71838


TrainOutput(global_step=1500, training_loss=0.12206663767496745, metrics={'train_runtime': 1991.8386, 'train_samples_per_second': 12.049, 'train_steps_per_second': 0.753, 'total_flos': 4703341621248000.0, 'train_loss': 0.12206663767496745, 'epoch': 3.0})

# PERBANDINGAN 5: LOWER LEARNING RATE, LOWER WEIGHT DECAY, INCREASE GRADIENT ACCUMULATION AND WARMUP STEPS, AND INCREASING DROPOUT RATE

In [19]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_dataset, load_metric
from transformers import AdamW, get_scheduler, TrainingArguments, Trainer, EarlyStoppingCallback, BertForQuestionAnswering
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps", 
    eval_steps=500,
    learning_rate=1e-5,  
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.005,  
    gradient_accumulation_steps=4,  
    warmup_steps=1000,           
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,1.11,0.653521,0.79175,0.835008,0.79479,0.783289


TrainOutput(global_step=750, training_loss=1.1122729899088541, metrics={'train_runtime': 1777.3416, 'train_samples_per_second': 13.503, 'train_steps_per_second': 0.422, 'total_flos': 4703341621248000.0, 'train_loss': 1.1122729899088541, 'epoch': 3.0})

In [None]:
!zip -r /content/model.zip /content/results/checkpoint-17000

  adding: content/results/checkpoint-17000/ (stored 0%)
  adding: content/results/checkpoint-17000/optimizer.pt
zip I/O error: No space left on device
zip error: Output file write failure (write error on zip file)
