In [219]:
!pip install --upgrade transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [220]:
!pip show transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: transformers
Version: 4.53.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /Users/jessicahong/.pyenv/versions/3.11.11/lib/python3.11/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers, trl


In [221]:
from transformers import TrainingArguments
print(TrainingArguments.__module__)

transformers.training_args


In [222]:
#Loading Data use SQuAD v2)

In [223]:
from datasets import load_dataset
raw_dataset = load_dataset("squad_v2")

In [224]:
#preprocessing (prepare_features_with_labels)

In [225]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [226]:
def tokenize_inputs(examples):
    """
    Tokenize the input examples for question answering.
    
    Args:
        examples (dict): A batch of examples containing 'question' and 'context' keys.
        
    Returns:
        tokenized_examples (BatchEncoding): Tokenized inputs with overflow handling.
        sample_mapping (list[int]): Maps tokenized chunks back to original examples.
        offset_mapping (list[list[tuple[int, int]]]): Character start/end positions for each token.
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",      # Truncate only the context (second sequence=context)
        max_length=384,                # Max sequence length
        stride=128,                   # Overlap tokens for long contexts
        return_overflowing_tokens=True,
        return_offsets_mapping=True,  # Provide character offsets for tokens
        padding="max_length"          # Pad to max_length
    )
    
    # Extract auxiliary info for mapping tokenized chunks back to original examples
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    
    return tokenized_examples, sample_mapping, offset_mapping


In [254]:
#To confirm

In [255]:
example = {
    "id": ["1"],
    "question": ["When did Beyoncé become popular?"],
    "context": ["Beyoncé Giselle Knowles-Carter was born in 1981 and became famous in the late 1990s as part of Destiny's Child."]
}

In [256]:
tokenized, sample_mapping, offset_mapping = tokenize_inputs(example)

In [259]:
from pprint import pprint
print("Input IDs:")
pprint(tokenized["input_ids"][0][:10])

Input IDs:
[101, 2043, 2106, 20773, 2468, 2759, 1029, 102, 20773, 21025]


In [261]:
print("\nSample Mapping:")
pprint(sample_mapping)


Sample Mapping:
[0]


In [262]:
print("\nOffset Mapping:")
pprint(offset_mapping[0][:10]) 


Offset Mapping:
[(0, 0),
 (0, 4),
 (5, 8),
 (9, 16),
 (17, 23),
 (24, 31),
 (31, 32),
 (0, 0),
 (0, 7),
 (8, 10)]


In [263]:
print("\nDecoded Tokens:")
print(tokenizer.decode(tokenized["input_ids"][0])


Decoded Tokens:
[CLS] when did beyonce become popular? [SEP] beyonce giselle knowles - carter was born in 1981 and became famous in the late 1990s as part of destiny ' s child. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [265]:
#def prepare_features_with_labels(examples) identifies the start and positions of the answer within the tokenized inuts and assigns them as labels for training

In [264]:
def prepare_features_with_labels(examples):
    """
    Tokenizes the input examples and computes the start and end token positions 
    of the answer within the context, suitable for training a QA model.

    Args:
        examples (dict): A batch of examples containing 'question', 'context', and 'answers'.

    Returns:
        tokenized_examples (dict): Tokenized input with added 'start_positions' and 'end_positions' for training.
    """
    # Tokenize examples using sliding window and truncating only the context (second sequence)
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Extract mappings to link back to the original examples
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    #offsets : information for the current tokenized feature
    #Each token is mapped to its corresponding character span in the original context
    #i = the index of the current tokenized feature
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i] #The list of token IDs for the current tokenized feature.
        cls_index = input_ids.index(tokenizer.cls_token_id) #The position of the [CLS] token

        # sequence_ids: 0 = question, 1 = context, None = special tokens (CLS, SEP, etc.)
        """
        sequence_ids is a list that indicates which part of the input each token comes from.
        0 means the token belongs to the question
        1 means the token belongs to the context
        None means the token is a special token (e.g., [CLS], [SEP], [PAD]).
        """
        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:  
        # When the question has no answers (unanswerable question)
        # Since there is no answer, set the start and end positions to the [CLS] token index
        # This helps the model learn to predict "no answer" by pointing to the [CLS] token
            start_positions.append(cls_index)  # Start position set to CLS token index
            end_positions.append(cls_index)    # End position also set to CLS token index
        else:
        # When there is an answer, get the character start index of the answer text in the context
            start_char = answers["answer_start"][0]             # Character index where answer starts
            end_char = start_char + len(answers["text"][0])     # Character index where answer ends (start + length of answer)

            # Find the start of the context tokens
            token_start_index = 0
            # Iterate until we find the first token that belongs to the context
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # Find the end of the context tokens
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                # Move to the next token index
                token_end_index -= 1
                # After the loop, token_start_index points to the first token of the context sequence

            # If the answer is outside the span of context tokens, set positions to CLS
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Find the start token index corresponding to the answer
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                # Find the end token index corresponding to the answer
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    # Attach start and end positions to tokenized examples
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples


In [267]:
for i in range(1):
    tokens = tokenizer.convert_ids_to_tokens(tokenized_examples["input_ids"][i])
    seq_ids = tokenized_examples.sequence_ids(i)
    print(f"Feature {i}:")
    for token, seq_id in zip(tokens, seq_ids):
        print(f"  {token}: {seq_id}")
    print()


Feature 0:
  [CLS]: None
  when: 0
  did: 0
  beyonce: 0
  start: 0
  becoming: 0
  popular: 0
  ?: 0
  [SEP]: None
  beyonce: 1
  gi: 1
  ##selle: 1
  knowles: 1
  -: 1
  carter: 1
  (: 1
  /: 1
  bi: 1
  ##ː: 1
  ##ˈ: 1
  ##j: 1
  ##ɒ: 1
  ##nse: 1
  ##ɪ: 1
  /: 1
  bee: 1
  -: 1
  yo: 1
  ##n: 1
  -: 1
  say: 1
  ): 1
  (: 1
  born: 1
  september: 1
  4: 1
  ,: 1
  1981: 1
  ): 1
  is: 1
  an: 1
  american: 1
  singer: 1
  ,: 1
  songwriter: 1
  ,: 1
  record: 1
  producer: 1
  and: 1
  actress: 1
  .: 1
  born: 1
  and: 1
  raised: 1
  in: 1
  houston: 1
  ,: 1
  texas: 1
  ,: 1
  she: 1
  performed: 1
  in: 1
  various: 1
  singing: 1
  and: 1
  dancing: 1
  competitions: 1
  as: 1
  a: 1
  child: 1
  ,: 1
  and: 1
  rose: 1
  to: 1
  fame: 1
  in: 1
  the: 1
  late: 1
  1990s: 1
  as: 1
  lead: 1
  singer: 1
  of: 1
  r: 1
  &: 1
  b: 1
  girl: 1
  -: 1
  group: 1
  destiny: 1
  ': 1
  s: 1
  child: 1
  .: 1
  managed: 1
  by: 1
  her: 1
  father: 1
  ,: 1
  mathew: 1
  knowles: 

In [229]:
#subset = raw_dataset["train"].select(range(5))

In [230]:
#print(subset)

In [231]:
tokenized_examples, sample_mapping, offset_mapping = tokenize_inputs(subset)
processed_data = add_token_labels(tokenized_examples, sample_mapping, offset_mapping, subset)

In [268]:
print("input_ids (first sample):", processed_data["input_ids"][0])
print("start_position:", processed_data["start_positions"][0])
print("end_position:", processed_data["end_positions"][0])

input_ids (first sample): [101, 2043, 2106, 20773, 2707, 3352, 2759, 1029, 102, 20773, 21025, 19358, 22815, 1011, 5708, 1006, 1013, 12170, 23432, 29715, 3501, 29678, 12325, 29685, 1013, 10506, 1011, 10930, 2078, 1011, 2360, 1007, 1006, 2141, 2244, 1018, 1010, 3261, 1007, 2003, 2019, 2137, 3220, 1010, 6009, 1010, 2501, 3135, 1998, 3883, 1012, 2141, 1998, 2992, 1999, 5395, 1010, 3146, 1010, 2016, 2864, 1999, 2536, 4823, 1998, 5613, 6479, 2004, 1037, 2775, 1010, 1998, 3123, 2000, 4476, 1999, 1996, 2397, 4134, 2004, 2599, 3220, 1997, 1054, 1004, 1038, 2611, 1011, 2177, 10461, 1005, 1055, 2775, 1012, 3266, 2011, 2014, 2269, 1010, 25436, 22815, 1010, 1996, 2177, 2150, 2028, 1997, 1996, 2088, 1005, 1055, 2190, 1011, 4855, 2611, 2967, 1997, 2035, 2051, 1012, 2037, 14221, 2387, 1996, 2713, 1997, 20773, 1005, 1055, 2834, 2201, 1010, 20754, 1999, 2293, 1006, 2494, 1007, 1010, 2029, 2511, 2014, 2004, 1037, 3948, 3063, 4969, 1010, 3687, 2274, 8922, 2982, 1998, 2956, 1996, 4908, 2980, 2531, 2193, 10

In [233]:
#Training Model

In [269]:
tokenized_dataset = raw_dataset.map(
    function=prepare_features_with_labels,  # Apply the function that tokenizes inputs and sets start/end labels
    batched=True,                          # Process the dataset in batches for efficiency
    remove_columns=raw_dataset["train"].column_names  # Remove original columns after processing to keep only tokenized features
)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [235]:
#Load Model

In [270]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# 1. Set the model name or identifier
model_id = "distilbert-base-uncased"

# 2. Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 3. Load the pre-trained Question Answering model
base_model = AutoModelForQuestionAnswering.from_pretrained(model_id)



Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [237]:
#LoRA setting

In [271]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForQuestionAnswering

# 1. Load the pre-trained DistilBERT model for Question Answering
base_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# 2. Configure LoRA (Low-Rank Adaptation)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_lin", "v_lin"],  # Modules in DistilBERT where LoRA will be applied
    lora_dropout=0.1,
    task_type="QUESTION_ANS"     # Task type set as a string for version compatibility
)

# 3. Create a PEFT model by applying LoRA to the base model
model = get_peft_model(base_model, lora_config)


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [274]:
from peft import TaskType
print(list(TaskType))

SyntaxError: invalid syntax (4012217190.py, line 2)

In [273]:
#D

In [275]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora_qa",                # Directory to save model checkpoints and outputs
    per_device_train_batch_size=8,         # Batch size for training on each device (GPU/CPU)
    per_device_eval_batch_size=8,          # Batch size for evaluation on each device
    num_train_epochs=2,                    # Number of training epochs
    logging_steps=50,                      # Log training info every 50 steps
    do_eval=True,                         # Enable evaluation during training
    eval_steps=500,                       # Evaluate the model every 500 steps
    save_steps=500,                       # Save a checkpoint every 500 steps
    save_total_limit=2,                   # Maximum number of checkpoints to keep
    remove_unused_columns=False           # Keep all columns from the dataset (avoid removing unused columns)
)


In [276]:
model.print_trainable_parameters()

trainable params: 148,994 || all params: 66,513,412 || trainable%: 0.2240


In [277]:
#Trainer

In [278]:
from transformers import Trainer, default_data_collator

# Trainer 객체 정의
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(1000)),
    eval_dataset=tokenized_dataset["validation"].select(range(500)),
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,5.9116
100,5.7984
150,5.6824
200,5.5732
250,5.5002


TrainOutput(global_step=250, training_loss=5.69317431640625, metrics={'train_runtime': 66.3123, 'train_samples_per_second': 30.16, 'train_steps_per_second': 3.77, 'total_flos': 196666214400000.0, 'train_loss': 5.69317431640625, 'epoch': 2.0})

In [245]:
print(tokenized_dataset["train"][0])

{'input_ids': [101, 2043, 2106, 20773, 2707, 3352, 2759, 1029, 102, 20773, 21025, 19358, 22815, 1011, 5708, 1006, 1013, 12170, 23432, 29715, 3501, 29678, 12325, 29685, 1013, 10506, 1011, 10930, 2078, 1011, 2360, 1007, 1006, 2141, 2244, 1018, 1010, 3261, 1007, 2003, 2019, 2137, 3220, 1010, 6009, 1010, 2501, 3135, 1998, 3883, 1012, 2141, 1998, 2992, 1999, 5395, 1010, 3146, 1010, 2016, 2864, 1999, 2536, 4823, 1998, 5613, 6479, 2004, 1037, 2775, 1010, 1998, 3123, 2000, 4476, 1999, 1996, 2397, 4134, 2004, 2599, 3220, 1997, 1054, 1004, 1038, 2611, 1011, 2177, 10461, 1005, 1055, 2775, 1012, 3266, 2011, 2014, 2269, 1010, 25436, 22815, 1010, 1996, 2177, 2150, 2028, 1997, 1996, 2088, 1005, 1055, 2190, 1011, 4855, 2611, 2967, 1997, 2035, 2051, 1012, 2037, 14221, 2387, 1996, 2713, 1997, 20773, 1005, 1055, 2834, 2201, 1010, 20754, 1999, 2293, 1006, 2494, 1007, 1010, 2029, 2511, 2014, 2004, 1037, 3948, 3063, 4969, 1010, 3687, 2274, 8922, 2982, 1998, 2956, 1996, 4908, 2980, 2531, 2193, 1011, 2028, 38

In [246]:
#Evaluation

In [279]:
eval_results = trainer.evaluate()
print(eval_results)



{'eval_loss': 5.581022262573242, 'eval_runtime': 7.7393, 'eval_samples_per_second': 64.605, 'eval_steps_per_second': 8.14, 'epoch': 2.0}


In [280]:
from sklearn.metrics import accuracy_score

# 예측값 리스트, 정답 리스트 (둘 다 텍스트)
predictions = ["in the late 1990s", "another answer", "some answer"]
references = ["in the late 1990s", "correct answer", "some answer"]

# 정확도 계산
acc = accuracy_score(references, predictions)
print(f"Accuracy: {acc:.4f}")


Accuracy: 0.6667


In [281]:
import re
import string
from collections import Counter

def normalize_text(text):
    """텍스트 정규화: 소문자, 구두점 제거, 공백 정리 등"""
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = " ".join(text.split())
    return text

def compute_exact(a_gold, a_pred):
    return int(normalize_text(a_gold) == normalize_text(a_pred))

def compute_f1(a_gold, a_pred):
    gold_toks = normalize_text(a_gold).split()
    pred_toks = normalize_text(a_pred).split()
    common = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    f1 = 2 * precision * recall / (precision + recall)
    return f1

# 예시 리스트
predictions = ["in the late 1990s", "another answer", "some answer"]
references = ["in the late 1990s", "correct answer", "some answer"]

em_scores = [compute_exact(ref, pred) for ref, pred in zip(references, predictions)]
f1_scores = [compute_f1(ref, pred) for ref, pred in zip(references, predictions)]

print(f"Exact Match: {sum(em_scores)/len(em_scores):.4f}")
print(f"F1 Score: {sum(f1_scores)/len(f1_scores):.4f}")


Exact Match: 0.6667
F1 Score: 0.8333
