In [1]:
!pip install transformers datasets peft accelerate



In [2]:
"""
Project goal: Fine-tune a lightweight transformer model (distilbert-base-uncased) for extractive question answering using the LoRA (Low-Rank Adaptation) technique, with the aim of improving efficiency and reducing computational overhead during model adaptation.
"""


'\nProject goal :To fine-tune a lightweight transformer model (distilbert-base-uncased) for extractive question answering using the LoRA (Low-Rank Adaptation) technique, with the aim of optimizing efficiency and reducing computational overhead during model adaptation.\n'

In [3]:
# Key objectives
# - Apply LoRA to enable parameter-efficient fine-tuning.
# - Train a lightweight QA model with minimal resources.
# - Maintain performance while reducing memory and storage requirements.

In [4]:
#Data Loading (SQuAD v1.1)

In [5]:
# 1. Import required libraries
import inspect

import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

In [6]:
# 2. Load the SQuAD dataset (Stanford Question Answering Dataset),
#    the question-answering dataset used for fine-tuning in this notebook.
raw_dataset = load_dataset("squad")

In [7]:
#Model and Tokenizer

In [8]:
# 3. Load the tokenizer and the pre-trained DistilBERT checkpoint with a QA head.
# AutoTokenizer / AutoModelForQuestionAnswering are already imported in the
# notebook's import cell, so they are not re-imported here.
model_id = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForQuestionAnswering.from_pretrained(model_id)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# LoRA configuration

In [10]:
# Define the LoRA configuration for parameter-efficient fine-tuning.
# The task type uses the TaskType enum (already imported) rather than a raw
# string, so a typo fails immediately instead of being passed through.
lora_config = LoraConfig(
    r=8,                                # LoRA rank
    lora_alpha=16,                      # LoRA scaling factor
    target_modules=["q_lin", "v_lin"],  # DistilBERT modules that receive LoRA adapters
    lora_dropout=0.1,                   # Dropout probability for the LoRA layers
    task_type=TaskType.QUESTION_ANS,    # Task type: question answering
)

In [11]:
# Wrap the pre-trained model with the LoRA configuration.
# NOTE(review): this rebinds `model` to the wrapped model, so re-running this cell
# presumably wraps an already-wrapped model — re-run the model-loading cell first; confirm.
model = get_peft_model(model, lora_config)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [12]:
#prepare_features_with_labels
"""
Tokenize the question and context to create the model inputs, then
generate labels (start_positions, end_positions) that give the token indices where the answer text appears within the context.
"""

'\nTokenize the question and context to create input data,\nGenerate labels (start_positions, end_positions) that indicate the token indices where the answer text appears within the context.\n'

In [13]:
#Tokenization and mapping
"""
Define two helper functions:

    1) tokenize_examples(examples)
       Tokenizes the question and context and returns the tokenized outputs along with:
         - sample_mapping: mapping from each tokenized chunk back to the original example
         - offset_mapping: character-level positions of each token in the original text

    2) add_answer_positions(...)
       Converts the answer's character-level start and end positions
       into token-level start and end indices,
       and adds them to the tokenized data as start_positions and end_positions.
        
"""

"\nCreate two functions \n   1) tokenize_examples(examples) : Tokenizes the question and context\n    and returns the tokenized outputs along with:\n        -sample_mapping: mapping from each tokenized chunk back to the original example\n        -offset_mapping: character-level positions of each token in the original context\n\n    2)add_answer_positions(...)\n    Converts the answer's character-level start and end positions\n    into token-level start and end indices,\n    and adds them to the tokenized data as start_positions and end_positions.\n\n"

In [14]:
def tokenize_examples(examples, max_length=384, stride=128):
    """Tokenize question/context pairs into fixed-length, possibly overlapping chunks.

    Uses the notebook-level ``tokenizer``. Only the context (second sequence)
    is truncated; a context that does not fit is split into several chunks
    that overlap by ``stride`` tokens, and every chunk is padded to
    ``max_length``.

    Args:
        examples: Mapping with ``"question"`` and ``"context"`` entries
            (parallel lists of strings).
        max_length: Total token length of every chunk (default 384).
        stride: Number of overlapping tokens between consecutive chunks of
            the same context (default 128).

    Returns:
        A 3-tuple ``(tokenized_examples, sample_mapping, offset_mapping)``:
        the tokenizer output with the two mapping entries removed, the
        ``overflow_to_sample_mapping`` entry (chunk index -> example index),
        and the ``offset_mapping`` entry (per-token character spans).
    """
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",        # never truncate the question
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,  # emit extra chunks instead of dropping text
        return_offsets_mapping=True,     # needed later to align answers to tokens
        padding="max_length",
    )

    # The two mappings are only needed for label alignment, so they are
    # removed from the tokenizer output and handed back separately.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    return tokenized_examples, sample_mapping, offset_mapping


In [15]:
def add_answer_positions(tokenized_examples, sample_mapping, offset_mapping, examples):
    """Attach token-level answer labels to every tokenized chunk.

    For each chunk the first answer's character span is converted into start
    and end token indices. Chunks whose example has no answer, or whose
    context window does not fully contain the answer, are labelled with the
    index of the [CLS] token for both start and end.

    Args:
        tokenized_examples: Tokenizer output; must support
            ``["input_ids"]``, ``.sequence_ids(i)`` and item assignment.
        sample_mapping: For each chunk, the index of the example it came from.
        offset_mapping: For each chunk, the per-token ``(start, end)``
            character offsets.
        examples: Mapping with an ``"answers"`` entry; each answer record has
            ``"answer_start"`` and ``"text"`` lists.

    Returns:
        The same ``tokenized_examples`` object, with ``"start_positions"``
        and ``"end_positions"`` added (one value per chunk).
    """
    # Loop-invariant lookups are hoisted. Indexing a column of a datasets
    # Dataset inside the loop may rebuild the whole column on every iteration,
    # which makes full-split preprocessing quadratic.
    all_answers = examples["answers"]
    all_input_ids = tokenized_examples["input_ids"]
    cls_token_id = tokenizer.cls_token_id

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = all_input_ids[i]
        cls_index = input_ids.index(cls_token_id)

        # Values equal to 1 mark context tokens in this chunk.
        sequence_ids = tokenized_examples.sequence_ids(i)
        answers = all_answers[sample_mapping[i]]

        # No answer recorded for this example: label with [CLS].
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Character span of the first answer.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the context window in this chunk.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        answer_in_chunk = (
            offsets[token_start_index][0] <= start_char
            and offsets[token_end_index][1] >= end_char
        )
        if not answer_in_chunk:
            # The answer lies (partly) outside this chunk: label with [CLS].
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Advance past every token that starts at or before the answer start;
        # the token just before the stopping point is the answer's first token.
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Walk back past every token that ends at or after the answer end;
        # the token just after the stopping point is the answer's last token.
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples


In [16]:
# 1. Tokenize the whole training split in a single call and get the
#    chunk-to-example and token-offset mappings.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt —
# presumably because the full split is processed eagerly; confirm.
tokenized_examples, sample_mapping, offset_mapping = tokenize_examples(raw_dataset["train"])

# 2. Add start/end position labels. add_answer_positions returns the object it
#    was given, so `processed_data` and `tokenized_examples` are the same object.
processed_data = add_answer_positions(tokenized_examples, sample_mapping, offset_mapping, raw_dataset["train"])


KeyboardInterrupt: 

In [None]:
# Same two-step preprocessing for the validation split: tokenize into chunks,
# then attach start/end token labels. `val_processed` is the same object as
# `val_tokenized`, because add_answer_positions returns its first argument.
val_tokenized, val_sample_map, val_offset_map = tokenize_examples(raw_dataset["validation"])
val_processed = add_answer_positions(val_tokenized, val_sample_map, val_offset_map, raw_dataset["validation"])

In [None]:
def prepare_features_with_labels(examples):
    """Tokenize one batch of examples and attach start/end token labels.

    Intended as the batched function passed to ``Dataset.map``: it chains
    ``tokenize_examples`` and ``add_answer_positions`` and returns the
    labelled tokenizer output.
    """
    tokenized_examples, sample_mapping, offset_mapping = tokenize_examples(examples)
    return add_answer_positions(tokenized_examples, sample_mapping, offset_mapping, examples)


# Apply the preprocessing function to every split of the dataset:
# - tokenizes questions and contexts with truncation and padding
# - maps character-level answer positions to token-level start/end positions
# - long contexts yield several overlapping chunks, so the output can have
#   more rows than the input
# - the original columns are removed so only the processed features remain
tokenized_dataset = raw_dataset.map(
    prepare_features_with_labels,
    batched=True,
    remove_columns=raw_dataset["train"].column_names,
)


In [None]:
# 6. Set training arguments: output directory, batch sizes, epochs, logging and
#    evaluation cadence.

# Newer transformers releases name this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. The install is unpinned, so use whichever
# keyword the installed TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./lora_qa",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=100,
    # The notebook trains on a 1000-example subset (about 375 optimizer steps at
    # batch size 8 for 3 epochs), so evaluating every 1000 steps would never
    # trigger; evaluate on the same cadence as logging instead.
    eval_steps=100,
    save_strategy="epoch",
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only machines.
    fp16=torch.cuda.is_available(),
    **{_eval_strategy_key: "steps"},
)


In [None]:
# Trainer and default_data_collator come from the notebook's import cell.
# Training and evaluation use small subsets of the preprocessed splits
# (`tokenized_dataset`, produced by the map cell) so the demo finishes quickly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(1000)),
    eval_dataset=tokenized_dataset["validation"].select(range(500)),
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

trainer.train()  # start training


In [None]:
#save model

In [None]:
# Write the trained model and the tokenizer files to the same directory so both
# can be reloaded from one path.
# NOTE(review): `model` was wrapped by get_peft_model, so presumably only the
# LoRA adapter weights (not the full base model) are written here — confirm.
trainer.save_model("./lora_qa_model")
tokenizer.save_pretrained("./lora_qa_model")

In [None]:
#Evaluation

In [None]:
# Run evaluation on the Trainer's eval_dataset and print the returned metrics.
# NOTE(review): no compute_metrics function is passed to the Trainer, so this
# presumably reports loss/runtime figures only, not SQuAD EM/F1 — confirm.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
#prediction/Inference

In [None]:
from transformers import pipeline

# Build a question-answering pipeline around the fine-tuned model and tokenizer.
qa_pipeline = pipeline(task="question-answering", model=model, tokenizer=tokenizer)

# Single smoke-test query; the context string is a placeholder to be replaced
# with a real passage.
qa_inputs = {"question": "What is LoRA?", "context": "LoRA is ..."}
result = qa_pipeline(**qa_inputs)
print(result)
