In [15]:
from datasets import load_dataset

import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from transformers import default_data_collator

raw_datasets = load_dataset("squad")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [16]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased" # distilbert-base-cased-distilled-squad
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [18]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'What is in front of the Notre Dame Main Building?', 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?', 'What is the Grotto at Notre Dame?', 'What sits on top of the Main Building at Notre Dame?', 'When did the Scholastic Magazine of Notre dame begin publishing?', "How often is Notre Dame's the Juggler published?", 'What is the daily student paper at Notre Dame called?', 'How many student news papers are found at Notre Dame?', 'In what year did the student paper Common Sense begin publication at Notre Dame?', 'Where is the headquarters of the Congregation of the Holy Cross?', 'What is the primary seminary of the Congregation of the Holy Cross?', 'What is the oldest structure at Notre Dame?', 'What individuals live at Fatima House at Notre Dame?', 'Which prize did Frederick Buechner create?', 'How many BS level degrees are offered in the College of Engineering at Notre Dame?', 'In wh

KeyboardInterrupt: 

In [5]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [6]:
"""Hugging Face의 evaluate method 구현"""
from tqdm.auto import tqdm
import collections
import numpy as np
import evaluate

metric = evaluate.load("squad")
n_best = 20
max_answer_length = 30
predicted_answers = []

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)



In [7]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
len(raw_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(10570, 10784)

In [8]:
train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

def make_dataloader(train_dataset, validation_set, batch_size):
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=batch_size,
    )
    eval_dataloader = DataLoader(
        validation_set, collate_fn=default_data_collator, batch_size=batch_size
    )
    return train_dataloader, eval_dataloader

In [9]:
device = "cuda:3" if torch.cuda.is_available() else "cpu"
device

'cuda:3'

In [10]:
import torch
from tqdm.auto import tqdm

def train(n_epoch, model, optimizer, train_dataloader, eval_dataloader):
    print("TRAIN START...")
    
    for epoch in tqdm(range(n_epoch), desc='train'):
        # Training
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch['input_ids'],batch['attention_mask'], batch['start_positions'],batch['end_positions'] = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['start_positions'].to(device), batch['end_positions'].to(device)
            outputs = model(**batch)
            loss = outputs.loss
            
            loss.backward(loss)
            optimizer.step()
            optimizer.zero_grad()
           
    
    # Evaluation
    model.eval()
    start_logits = []
    end_logits = []

    print("EVAL START...")
    
    for batch in tqdm(eval_dataloader, desc='eval'):
        with torch.no_grad():
            batch['input_ids'], batch['attention_mask'] = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            outputs = model(**batch)

        start_logits.append(outputs.start_logits.cpu().numpy())
        end_logits.append(outputs.end_logits.cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(validation_dataset)]
    end_logits = end_logits[: len(validation_dataset)]

    metrics = compute_metrics(
        start_logits, end_logits, validation_dataset, raw_datasets["validation"]
    )
    
    score = int(metrics['exact_match']) + int(metrics['f1'])
    print('EM: ',int(metrics['exact_match']),'\nF1: ',int(metrics['f1']))
    return score
        

In [11]:
#model train
def objective(trial):
    cfg = { 
        'learning_rate': trial.suggest_loguniform('learning_rate', 2e-5, 2e-4),
        'optimizer': trial.suggest_categorical('optimizer', [optim.SGD, optim.Adam, optim.AdamW]),
        'batch_size': 32,
        'n_epoch': trial.suggest_int('n_epoch',3,5,1),
        'seed': 1337
    }
    torch.manual_seed(cfg['seed'])
    
    from transformers import AutoModelForQuestionAnswering

    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    train_dataloader, eval_dataloader = make_dataloader(train_dataset, validation_set, cfg['batch_size'])
    
    optimizer = cfg['optimizer'](model.parameters(), lr=cfg['learning_rate'])
    
    model = model.to(device)
    
    score = train(cfg['n_epoch'], model, optimizer, train_dataloader, eval_dataloader)

    return score
    

In [12]:
sampler = optuna.samplers.TPESampler()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

[I 2023-11-07 11:03:13,361] A new study created in memory with name: no-name-45f47916-f5d0-48e3-84d5-08c3410c93b2
  'learning_rate': trial.suggest_loguniform('learning_rate', 2e-5, 2e-4),
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TRAIN START...


train:   0%|          | 0/6 [00:00<?, ?it/s]

EVAL START...


eval:   0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

[I 2023-11-07 12:36:28,698] Trial 0 finished with value: 156.0 and parameters: {'learning_rate': 4.977906518714672e-05, 'optimizer': <class 'torch.optim.adam.Adam'>, 'n_epoch': 6}. Best is trial 0 with value: 156.0.


156


  'learning_rate': trial.suggest_loguniform('learning_rate', 2e-5, 2e-4),
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TRAIN START...


train:   0%|          | 0/6 [00:00<?, ?it/s]

EVAL START...


eval:   0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

[I 2023-11-07 14:09:49,314] Trial 1 finished with value: 150.0 and parameters: {'learning_rate': 8.625513136142732e-05, 'optimizer': <class 'torch.optim.adamw.AdamW'>, 'n_epoch': 6}. Best is trial 0 with value: 156.0.


150


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TRAIN START...


train:   0%|          | 0/6 [00:00<?, ?it/s]

EVAL START...


eval:   0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

[I 2023-11-07 15:42:47,217] Trial 2 finished with value: 159.0 and parameters: {'learning_rate': 2.5286688647809423e-05, 'optimizer': <class 'torch.optim.adam.Adam'>, 'n_epoch': 6}. Best is trial 2 with value: 159.0.


159


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TRAIN START...


train:   0%|          | 0/7 [00:00<?, ?it/s]

EVAL START...


eval:   0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

[I 2023-11-07 17:31:06,611] Trial 3 finished with value: 137.0 and parameters: {'learning_rate': 0.00016728541307600047, 'optimizer': <class 'torch.optim.adam.Adam'>, 'n_epoch': 7}. Best is trial 2 with value: 159.0.


137


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TRAIN START...


train:   0%|          | 0/5 [00:00<?, ?it/s]

EVAL START...


eval:   0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]

[I 2023-11-07 18:47:38,132] Trial 4 finished with value: 136.0 and parameters: {'learning_rate': 0.00018778065439645835, 'optimizer': <class 'torch.optim.sgd.SGD'>, 'n_epoch': 5}. Best is trial 2 with value: 159.0.


136


In [14]:
# objective의 return값이 가장 "maximize"된 값
# (optuna.create_study의 direction)
print(f'study.best_trial.value : {study.best_trial.value}') 
# objective의 return값이 가장 "maximize"된 파라미터 값
print(f'study.best_params : {study.best_params}')
# 모든 시도 출력(FrozenTrail의 리스트)
print(f'study.trials : {study.trials}')
# 시각화 ( Empirical Distribution Function Plot, EDF, 경험적 분포 함수)
# optuna.visualization.plot_edf(study)

study.best_trial.value : 159.0
study.best_params : {'learning_rate': 2.5286688647809423e-05, 'optimizer': <class 'torch.optim.adam.Adam'>, 'n_epoch': 6}


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [17]:
import joblib

joblib.dump(study, './result_optuna.pkl')

['./result_optuna.pkl']

In [2]:
import joblib

study = joblib.load('./result_optuna.pkl')
df = study.trials_dataframe()
df = df.drop(['datetime_start','datetime_complete','duration'], axis=1)
df.head(3)

Unnamed: 0,number,value,params_learning_rate,params_n_epoch,params_optimizer,state
0,0,156.0,5e-05,6,<class 'torch.optim.adam.Adam'>,COMPLETE
1,1,150.0,8.6e-05,6,<class 'torch.optim.adamw.AdamW'>,COMPLETE
2,2,159.0,2.5e-05,6,<class 'torch.optim.adam.Adam'>,COMPLETE


In [12]:
fig = optuna.visualization.plot_edf(study)
fig.show()


plot_edf is experimental (supported from v2.2.0). The interface can change in the future.



ImportError: Tried to import 'matplotlib' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'matplotlib'.

In [10]:
import optuna
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

# 최적화 기록 시각화 - x : trails, y : objective value
plot_optimization_history(study)
# trial의 학습 곡선 시각화 - x : step, y : intermediate value
plot_intermediate_values(study)
# 고차원 매개변수 관계 시각화 - x : 매개변수, y : objective value
plot_parallel_coordinate(study)	# 아래는 매개변수 선택
#plot_parallel_coordinate(study, params=["bagging_freq", "bagging_fraction"])
# 초매개변수 관계 시각화(매개변수끼리의 관계 시각화)
plot_contour(study)	# 아래는 매개변수 선택
#plot_parallel_coordinate(study, params=["bagging_freq", "bagging_fraction"])
# 개별 하이퍼파라미터를 슬라이스 플롯으로 시각화 - x : 파라미터, y : objective value, spot:trials
plot_slice(study)
#plot_slice(study, params=["bagging_freq", "bagging_fraction"])
# 매개변수 중요도를 시각화 - x : importance for objective value, y : hyperparameter
plot_param_importances(study)
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)	# 어떤 하이퍼파라미터가 trail duration에 영향을 미치는지 중요성 알아보기
# EDF(경험적 분포 함수) 시각화 - x : objective value, y : cumulative probability
plot_edf(study)

[W 2023-11-07 22:52:12,163] You need to set up the pruning feature to utilize `plot_intermediate_values()`
