# install
```bash
pyenv shell 3.8.10
python -m venv venv-trans
source venv-trans/bin/activate
pip install --upgrade pip
pip install jupyter


#pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html

#pip install torch==1.8.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.10.2 --extra-index-url https://download.pytorch.org/whl/cu111

pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 --extra-index-url https://download.pytorch.org/whl/cu111

pip install pytorch-transformers
```
Install apex
```bash
git clone https://github.com/NVIDIA/apex
cd apex
#pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
# https://github.com/NVIDIA/apex/issues/633
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" ./
```

# Train

In [None]:
!wget 'https://data.deepai.org/squad1.1.zip'

In [1]:
# download SQuAD 2.0
!wget 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json'
# download SQuAD 1.1
!wget 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json'
!wget 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'

--2022-06-05 22:47:03--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.110.153, 185.199.108.153, ...
접속 rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... 접속됨.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2022-06-05 22:47:06 (20.7 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]



In [7]:
!wget -nc 'https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad.py'
!wget -nc 'https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad_evaluate.py'

File ‘utils_squad.py’ already there; not retrieving.

File ‘utils_squad_evaluate.py’ already there; not retrieving.



In [23]:
import time

import os
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler

#from apex.optimizers import FP16_Optimizer, FusedAdam
from apex.contrib.optimizers import FP16_Optimizer, FusedAdam

from pytorch_transformers import BertForQuestionAnswering, BertTokenizer

from utils_squad import (read_squad_examples, convert_examples_to_features)

import random
import numpy as np

In [14]:
num_train_epochs = 2
train_batch_size = 32

#SQUAD_DIR = '/root/BERT/SQuAD1'
SQUAD_DIR = './BERT/SQuAD1'
#OUTPUT_DIR = '/root/BERT/output'
OUTPUT_DIR = './BERT/output'

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [15]:
# 1. Load the training data from JSON
train_file = SQUAD_DIR + '/train-v1.1.json'
train_examples = read_squad_examples(train_file, is_training = True, 
                                     version_2_with_negative = False)

In [17]:
# 2. Tokenize the training data
#tokenizer = BertTokenizer(vocab_file="bert-base-uncased-vocab.txt")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_features = convert_examples_to_features(train_examples, tokenizer, 
                                              max_seq_length=384, doc_stride=128, 
                                              max_query_length=64, is_training=True)


100%|████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 395010.55B/s]


In [19]:
# 3. Get the tokenized data ready for training the model
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)


In [21]:
# 4. Initialize the BERT-based model for Question Answering
#    Using half-precision (FP16) for the model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.half()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

100%|███████████████████████████████████████████████████████████████████████| 433/433 [00:00<00:00, 99996.35B/s]
100%|█████████████████████████████████████████████████████████| 440473133/440473133 [03:18<00:00, 2216315.58B/s]


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): FusedLayerNorm(torch.Size

In [24]:
# 5. Prepare the optimizer (using mixed precision)
param_optimizer = list(model.named_parameters())
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=3e-5,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)



This fp16_optimizer is designed to only work with apex.contrib.optimizers.*
To update, use updated optimizers with AMP.


In [25]:
# 6. Train the model
model.train()

start_time = time.time()

for epoch in range(num_train_epochs):
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch) 
        input_ids, input_mask, segment_ids, start_positions, end_positions = batch
        outputs = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
        loss = outputs[0]  

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()
     
    if epoch==0:
        print("Time it took to complete the first training epoch: ", (time.time()-start_time))
    print("Loss after epoch ", epoch, ": ", loss.item())

Time it took to complete the first training epoch:  639.8684318065643
Loss after epoch  0 :  0.39306640625
Loss after epoch  1 :  0.4365234375


In [26]:
# 7. Save the trained model to OUTPUT_DIR 
#    (Create the directory if it does not exist; otherwise override the contents)
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(OUTPUT_DIR)

In [27]:
import os

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)

from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions

from pytorch_transformers import BertForQuestionAnswering, BertTokenizer

In [28]:
# 1. Load a trained model

#OUTPUT_DIR = '/root/BERT/test_output'
OUTPUT_DIR = './BERT/output'
model = BertForQuestionAnswering.from_pretrained(OUTPUT_DIR)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): FusedLayerNorm(torch.Size

In [30]:
# 2. Load and pre-process the test set

dev_file = "./BERT/SQuAD1/dev-v1.1.json"
predict_batch_size = 32

eval_examples = read_squad_examples(input_file=dev_file, is_training=False, version_2_with_negative=False)

#tokenizer = BertTokenizer(vocab_file="bert-base-uncased-vocab.txt")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False)

all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=predict_batch_size)

In [31]:
# 3. Run inference on the test set

model.eval()
all_results = []
for input_ids, input_mask, segment_ids, example_indices in eval_dataloader:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    with torch.no_grad():
        batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
    for i, example_index in enumerate(example_indices):
        start_logits = batch_start_logits[i].detach().cpu().tolist()
        end_logits = batch_end_logits[i].detach().cpu().tolist()
        eval_feature = eval_features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        
output_prediction_file = os.path.join(OUTPUT_DIR, "predictions.json")
output_nbest_file = os.path.join(OUTPUT_DIR, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(OUTPUT_DIR, "null_odds.json")

preds = write_predictions(eval_examples, eval_features, all_results, 20,
                      30, True, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, True,
                      False, 0.0)

In [32]:
!python evaluate-v1.1.py ./BERT/SQuAD1/dev-v1.1.json ./BERT/output/predictions.json

{"exact_match": 78.6092715231788, "f1": 86.61794924271041}


# 분석

`train_examples` from `read_squad_examples`

In [35]:
len(train_examples)

87599

In [37]:
train0 = train_examples[0]

In [41]:
type(train0)

utils_squad.SquadExample

In [47]:
print(train0.qas_id)
print(' '.join(train0.doc_tokens))
print(train0.question_text)
print(train0.orig_answer_text)
print(train0.start_position)
print(train0.end_position)
print(train0.is_impossible)

5733be284776f41900661182
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Saint Bernadette Soubirous
90
92
False


In [48]:
len(train_features)

88641

In [53]:
len(train_features)-len(train_examples)

1042

In [54]:
train_features[88640].unique_id

1000088640

In [56]:
t_f2 = convert_examples_to_features(train_examples, tokenizer, 
                                              max_seq_length=384, doc_stride=128, 
                                              max_query_length=64, is_training=True)