In [1]:
# Show which GPU is attached to this runtime (name, driver/CUDA version, memory in use).
!nvidia-smi

Tue May  3 12:38:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the workspace folder used below lives under it.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import os
# your workspace in your drive
workspace = 'ML2022-hw7'

# Create the workspace folder if it is missing, then make it the working directory.
# makedirs(..., exist_ok=True) is a no-op when the folder already exists, so no
# try/except is needed and genuine errors (e.g. Drive not mounted) are not hidden.
workspace_dir = os.path.join('/content/gdrive/MyDrive/', workspace)
os.makedirs(workspace_dir, exist_ok=True)
os.chdir(workspace_dir)

In [4]:
# You are allowed to change version of transformers or use other toolkits
# The version is pinned (4.5.0) so every run installs the same transformers release.
!pip install transformers==4.5.0

Collecting transformers==4.5.0
  Downloading transformers-4.5.0-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 14.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 76.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 91.2 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=4bd5b0ab3bf169e944b32b00237d9a9c9724a6d7bcad45e01e645a6cc126c7c9
  Stored in directory: /root/.cache/pip/wheels/87/39/dd/a83eeef36d0bf98e7a4d1933a4ad2d660295a40613079bafc9
Successfully built sacremoses
Installing collected packages: tokenizers, sacremoses, transforme

In [5]:
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset 
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast, get_linear_schedule_with_warmup

from tqdm.auto import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

# Fix random seed for reproducibility
def same_seeds(seed):
    """Seed every random number generator used by the notebook with `seed`.

    Seeds Python's `random`, NumPy's global generator and PyTorch (plus the
    CUDA generators when a GPU is available), then sets the cuDNN flags
    `deterministic = True` and `benchmark = False`.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


same_seeds(24984)

# Change "fp16_training" to True to support automatic mixed precision training (fp16)
fp16_training = True

# When fp16 is requested: install the pinned accelerate release, build an
# Accelerator with fp16 enabled, and take the device from it (this overrides
# the "cuda"/"cpu" string chosen above).
if fp16_training:
    !pip install accelerate==0.2.0
    from accelerate import Accelerator
    accelerator = Accelerator(fp16=True)
    device = accelerator.device

# Documentation for the toolkit:  https://huggingface.co/docs/accelerate/

Collecting accelerate==0.2.0
  Downloading accelerate-0.2.0-py3-none-any.whl (47 kB)
[?25l[K     |███████                         | 10 kB 38.7 MB/s eta 0:00:01[K     |█████████████▉                  | 20 kB 40.1 MB/s eta 0:00:01[K     |████████████████████▉           | 30 kB 22.2 MB/s eta 0:00:01[K     |███████████████████████████▊    | 40 kB 18.2 MB/s eta 0:00:01[K     |████████████████████████████████| 47 kB 4.3 MB/s 
[?25hCollecting pyaml>=20.4.0
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, accelerate
Successfully installed accelerate-0.2.0 pyaml-21.10.1


In [6]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [7]:
def _load_qa_model(checkpoint_dir):
    """Load a question-answering checkpoint from `checkpoint_dir` and move it to `device`."""
    return BertForQuestionAnswering.from_pretrained(checkpoint_dir).to(device)


# The three checkpoints whose predictions are combined later on.
model1 = _load_qa_model("saved_model/doc_stride")
model2 = _load_qa_model("saved_model/202205doc")
model3 = _load_qa_model("saved_model/doc_s_imp")

# One tokenizer is shared by all three models.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-large")

Downloading:   0%|          | 0.00/660 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

In [8]:
def read_data(file):
    """Read a UTF-8 JSON file and return its "questions" and "paragraphs" entries.

    Args:
        file: path of the JSON file to open.

    Returns:
        A tuple `(questions, paragraphs)` taken from the top-level keys
        "questions" and "paragraphs" of the parsed JSON object.

    Raises:
        KeyError: if either key is missing from the file.
    """
    with open(file, 'r', encoding="utf-8") as reader:
        content = json.loads(reader.read())
    questions = content["questions"]
    paragraphs = content["paragraphs"]
    return questions, paragraphs

# Load the test split; the path is relative to the current working directory.
test_questions, test_paragraphs = read_data("hw7_test.json")

In [9]:
# Tokenize questions and paragraphs separately and without special tokens;
# [CLS]/[SEP] are inserted by QA_Dataset when the two parts are joined.
question_texts = [entry["question_text"] for entry in test_questions]
test_questions_tokenized = tokenizer(question_texts, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

In [10]:
# Number of unknown-token occurrences registered with the tokenizer so far.
cnt = 0

# Every question token that the tokenizer mapped to its unknown-token id is
# added to the vocabulary, using the exact characters it covers in the raw text.
# The encodings were computed before this loop, so adding tokens here does not
# change what is being iterated over.
for i, question in enumerate(test_questions):
    encoding = test_questions_tokenized[i]
    for token_id, (char_start, char_end) in zip(encoding.ids, encoding.offsets):
        # Ask the tokenizer for its unknown id instead of hard-coding 100, and
        # avoid naming the loop variable `id`, which would clobber the builtin.
        if token_id != tokenizer.unk_token_id:
            continue
        tokenizer.add_tokens(question["question_text"][char_start:char_end])
        cnt += 1

print(cnt)

41


In [11]:
# Same unknown-token scan as for the questions, now over the paragraphs:
# each paragraph token encoded as the tokenizer's unknown id is added to the
# vocabulary using the characters it spans in the raw paragraph text.
for i, paragraph_text in enumerate(test_paragraphs):
    encoding = test_paragraphs_tokenized[i]
    for token_id, (char_start, char_end) in zip(encoding.ids, encoding.offsets):
        # Ask the tokenizer for its unknown id instead of hard-coding 100, and
        # avoid naming the loop variable `id`, which would clobber the builtin.
        if token_id != tokenizer.unk_token_id:
            continue
        tokenizer.add_tokens(paragraph_text[char_start:char_end])
        cnt += 1

In [12]:
# Resize each model's token-embedding table to the tokenizer's current size,
# which grew when the unknown tokens were added in the cells above.
# NOTE(review): rows for the newly added tokens were presumably not seen during
# fine-tuning of these checkpoints — confirm this is intended.
model1.resize_token_embeddings(len(tokenizer))
model2.resize_token_embeddings(len(tokenizer))
model3.resize_token_embeddings(len(tokenizer))

Embedding(21357, 1024)

In [13]:
def match(answer, left, right):
    """Strip spaces from `answer` and complete a lone bracket with its partner.

    Args:
        answer: predicted answer text.
        left: opening bracket string.
        right: closing bracket string.

    Returns:
        The space-free answer. If it contains exactly one `left` and no
        `right`, `right` is appended; if it contains exactly one `right` and
        no `left`, `left` is prepended; otherwise it is returned as is.
    """
    compact = answer.replace(' ', '')
    bracket_counts = (compact.count(left), compact.count(right))

    if bracket_counts == (1, 0):
        return compact + right
    if bracket_counts == (0, 1):
        return left + compact
    return compact

In [14]:
def evaluate(data, output1, output2, output3, top_k=10):
    """Pick the best answer span for one question from a three-model ensemble.

    Args:
        data: one batch from the test loader. `data[0]` holds the input ids
            and is indexed as `data[0][0][k]` for window `k`;
            `data[0].shape[1]` is the number of windows.
        output1, output2, output3: model outputs exposing `start_logits` and
            `end_logits`, each indexable by window.
        top_k: number of start and end candidates considered per window
            (clamped to the sequence length).

    Returns:
        The decoded answer string with spaces removed and unbalanced
        「」 / 『』 / 《》 brackets completed; '' if no valid span was found.
    """
    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]

    # Ensemble by averaging the logits of the three models.
    # The end average must use end_logits from all three models.
    output_start = (output1.start_logits + output2.start_logits + output3.start_logits) / 3
    output_end = (output1.end_logits + output2.end_logits + output3.end_logits) / 3

    # torch.topk raises if k is larger than the dimension it selects from.
    k_best = min(top_k, output_start.shape[-1])

    for k in range(num_of_windows):
        start_probs, start_indexs = torch.topk(output_start[k], k_best, dim=0)
        end_probs, end_indexs = torch.topk(output_end[k], k_best, dim=0)

        # Convert to plain Python numbers once per window so the inner loops
        # do not touch tensors element by element.
        start_candidates = list(zip(start_probs.tolist(), start_indexs.tolist()))
        end_candidates = list(zip(end_probs.tolist(), end_indexs.tolist()))
        window_ids = data[0][0][k]

        # Score every (start, end) pairing of the top candidates.
        for start_prob, start_index in start_candidates:
            for end_prob, end_index in end_candidates:
                # A span cannot end before it starts.
                if start_index > end_index:
                    continue

                # Reward a high combined score, penalise start/end scores that disagree.
                prob = (start_prob + end_prob) * 50 - abs(start_prob - end_prob) * 10
                if prob < max_prob:
                    continue

                # Decode only spans that survived the cheap checks above.
                tmp = tokenizer.decode(window_ids[start_index : end_index + 1])
                if '[CLS]' in tmp:
                    continue
                max_prob = prob
                answer = tmp

    # Complete unbalanced brackets; match() also removes spaces (e.g. "大 金" --> "大金"),
    # so the bracket strings themselves must not contain spaces.
    answer = match(answer, '「', '」')
    answer = match(answer, '『', '』')
    answer = match(answer, '《', '》')
    return answer.replace(' ','')

In [15]:
class QA_Dataset(Dataset):
    """Dataset that turns (question, paragraph) pairs into fixed-length model inputs.

    For the "train" split each item is a single window around the answer plus
    the answer's start/end token positions inside that window. For any other
    split each item is a stack of windows covering the whole paragraph.
    """

    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs

        self.max_question_len = 50
        self.max_paragraph_len = 350

        ##### TODO: Change value of doc_stride #####
        # Distance between the start positions of consecutive evaluation windows.
        self.doc_stride = 300

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        """Number of questions in the split."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Build the model inputs for question `idx`.

        Returns:
            "train" split: `(input_ids, token_type_ids, attention_mask,
            answer_start_token, answer_end_token)`, the first three as tensors.
            Other splits: `(input_ids, token_type_ids, attention_mask)`, each a
            tensor built from the per-window lists.
        """
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        ##### TODO: Preprocessing #####
        # Hint: How to prevent model from learning something it should not learn

        # Validation/Testing: slide a window over the whole paragraph.
        if self.split != "train":
            # Special tokens: 101 is CLS, 102 is SEP. The question part is the
            # same for every window, so it is built once.
            question_ids = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            windows = [
                self.padding(
                    question_ids,
                    tokenized_paragraph.ids[begin : begin + self.max_paragraph_len] + [102],
                )
                for begin in range(0, len(tokenized_paragraph), self.doc_stride)
            ]
            input_ids_list = [window[0] for window in windows]
            token_type_ids_list = [window[1] for window in windows]
            attention_mask_list = [window[2] for window in windows]
            return (
                torch.tensor(input_ids_list),
                torch.tensor(token_type_ids_list),
                torch.tensor(attention_mask_list),
            )

        # Training: map the answer's character positions to paragraph token positions.
        answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
        answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

        # Choose a window start at a random offset before the answer's midpoint,
        # clamped so the window stays inside the paragraph.
        answer_center = (answer_start_token + answer_end_token) // 2
        jittered_start = int(answer_center - self.max_paragraph_len * random.random())
        latest_start = len(tokenized_paragraph) - self.max_paragraph_len
        window_start = max(0, min(jittered_start, latest_start))
        window_end = window_start + self.max_paragraph_len

        # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
        question_ids = [101] + tokenized_question.ids[:self.max_question_len] + [102]
        paragraph_ids = tokenized_paragraph.ids[window_start:window_end] + [102]

        # Shift the answer positions from paragraph coordinates to window coordinates.
        shift = len(question_ids) - window_start
        answer_start_token = answer_start_token + shift
        answer_end_token = answer_end_token + shift

        input_ids, token_type_ids, attention_mask = self.padding(question_ids, paragraph_ids)
        return (
            torch.tensor(input_ids),
            torch.tensor(token_type_ids),
            torch.tensor(attention_mask),
            answer_start_token,
            answer_end_token,
        )

    def padding(self, input_ids_question, input_ids_paragraph):
        """Join question and paragraph ids and pad with zeros up to `max_seq_len`.

        Returns:
            `(input_ids, token_type_ids, attention_mask)` as Python lists:
            token type 0 for the question part and the padding, 1 for the
            paragraph part; attention mask 1 for real tokens, 0 for padding.
        """
        question_len = len(input_ids_question)
        paragraph_len = len(input_ids_paragraph)
        # A non-positive count yields an empty list, i.e. no padding.
        zeros = [0] * (self.max_seq_len - question_len - paragraph_len)

        input_ids = input_ids_question + input_ids_paragraph + zeros
        token_type_ids = [0] * question_len + [1] * paragraph_len + zeros
        attention_mask = [1] * (question_len + paragraph_len) + zeros

        return input_ids, token_type_ids, attention_mask

# Wrap the tokenized test data; any split other than "train" yields all windows of a paragraph.
test_set = QA_Dataset(
    split="test",
    questions=test_questions,
    tokenized_questions=test_questions_tokenized,
    tokenized_paragraphs=test_paragraphs_tokenized,
)

train_batch_size = 8

# Note: Do NOT change batch size of dev_loader / test_loader !
# Although batch size=1, it is actually a batch consisting of several windows from the same QA pair

test_loader = DataLoader(
    test_set,
    batch_size=1,
    shuffle=False,
    pin_memory=True,
)

In [None]:
print("Evaluating Test Set ...")

result = []

# Put every ensemble member in evaluation mode before inference.
ensemble = (model1, model2, model3)
for member in ensemble:
    member.eval()

with torch.no_grad():
    for data in tqdm(test_loader):
        # Drop the DataLoader's batch dimension (batch size is 1) and move the
        # inputs to the device once; all three models reuse the same tensors.
        input_ids = data[0].squeeze(dim=0).to(device)
        token_type_ids = data[1].squeeze(dim=0).to(device)
        attention_mask = data[2].squeeze(dim=0).to(device)

        output1, output2, output3 = [
            member(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
            for member in ensemble
        ]
        result.append(evaluate(data, output1, output2, output3))

result_file = "ensemble_result.csv"
# Write as UTF-8 explicitly (the answers are Chinese text and the input file is
# read as UTF-8), instead of relying on the platform default encoding.
with open(result_file, 'w', encoding="utf-8") as f:
    f.write("ID,Answer\n")
    for i, test_question in enumerate(test_questions):
        # Replace commas in answers with empty strings (since csv is separated by comma)
        # Answers in kaggle are processed in the same way
        f.write(f"{test_question['id']},{result[i].replace(',','')}\n")

print(f"Completed! Result is in {result_file}")

Evaluating Test Set ...


  0%|          | 0/4957 [00:00<?, ?it/s]