In [1]:
import os
import json
import re
import collections

In [2]:
# Location of the Wikipedia document dump used as the retrieval corpus.
data_path = "/opt/ml/project/odqa/data/"
context_path = "wikipedia_documents.json"

# Read the whole JSON file into memory; `wiki` is a mapping whose values
# are per-document records (see the cell that builds `wiki_data`).
wiki_file = os.path.join(data_path, context_path)
with open(wiki_file, mode="r", encoding="utf-8") as f:
    wiki = json.loads(f.read())

In [3]:
# Collect the raw text of every document, in the mapping's iteration order.
# Only the 'text' field is needed here; the previously extracted
# 'document_id' was never used, so it is no longer read.
wiki_data = [document['text'] for document in wiki.values()]

In [4]:
import sys
import logging
import os
import sys
import re

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from typing import List, Callable, NoReturn, NewType, Any
import dataclasses
from datasets import load_metric, load_from_disk, Dataset, DatasetDict

from transformers import AutoConfig, AutoModel, AutoModelForQuestionAnswering, AutoTokenizer, BertTokenizer, BertTokenizerFast
from transformers import (
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)

In [5]:
sys.path.append('../')

In [6]:
from arguments import (
    ModelArguments,
    DataTrainingArguments,
)

In [7]:
# NOTE(review): this binds the ModelArguments class itself, not an instance,
# so the attribute reads below resolve to class-level values - confirm that
# this is intended rather than `ModelArguments()`.
model_args = ModelArguments

# Prefer an explicit config name; otherwise fall back to the model path.
if model_args.config_name is not None:
    config_source = model_args.config_name
else:
    config_source = model_args.model_name_or_path

config = AutoConfig.from_pretrained(config_source)
print(config)

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.11.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}





In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [9]:
# Binds the DataTrainingArguments class itself (no call parentheses), so later
# attribute reads such as data_args.max_seq_length resolve to class-level values.
data_args = DataTrainingArguments

In [10]:
# Load the bare klue/roberta-large encoder; its hidden states are the input
# for the head experiments in the following cells.
bert_model = AutoModel.from_pretrained('klue/roberta-large')

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [11]:
# Prefer an explicit tokenizer name; otherwise reuse the model path.
tokenizer_source = model_args.tokenizer_name
if tokenizer_source is None:
    tokenizer_source = model_args.model_name_or_path

tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, use_fast=True)

In [12]:
# Report the tokenizer length, then its full repr.
tokenizer_size = len(tokenizer)
print('Tokenizer Size : %d' % tokenizer_size)
print(tokenizer)

Tokenizer Size : 32000
PreTrainedTokenizerFast(name_or_path='klue/roberta-large', vocab_size=32000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [13]:
# Fixed sequence length for the sample input. The ConvLayer/ConvNet heads in
# this notebook are constructed with seq_size=384, so the encoded sequence
# must be exactly this long.
max_seq_length = 384

# padding='max_length' only pads short documents; truncation=True is required
# so that a document longer than max_seq_length is cut down instead of
# producing a longer tensor that the fixed-size heads cannot accept.
tokenizer_output = tokenizer(
    wiki_data[0],
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=max_seq_length,
)
# Keep a plain dict of the encoded tensors, as before.
tokenizer_output = dict(tokenizer_output)

# Keyword arguments for the encoder / QA model forward pass.
model_input = {
    name: tokenizer_output[name]
    for name in ('input_ids', 'attention_mask', 'token_type_ids')
}

In [14]:
# Run the encoder on the sample and keep the first element of its output
# (the per-token hidden states used by the heads below).
encoder_outputs = bert_model(**model_input)
model_output = encoder_outputs[0]
print('Model output shape : {}'.format(model_output.shape))

Model output shape : torch.Size([1, 384, 1024])


## CNN Head

In [15]:
class ConvLayer(nn.Module):
    """Residual 1-D convolution block followed by layer normalisation.

    Dimension 1 of the input (size ``seq_size``) is used as the Conv1d
    channel axis, so the convolution kernels slide along the last
    dimension (size ``feature_size``). The output has the same shape as
    the input.

    Args:
        seq_size: size of input dimension 1 (Conv1d in/out channels).
        feature_size: size of the last input dimension (normalised axis).
        intermediate_size: channel count between the two convolutions.
    """

    def __init__(self, seq_size, feature_size, intermediate_size):
        super().__init__()
        # Width-5 convolution (padding 2 keeps the length) expanding the
        # channels, then a width-1 convolution projecting them back.
        expand = nn.Conv1d(
            in_channels=seq_size,
            out_channels=intermediate_size,
            kernel_size=5,
            padding=2,
        )
        project = nn.Conv1d(
            in_channels=intermediate_size,
            out_channels=seq_size,
            kernel_size=1,
        )
        self.conv_layer = nn.Sequential(expand, project, nn.ReLU())
        self.layer_norm = nn.LayerNorm(feature_size, eps=1e-6)

    def forward(self, x):
        """Return LayerNorm(x + conv_layer(x))."""
        residual = self.conv_layer(x)
        return self.layer_norm(x + residual)

In [16]:
class ConvNet(nn.Module):
    """Sequential stack of ``layer_size`` ConvLayer blocks.

    Args:
        layer_size: number of ConvLayer blocks to stack.
        seq_size, feature_size, intermediate_size: forwarded unchanged to
            every ConvLayer.
    """

    def __init__(self, layer_size, seq_size, feature_size, intermediate_size):
        super().__init__()
        blocks = []
        for _ in range(layer_size):
            blocks.append(ConvLayer(seq_size, feature_size, intermediate_size))
        self.conv_net = nn.Sequential(*blocks)
        self.init_weights()

    def init_weights(self):
        """Apply Kaiming-uniform init to trainable parameters with more than one dimension."""
        for param in self.parameters():
            # Skip frozen parameters and 1-D ones (biases, norm scales).
            if not param.requires_grad or param.dim() <= 1:
                continue
            nn.init.kaiming_uniform_(param)

    def forward(self, x):
        """Run ``x`` through all stacked blocks in order."""
        return self.conv_net(x)

In [17]:
# Smoke-test a single ConvLayer on the encoder output; the printed shape
# should match the input shape.
conv_layer_kwargs = dict(seq_size=384, feature_size=1024, intermediate_size=512)
sds_conv_layer = ConvLayer(**conv_layer_kwargs)

cnn_layer_output = sds_conv_layer(model_output)
print(cnn_layer_output.shape)

torch.Size([1, 384, 1024])


In [18]:
# Smoke-test a 3-block ConvNet on the encoder output.
conv_net_kwargs = dict(
    layer_size=3,
    seq_size=384,
    feature_size=1024,
    intermediate_size=512,
)
sds_conv_net = ConvNet(**conv_net_kwargs)

cnn_output = sds_conv_net(model_output)
print(cnn_output.shape)

torch.Size([1, 384, 1024])


## LSTM Head

In [19]:
# Display the encoder output shape that the LSTM head will receive.
model_output.shape

torch.Size([1, 384, 1024])

In [33]:
class LSTMHead(nn.Module):
    """Bidirectional, multi-layer LSTM head over a batch-first sequence.

    Args:
        layer_size: number of stacked LSTM layers.
        feature_size: size of the last dimension of the input.
        intermediate_size: hidden size per direction; because the LSTM is
            bidirectional the output's last dimension is
            ``2 * intermediate_size``.
    """

    def __init__(self, layer_size, feature_size, intermediate_size):
        super(LSTMHead, self).__init__()
        self.layer_size = layer_size
        self.feature_size = feature_size
        self.intermediate_size = intermediate_size

        self.lstm = torch.nn.LSTM(
            input_size=feature_size,
            hidden_size=intermediate_size,
            num_layers=layer_size,
            batch_first=True,
            # Inter-layer dropout only exists between stacked layers; passing
            # a non-zero value with a single layer just triggers a warning.
            dropout=0.1 if layer_size > 1 else 0.0,
            bidirectional=True,
        )
        self.init_weights()

    def init_weights(self):
        """Apply Kaiming-uniform init to trainable parameters with more than one dimension."""
        for p in self.parameters():
            if p.requires_grad and p.dim() > 1:
                nn.init.kaiming_uniform_(p)

    def forward(self, x):
        """Run the LSTM over ``x`` and return the per-step outputs.

        Args:
            x: batch-first input whose last dimension is ``feature_size``.

        Returns:
            Tensor whose last dimension is ``2 * intermediate_size``.
        """
        # With no explicit initial state nn.LSTM starts from zeros that match
        # the input's device and dtype. Hand-built torch.zeros(...) states
        # live on the default device/dtype and break for GPU or non-float32
        # inputs.
        y, _ = self.lstm(x)
        return y

In [36]:
# Positional arguments: layer_size=3, feature_size=1024, intermediate_size=512.
lstm_head = LSTMHead(3, 1024, 512)

In [37]:
# Smoke-test the LSTM head on the encoder output and display the result shape.
lstm_output = lstm_head(model_output)
lstm_output.shape

torch.Size([1, 384, 1024])


torch.Size([1, 384, 1024])

## Check Model

In [36]:
from transformers import BertPreTrainedModel, BertModel, RobertaModel, RobertaPreTrainedModel
from transformers.modeling_outputs import QuestionAnsweringModelOutput

In [42]:
class SDSNetForQuestionAnswering(RobertaPreTrainedModel):
    """Extractive QA model: RoBERTa encoder -> ConvNet head -> start/end classifier.

    Args:
        model_name: name or path passed to ``RobertaModel.from_pretrained``.
        data_args: object providing ``cnn_layer_size``, ``max_seq_length``
            and ``cnn_intermediate_size`` for the ConvNet head.
        config: model configuration (``hidden_size`` and ``num_labels`` are
            read here).
    """

    def __init__(self, model_name, data_args, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Pretrained encoder, loaded without its pooling layer.
        self.bert = RobertaModel.from_pretrained(
            model_name,
            config=config,
            add_pooling_layer=False,
        )

        # The head is built with seq_size=max_seq_length.
        # NOTE(review): inputs are therefore presumably expected to be padded
        # to exactly max_seq_length - confirm against the data pipeline.
        self.cnn_head = ConvNet(
            layer_size=data_args.cnn_layer_size,
            seq_size=data_args.max_seq_length,
            feature_size=config.hidden_size,
            intermediate_size=data_args.cnn_intermediate_size,
        )
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Compute start/end logits and, when both positions are given, the span loss.

        Returns a ``QuestionAnsweringModelOutput`` when ``return_dict`` is
        truthy (falling back to ``config.use_return_dict`` when it is None),
        otherwise a tuple ``(loss?, start_logits, end_logits, *outputs[2:])``.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First encoder output: hidden states for every token,
        # (batch_size, seq_size, hidden_size), refined by the CNN head.
        token_states = self.cnn_head(outputs[0])

        # (batch_size, seq_size, num_labels) -> two (batch_size, seq_size) tensors
        logits = self.qa_outputs(token_states)
        start_logits, end_logits = (
            part.squeeze(-1).contiguous() for part in logits.split(1, dim=-1)
        )

        total_loss = None
        if not (start_positions is None or end_positions is None):
            # Positions may carry an extra trailing dimension (e.g. from
            # multi-GPU gathering); drop it so they are (batch_size,).
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # Positions outside the sequence are clamped to seq_size, which
            # is also the index the loss is told to ignore.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: logits followed by any extra encoder outputs.
        output = (start_logits, end_logits) + outputs[2:]
        if total_loss is None:
            return output
        return (total_loss,) + output

In [44]:
# Build the QA model; arguments are (model_name, data_args, config).
sds_qa = SDSNetForQuestionAnswering(
    model_args.model_name_or_path,
    data_args,
    config,
)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [45]:
# Forward pass with input_ids / attention_mask / token_type_ids only; no
# start/end positions are supplied, so the model computes no loss.
sds_qa_output = sds_qa(**model_input)

In [46]:
# Report the shape of the start and end logits.
start_shape = sds_qa_output['start_logits'].shape
end_shape = sds_qa_output['end_logits'].shape
print('Start Logit Shape : {}'.format(start_shape))
print('End Logit Shape : {}'.format(end_shape))

Start Logit Shape : torch.Size([1, 384])
End Logit Shape : torch.Size([1, 384])
