In [1]:
import torch
# !pip install tensorboard
# !pip install ipywidgets widgetsnbextension pandas-profiling
import os
import math
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report, fbeta_score
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import copy
import re
import matplotlib.pyplot as plt 


In [2]:
# Load the train and test tables from CSV files under data/.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
# Decision threshold read by val(): sigmoid outputs below `border` are mapped
# to class 0, everything else to class 1.
border = 0.02
print(border)

0.02


In [3]:
def get_texts(df):
    """Return the 'title_and_abstract' column of `df` as a plain Python list."""
    column = df['title_and_abstract']
    return column.values.tolist()

def get_labels(df):
    """Return the label values of `df` as an array.

    The 'judgement' column is used when it exists, so the result does not
    depend on column order. Frames without that column fall back to the
    original behaviour of taking the fourth column by position.
    """
    if 'judgement' in df.columns:
        return df['judgement'].values
    # Backward-compatible fallback: positional lookup of the fourth column.
    return df.iloc[:, 3].values

def clean_text(text):
    """Normalise one raw title/abstract cell into a cleaned string.

    Steps:
      1. Missing cells (None or any float, which covers NaN) become ''.
      2. Every run of whitespace is collapsed into a single space.
      3. The punctuation characters . , ! ? ( ) are padded with spaces.
      4. preprocess() is applied to the result.

    Returns the cleaned string.
    """
    # isinstance (rather than an exact type comparison) also catches
    # numpy.float64 NaN; None is handled so the .split() below cannot crash.
    if text is None or isinstance(text, float):
        text = ''

    # split() without arguments already strips and removes newlines and tabs,
    # so no further per-token stripping or replacing is needed.
    text = ' '.join(text.split())
    text = re.sub('([.,!?()])', r' \1 ', text)

    return preprocess(text)

def preprocess(sentence):
    """Swap double quotes for single quotes, then repair garbled characters."""
    return replace_garbled_text(replace_double_quotation(sentence))

def replace_garbled_text(sentence):
    """Replace known mis-decoded character sequences in `sentence`.

    Each key of the table below is a garbled sequence and its value is the
    replacement. After the table has been applied, any remaining hiragana,
    katakana (full- and half-width) and common kanji characters are removed.

    Returns the repaired string.
    """
    # NOTE(review): the key 'ﾃ' appears twice below ('a' and '×'); Python keeps
    # the later value. One of the two probably carried an invisible trailing
    # character that has been lost - confirm against the raw data.
    garbled_char_table = {
        'Â©': '©', '–': '-', '‐': '-',
        'ﾂ｣': '£', 'ﾂｩ': '©', 'ﾂｫ': '«', 'ﾂｮ': '®', 'ﾂｰ': '°', 'ﾂｱ': '±', 'ﾂｲ': '²', 'ﾂｳ': '³', 'ﾂｴ': '´', 'ﾂｵ': 'µ', 'ﾂｷ': '·', 'ﾂｸ': '¸', 'ﾂｹ': '¹', 'ﾂｼ': '¼', 'ﾂｽ': '½', 'ﾂｾ': '¾', 'ﾂｿ': '¿', 'ﾂ': '',
        'ﾃｷ': '÷', 'ﾃｸ': 'ø', 'ﾃ': 'a', 'ﾃ｡': 'a', 'ﾃ｢': 'a', 'ﾃ｣': 'a', 'ﾃ､': 'a', 'ﾃ･': 'a', 'ﾃｦ': 'ae', 'ﾃｧ': 'c', 'ﾃｨ': 'e', 'ﾃｩ': 'e', 'ﾃｪ': 'e', 'ﾃｫ': 'e',
        'ﾃｬ': 'i', 'ﾃｭ': 'i', 'ﾃｮ': 'i', 'ﾃｯ': 'i', 'ﾃｱ': 'n', 'ﾃｲ': 'o', 'ﾃｳ': 'o', 'ﾃｴ': 'o', 'ﾃｵ': 'o', 'ﾃｶ': 'o', 'ﾃｹ': 'u', 'ﾃｺ': 'u', 'ﾃｻ': 'u', 'ﾃｼ': 'u', 'ﾃｽ': 'y', 'ﾃｿ': 'y', 'ﾃ': '×', 
        'ﾎｱ': 'α', 'ﾎｲ': 'β', 'ﾎｳ': 'γ', 'ﾎｴ': 'δ', 'ﾎｵ': 'ε', 'ﾎｶ': 'ζ', 'ﾎｷ': 'η', 'ﾎｸ': 'θ', 'ﾎｹ': 'ι', 'ﾎｺ': 'κ', 'ﾎｻ': 'λ', 'ﾎｼ': 'μ', 'ﾎｽ': 'ν', 'ﾎｾ': 'ξ', 'ﾎｿ': 'ο', 'ﾎ': '',
        'ﾏ': ' ',
        '竕､': '≤', '竕･': '≥', '竕ｦ': '≦', '竕ｧ': '≧',
        '窶｢': '•', '窶ｦ': '…', '窶ｲ': '′', '窶ｳ': '″', '窶ｴ': '‴', '窶': ' ',
        '竅ｰ': '⁰', '竅ｴ': '⁴', '竅ｵ': '⁵', '竅ｶ': '⁶', '竅ｷ': '⁷', '竅ｸ': '⁸', '竅ｹ': '⁹', '竅ｺ': '⁺', '竅ｻ': '⁻', '竅ｼ': '⁼', 
        '竏･': '∥', '竏ｪ': '∪', '竏ｫ': '∫', '竏ｶ': '∶', '竏ｼ': '∼', '竏': '', 
        'ﾂ\uf8f0': ' '
    }
    # Apply longer sequences first. Otherwise a one-character key such as 'ﾂ'
    # or 'ﾃ' consumes the prefix of every longer key that is listed after it,
    # and those entries can never match. sorted() is stable, so keys of equal
    # length keep their table order.
    for garbled_char in sorted(garbled_char_table, key=len, reverse=True):
        sentence = sentence.replace(garbled_char, garbled_char_table[garbled_char])
    # Drop whatever Japanese characters are still left after the replacements.
    # (The former trailing str.translate() call was removed: translate looks up
    # integer code points, so a table with string keys never matched anything.)
    return re.sub('[ぁ-んァ-ンｦ-ﾟ一-龥]', '', sentence)

def replace_double_quotation(sentence):
    """Return `sentence` with every double quote turned into a single quote."""
    double_quote, single_quote = '"', "'"
    return sentence.replace(double_quote, single_quote)

def clean_data(train_df):
    """Fill missing abstracts from the title, then clean both text columns.

    The frame is modified in place and also returned.
    """
    abstract_missing = train_df['abstract'].isnull()
    train_df.loc[abstract_missing, 'abstract'] = train_df['title']
    # Same order as before: abstract first, then title.
    for column in ('abstract', 'title'):
        train_df[column] = train_df[column].apply(clean_text)
    return train_df

def concat(train_df, sep=''):
    """Add a 'title_and_abstract' column made of title + sep + abstract.

    Parameters
    ----------
    train_df : DataFrame with string columns 'title' and 'abstract'.
    sep : str, default ''
        Text placed between title and abstract. The default reproduces the
        original behaviour, which joins the two with no separator (so the last
        word of the title can fuse with the first word of the abstract). Pass
        sep=' ' to keep them as separate words.

    The frame is modified in place and also returned.
    """
    train_df['title_and_abstract'] = train_df['title'] + sep + train_df['abstract']
    return train_df

# Force the 'judgement' label of rows 2488 and 7708 to 0.
# NOTE(review): presumably a manual correction of mislabelled samples -
# confirm and record where these indices come from.
train_df.loc[[2488,7708],['judgement']] = 0
# Clean title/abstract in both frames, then build the combined
# 'title_and_abstract' column that get_texts() reads.
train_df = clean_data(train_df)
test_df= clean_data(test_df)
train_df = concat(train_df)
test_df = concat(test_df)
train_df.head()

Unnamed: 0,id,title,abstract,judgement,title_and_abstract
0,0,One-year age changes in MRI brain volumes in o...,Longitudinal studies indicate that declines in...,0,One-year age changes in MRI brain volumes in o...
1,1,Supportive CSF biomarker evidence to enhance t...,The present study was undertaken to validate t...,0,Supportive CSF biomarker evidence to enhance t...
2,2,Occurrence of basal ganglia germ cell tumors w...,Objective: To report a case series in which ba...,0,Occurrence of basal ganglia germ cell tumors w...
3,3,New developments in diagnosis and therapy of C...,The etiology and pathogenesis of idiopathic ch...,0,New developments in diagnosis and therapy of C...
4,4,Prolonged shedding of SARS-CoV-2 in an elderly...,Prolonged shedding of SARS-CoV-2 in an elderly...,0,Prolonged shedding of SARS-CoV-2 in an elderly...


In [4]:
class Config:
    """Container for every tunable setting used by the notebook.

    Creating an instance loads the tokenizer for MODEL_PATH via
    AutoTokenizer.from_pretrained.
    """

    def __init__(self):
        super().__init__()

        # --- reproducibility ---
        self.SEED = 42

        # --- model / tokenizer ---
        # NOTE(review): the tokenizer comes from this SciBERT path, whereas
        # run() builds its models from other checkpoints - confirm that the
        # vocabularies are meant to differ.
        self.MODEL_PATH = 'allenai/scibert_scivocab_uncased'
        self.NUM_LABELS = 1
        self.TOKENIZER = AutoTokenizer.from_pretrained(self.MODEL_PATH)

        # --- data ---
        self.MAX_LENGTH = 512
        self.BATCH_SIZE = 32
        self.N_SPLIT = 5

        # --- training / evaluation ---
        if torch.cuda.is_available():
            self.DEVICE = torch.device('cuda')
        else:
            self.DEVICE = torch.device('cpu')
        self.FULL_FINETUNING = True
        self.LR = 2e-5
        self.OPTIMIZER = 'AdamW'
        self.CRITERION = 'BCEWithLogitsLoss'
        self.SAVE_BEST_ONLY = True
        self.N_VALIDATE_DUR_TRAIN = 1
        self.EPOCHS = 2


# Single shared settings object read by the dataset, model and loops below.
config = Config()

In [5]:
def seed_init(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int, default 42
        Value used for every generator and for PYTHONHASHSEED.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may pick different kernels from run to run, so it is
    # disabled alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False
    
# Seed all RNGs with the configured value; `seed` is also read by cross_val()
# as the random_state of the fold splitter.
seed = config.SEED
seed_init(seed)

In [6]:
class TransformerDataset(Dataset):
    """Dataset that tokenizes the 'title_and_abstract' text of selected rows.

    Parameters
    ----------
    df : DataFrame holding a 'title_and_abstract' column (and labels, unless
        `set_type` is 'test').
    indices : positional row indices (used with `df.iloc`) to keep.
    set_type : pass 'test' for unlabelled data; any other value makes the
        dataset read labels through get_labels().

    Each item is a dict with key 'titles_and_abstracts' (itself a dict of
    'input_ids' and 'attention_mask' long tensors padded/truncated to
    config.MAX_LENGTH) and, for non-test data, 'labels' (float tensor with one
    element).
    """

    def __init__(self, df, indices, set_type=None):
        super(TransformerDataset, self).__init__()

        df = df.iloc[indices]
        self.titles_and_abstracts = get_texts(df)
        self.set_type = set_type
        if self.set_type != 'test':
            self.labels = get_labels(df)

        self.max_length = config.MAX_LENGTH
        self.tokenizer = config.TOKENIZER

    def __len__(self):
        return len(self.titles_and_abstracts)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            self.titles_and_abstracts[index],
            max_length=self.max_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` flag.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop only
        # that axis.
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        # Built once for both branches. The former test branch referenced an
        # undefined variable name and raised NameError.
        item = {
            'titles_and_abstracts': {
                'input_ids': input_ids.long(),
                'attention_mask': attention_mask.long(),
            }
        }
        if self.set_type != 'test':
            item['labels'] = torch.Tensor([self.labels[index]]).float()
        return item

In [7]:
class PubMedBert(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Parameters
    ----------
    path : model identifier or directory passed to AutoModel.from_pretrained.
    """

    def __init__(self,path):
        super(PubMedBert, self).__init__()
        self.model = AutoModel.from_pretrained(path)
        self.dropout = nn.Dropout(0.25)
        # No longer used by forward(); kept so the module's attribute set is
        # unchanged. It has no parameters, so state dicts are unaffected.
        self.avgpool = nn.AvgPool1d(2, 2)
        # Read the encoder width from its config instead of hard-coding 768,
        # so encoders of other sizes also work.
        self.linear = nn.Linear(self.model.config.hidden_size, config.NUM_LABELS)

    def forward(self, input_ids_titles_and_abstracts, attention_mask_titles_and_abstracts=None):
        """Return logits computed from the encoder's pooled output.

        The pooled output gets an extra axis at position 1 before the head is
        applied, so the result has one more dimension than the pooled output;
        callers reshape it with `.view(-1, 1)`.
        """
        output = self.model(input_ids=input_ids_titles_and_abstracts, attention_mask=attention_mask_titles_and_abstracts)
        features = output.pooler_output.unsqueeze(1)
        # The average-pooled features were previously computed here and then
        # discarded; that dead computation has been removed.
        x = self.dropout(features)
        return self.linear(x)

In [22]:
def val(model,model1,model2, val_dataloader, criterion, epoch, model_number=None):
    """Evaluate three models on `val_dataloader` and return the first one's score.

    Parameters
    ----------
    model, model1, model2 : models called as `m(input_ids, attention_mask)`.
    val_dataloader : yields batches shaped like TransformerDataset items
        (with 'labels').
    criterion : loss applied to the raw logits of `model` and the labels.
    epoch : accepted for interface compatibility; not used.
    model_number : accepted for interface compatibility; not used. It now has
        a default so existing six-argument calls work.

    Predictions are obtained by thresholding sigmoid outputs with the
    module-level `border`. The average loss of `model` and the F-beta score
    (beta=7.0) of every model are printed.

    Returns
    -------
    float : F-beta score of `model`.
    """
    models = (model, model1, model2)
    # All three models must be in eval mode, otherwise dropout stays active
    # for the ones that were not switched.
    for net in models:
        net.eval()

    val_loss = 0.0
    true = []
    preds = [[] for _ in models]

    for batch in tqdm(val_dataloader, total=len(val_dataloader)):
        b_input_ids = batch['titles_and_abstracts']['input_ids'].to(device)
        b_attention_mask = batch['titles_and_abstracts']['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        with torch.no_grad():
            # Each model's logits are reshaped from its own output. model1's
            # logits were previously overwritten with model's.
            batch_logits = [net(b_input_ids, b_attention_mask).view(-1, 1) for net in models]
            val_loss += criterion(batch_logits[0], b_labels).item()

        for collected, logits in zip(preds, batch_logits):
            probs = torch.sigmoid(logits).to('cpu').numpy().ravel()
            collected.extend(np.where(probs < border, 0, 1).tolist())

        # Flatten to plain ints so the metric receives 1-D label lists.
        true.extend(b_labels.to('cpu').numpy().ravel().astype(int).tolist())

    # max(..., 1) avoids a division by zero on an empty dataloader.
    print('Val loss:', val_loss / max(len(val_dataloader), 1))

    scores = [fbeta_score(true, model_preds, beta=7.0) for model_preds in preds]
    val_fbeta_score = scores[0]
    print('Val fbeta score:', val_fbeta_score)
    print('Val fbeta score (model1):', scores[1])
    print('Val fbeta score (model2):', scores[2])
    return val_fbeta_score

In [27]:
def _load_checkpointed_model(pretrained_path, weights_path):
    """Build a PubMedBert on `pretrained_path`, load `weights_path` into it,
    wrap it in DataParallel and move it to the module-level `device`."""
    net = PubMedBert(pretrained_path)
    # map_location lets checkpoints saved on another device load on this one.
    net.load_state_dict(torch.load(weights_path, map_location=device))
    net = nn.DataParallel(net)
    net.to(device)
    return net


def run( val_dataloader, writer):
    """Load the three fine-tuned models and evaluate them on one fold.

    Parameters
    ----------
    val_dataloader : DataLoader over the validation split.
    writer : object with `add_scalar(tag, value, step)`; receives
        'val_fbeta_score' once per epoch.

    Returns
    -------
    (best_model, max_val_fbeta_score) : the primary model (a copy taken at its
        best score) and the highest score returned by val() over
        config.EPOCHS passes.
    """
    torch.cuda.empty_cache()

    # NOTE(review): the second checkpoint file name mentions "pubmedbert" but
    # is loaded into a Bio_ClinicalBERT encoder - confirm the pairing.
    model = _load_checkpointed_model("dmis-lab/biobert-base-cased-v1.2", "biobert_input_best_model.pt")
    model1 = _load_checkpointed_model("emilyalsentzer/Bio_ClinicalBERT", "pubmedbert_input_best_model_biomed.pt")
    model2 = _load_checkpointed_model("bert-base-uncased", "bert.pt")

    criterion = nn.BCEWithLogitsLoss()

    # Defined up front so the return value exists even when EPOCHS is 0.
    best_model = model
    max_val_fbeta_score = float('-inf')
    for epoch in range(config.EPOCHS):
        # model_number is passed explicitly (as None) so the call matches
        # val()'s seven-parameter signature.
        val_fbeta_score = val(model, model1, model2, val_dataloader, criterion, epoch, None)
        writer.add_scalar('val_fbeta_score', val_fbeta_score, epoch+1)

        # Copy the model only when the score improves, not on every epoch.
        if val_fbeta_score > max_val_fbeta_score:
            max_val_fbeta_score = val_fbeta_score
            best_model = copy.deepcopy(model)

    return best_model, max_val_fbeta_score

In [28]:
def cross_val():
    """Evaluate on each stratified fold and return the best fold's result.

    The data in ``train_df`` is split into ``config.N_SPLIT`` stratified folds
    on the ``'judgement'`` column (shuffled, seeded with the module-level
    ``seed``). For every fold a TensorBoard writer is opened under
    ``logs/fold_scibert<n>`` and ``run()`` is called with that fold's
    validation DataLoader. Only the validation split is used; no training
    DataLoader is built here.

    Returns:
        tuple: ``(best_model, max_val_fbeta_score)`` -- the model and score
        returned by ``run()`` for the fold with the highest score. If the
        splitter yields no folds the result is ``(None, float('-inf'))``.
    """
    fold_splitter = StratifiedKFold(n_splits=config.N_SPLIT, shuffle=True, random_state=seed)
    best_model = None
    max_val_fbeta_score = float('-inf')

    for n, (_, val_indices) in enumerate(fold_splitter.split(train_df, train_df['judgement'])):
        print(f'========= fold: {n} training =========')

        log_dir = 'logs/fold_scibert' + str(n)
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(log_dir, exist_ok=True)

        val_data = TransformerDataset(train_df, val_indices)
        val_dataloader = DataLoader(val_data, batch_size=config.BATCH_SIZE)

        writer = SummaryWriter(log_dir=log_dir)
        try:
            fold_best_model, fold_best_val_fbeta_score = run(val_dataloader, writer)
        finally:
            # Always flush/close the event file, even when run() raises.
            writer.close()

        # Keep the best fold instead of silently overwriting with the last one.
        # The `is None` guard makes sure the first fold is always recorded
        # (e.g. if its score is NaN and every comparison is False).
        if best_model is None or fold_best_val_fbeta_score > max_val_fbeta_score:
            best_model = fold_best_model
            max_val_fbeta_score = fold_best_val_fbeta_score

    return best_model, max_val_fbeta_score

In [29]:
# Module-level device read from the shared config; run() moves the models to it
# via `.to(device)`. The bare expression below displays the selected device.
device = config.DEVICE
device

device(type='cuda')

In [30]:
# Run the per-fold evaluation and keep the model / F-beta score it returns.
# NOTE(review): the output recorded below ends in a RuntimeError raised by
# load_state_dict inside run(): every checkpoint key carries a "module." prefix
# (presumably saved from an nn.DataParallel-wrapped model) while the target
# model is loaded before it is wrapped. Strip the prefix from the state dict or
# wrap the model in nn.DataParallel before loading -- confirm against the
# checkpoint files.
best_model, best_val_fbeta_score = cross_val()



Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing Bert

RuntimeError: Error(s) in loading state_dict for PubMedBert:
	Missing key(s) in state_dict: "model.embeddings.position_ids", "model.embeddings.word_embeddings.weight", "model.embeddings.position_embeddings.weight", "model.embeddings.token_type_embeddings.weight", "model.embeddings.LayerNorm.weight", "model.embeddings.LayerNorm.bias", "model.encoder.layer.0.attention.self.query.weight", "model.encoder.layer.0.attention.self.query.bias", "model.encoder.layer.0.attention.self.key.weight", "model.encoder.layer.0.attention.self.key.bias", "model.encoder.layer.0.attention.self.value.weight", "model.encoder.layer.0.attention.self.value.bias", "model.encoder.layer.0.attention.output.dense.weight", "model.encoder.layer.0.attention.output.dense.bias", "model.encoder.layer.0.attention.output.LayerNorm.weight", "model.encoder.layer.0.attention.output.LayerNorm.bias", "model.encoder.layer.0.intermediate.dense.weight", "model.encoder.layer.0.intermediate.dense.bias", "model.encoder.layer.0.output.dense.weight", "model.encoder.layer.0.output.dense.bias", "model.encoder.layer.0.output.LayerNorm.weight", "model.encoder.layer.0.output.LayerNorm.bias", "model.encoder.layer.1.attention.self.query.weight", "model.encoder.layer.1.attention.self.query.bias", "model.encoder.layer.1.attention.self.key.weight", "model.encoder.layer.1.attention.self.key.bias", "model.encoder.layer.1.attention.self.value.weight", "model.encoder.layer.1.attention.self.value.bias", "model.encoder.layer.1.attention.output.dense.weight", "model.encoder.layer.1.attention.output.dense.bias", "model.encoder.layer.1.attention.output.LayerNorm.weight", "model.encoder.layer.1.attention.output.LayerNorm.bias", "model.encoder.layer.1.intermediate.dense.weight", "model.encoder.layer.1.intermediate.dense.bias", "model.encoder.layer.1.output.dense.weight", "model.encoder.layer.1.output.dense.bias", "model.encoder.layer.1.output.LayerNorm.weight", "model.encoder.layer.1.output.LayerNorm.bias", "model.encoder.layer.2.attention.self.query.weight", 
"model.encoder.layer.2.attention.self.query.bias", "model.encoder.layer.2.attention.self.key.weight", "model.encoder.layer.2.attention.self.key.bias", "model.encoder.layer.2.attention.self.value.weight", "model.encoder.layer.2.attention.self.value.bias", "model.encoder.layer.2.attention.output.dense.weight", "model.encoder.layer.2.attention.output.dense.bias", "model.encoder.layer.2.attention.output.LayerNorm.weight", "model.encoder.layer.2.attention.output.LayerNorm.bias", "model.encoder.layer.2.intermediate.dense.weight", "model.encoder.layer.2.intermediate.dense.bias", "model.encoder.layer.2.output.dense.weight", "model.encoder.layer.2.output.dense.bias", "model.encoder.layer.2.output.LayerNorm.weight", "model.encoder.layer.2.output.LayerNorm.bias", "model.encoder.layer.3.attention.self.query.weight", "model.encoder.layer.3.attention.self.query.bias", "model.encoder.layer.3.attention.self.key.weight", "model.encoder.layer.3.attention.self.key.bias", "model.encoder.layer.3.attention.self.value.weight", "model.encoder.layer.3.attention.self.value.bias", "model.encoder.layer.3.attention.output.dense.weight", "model.encoder.layer.3.attention.output.dense.bias", "model.encoder.layer.3.attention.output.LayerNorm.weight", "model.encoder.layer.3.attention.output.LayerNorm.bias", "model.encoder.layer.3.intermediate.dense.weight", "model.encoder.layer.3.intermediate.dense.bias", "model.encoder.layer.3.output.dense.weight", "model.encoder.layer.3.output.dense.bias", "model.encoder.layer.3.output.LayerNorm.weight", "model.encoder.layer.3.output.LayerNorm.bias", "model.encoder.layer.4.attention.self.query.weight", "model.encoder.layer.4.attention.self.query.bias", "model.encoder.layer.4.attention.self.key.weight", "model.encoder.layer.4.attention.self.key.bias", "model.encoder.layer.4.attention.self.value.weight", "model.encoder.layer.4.attention.self.value.bias", "model.encoder.layer.4.attention.output.dense.weight", "model.encoder.layer.4.attention.output.dense.bias", 
"model.encoder.layer.4.attention.output.LayerNorm.weight", "model.encoder.layer.4.attention.output.LayerNorm.bias", "model.encoder.layer.4.intermediate.dense.weight", "model.encoder.layer.4.intermediate.dense.bias", "model.encoder.layer.4.output.dense.weight", "model.encoder.layer.4.output.dense.bias", "model.encoder.layer.4.output.LayerNorm.weight", "model.encoder.layer.4.output.LayerNorm.bias", "model.encoder.layer.5.attention.self.query.weight", "model.encoder.layer.5.attention.self.query.bias", "model.encoder.layer.5.attention.self.key.weight", "model.encoder.layer.5.attention.self.key.bias", "model.encoder.layer.5.attention.self.value.weight", "model.encoder.layer.5.attention.self.value.bias", "model.encoder.layer.5.attention.output.dense.weight", "model.encoder.layer.5.attention.output.dense.bias", "model.encoder.layer.5.attention.output.LayerNorm.weight", "model.encoder.layer.5.attention.output.LayerNorm.bias", "model.encoder.layer.5.intermediate.dense.weight", "model.encoder.layer.5.intermediate.dense.bias", "model.encoder.layer.5.output.dense.weight", "model.encoder.layer.5.output.dense.bias", "model.encoder.layer.5.output.LayerNorm.weight", "model.encoder.layer.5.output.LayerNorm.bias", "model.encoder.layer.6.attention.self.query.weight", "model.encoder.layer.6.attention.self.query.bias", "model.encoder.layer.6.attention.self.key.weight", "model.encoder.layer.6.attention.self.key.bias", "model.encoder.layer.6.attention.self.value.weight", "model.encoder.layer.6.attention.self.value.bias", "model.encoder.layer.6.attention.output.dense.weight", "model.encoder.layer.6.attention.output.dense.bias", "model.encoder.layer.6.attention.output.LayerNorm.weight", "model.encoder.layer.6.attention.output.LayerNorm.bias", "model.encoder.layer.6.intermediate.dense.weight", "model.encoder.layer.6.intermediate.dense.bias", "model.encoder.layer.6.output.dense.weight", "model.encoder.layer.6.output.dense.bias", "model.encoder.layer.6.output.LayerNorm.weight", 
"model.encoder.layer.6.output.LayerNorm.bias", "model.encoder.layer.7.attention.self.query.weight", "model.encoder.layer.7.attention.self.query.bias", "model.encoder.layer.7.attention.self.key.weight", "model.encoder.layer.7.attention.self.key.bias", "model.encoder.layer.7.attention.self.value.weight", "model.encoder.layer.7.attention.self.value.bias", "model.encoder.layer.7.attention.output.dense.weight", "model.encoder.layer.7.attention.output.dense.bias", "model.encoder.layer.7.attention.output.LayerNorm.weight", "model.encoder.layer.7.attention.output.LayerNorm.bias", "model.encoder.layer.7.intermediate.dense.weight", "model.encoder.layer.7.intermediate.dense.bias", "model.encoder.layer.7.output.dense.weight", "model.encoder.layer.7.output.dense.bias", "model.encoder.layer.7.output.LayerNorm.weight", "model.encoder.layer.7.output.LayerNorm.bias", "model.encoder.layer.8.attention.self.query.weight", "model.encoder.layer.8.attention.self.query.bias", "model.encoder.layer.8.attention.self.key.weight", "model.encoder.layer.8.attention.self.key.bias", "model.encoder.layer.8.attention.self.value.weight", "model.encoder.layer.8.attention.self.value.bias", "model.encoder.layer.8.attention.output.dense.weight", "model.encoder.layer.8.attention.output.dense.bias", "model.encoder.layer.8.attention.output.LayerNorm.weight", "model.encoder.layer.8.attention.output.LayerNorm.bias", "model.encoder.layer.8.intermediate.dense.weight", "model.encoder.layer.8.intermediate.dense.bias", "model.encoder.layer.8.output.dense.weight", "model.encoder.layer.8.output.dense.bias", "model.encoder.layer.8.output.LayerNorm.weight", "model.encoder.layer.8.output.LayerNorm.bias", "model.encoder.layer.9.attention.self.query.weight", "model.encoder.layer.9.attention.self.query.bias", "model.encoder.layer.9.attention.self.key.weight", "model.encoder.layer.9.attention.self.key.bias", "model.encoder.layer.9.attention.self.value.weight", "model.encoder.layer.9.attention.self.value.bias", 
"model.encoder.layer.9.attention.output.dense.weight", "model.encoder.layer.9.attention.output.dense.bias", "model.encoder.layer.9.attention.output.LayerNorm.weight", "model.encoder.layer.9.attention.output.LayerNorm.bias", "model.encoder.layer.9.intermediate.dense.weight", "model.encoder.layer.9.intermediate.dense.bias", "model.encoder.layer.9.output.dense.weight", "model.encoder.layer.9.output.dense.bias", "model.encoder.layer.9.output.LayerNorm.weight", "model.encoder.layer.9.output.LayerNorm.bias", "model.encoder.layer.10.attention.self.query.weight", "model.encoder.layer.10.attention.self.query.bias", "model.encoder.layer.10.attention.self.key.weight", "model.encoder.layer.10.attention.self.key.bias", "model.encoder.layer.10.attention.self.value.weight", "model.encoder.layer.10.attention.self.value.bias", "model.encoder.layer.10.attention.output.dense.weight", "model.encoder.layer.10.attention.output.dense.bias", "model.encoder.layer.10.attention.output.LayerNorm.weight", "model.encoder.layer.10.attention.output.LayerNorm.bias", "model.encoder.layer.10.intermediate.dense.weight", "model.encoder.layer.10.intermediate.dense.bias", "model.encoder.layer.10.output.dense.weight", "model.encoder.layer.10.output.dense.bias", "model.encoder.layer.10.output.LayerNorm.weight", "model.encoder.layer.10.output.LayerNorm.bias", "model.encoder.layer.11.attention.self.query.weight", "model.encoder.layer.11.attention.self.query.bias", "model.encoder.layer.11.attention.self.key.weight", "model.encoder.layer.11.attention.self.key.bias", "model.encoder.layer.11.attention.self.value.weight", "model.encoder.layer.11.attention.self.value.bias", "model.encoder.layer.11.attention.output.dense.weight", "model.encoder.layer.11.attention.output.dense.bias", "model.encoder.layer.11.attention.output.LayerNorm.weight", "model.encoder.layer.11.attention.output.LayerNorm.bias", "model.encoder.layer.11.intermediate.dense.weight", "model.encoder.layer.11.intermediate.dense.bias", 
"model.encoder.layer.11.output.dense.weight", "model.encoder.layer.11.output.dense.bias", "model.encoder.layer.11.output.LayerNorm.weight", "model.encoder.layer.11.output.LayerNorm.bias", "model.pooler.dense.weight", "model.pooler.dense.bias", "linear.weight", "linear.bias". 
	Unexpected key(s) in state_dict: "module.model.embeddings.position_ids", "module.model.embeddings.word_embeddings.weight", "module.model.embeddings.position_embeddings.weight", "module.model.embeddings.token_type_embeddings.weight", "module.model.embeddings.LayerNorm.weight", "module.model.embeddings.LayerNorm.bias", "module.model.encoder.layer.0.attention.self.query.weight", "module.model.encoder.layer.0.attention.self.query.bias", "module.model.encoder.layer.0.attention.self.key.weight", "module.model.encoder.layer.0.attention.self.key.bias", "module.model.encoder.layer.0.attention.self.value.weight", "module.model.encoder.layer.0.attention.self.value.bias", "module.model.encoder.layer.0.attention.output.dense.weight", "module.model.encoder.layer.0.attention.output.dense.bias", "module.model.encoder.layer.0.attention.output.LayerNorm.weight", "module.model.encoder.layer.0.attention.output.LayerNorm.bias", "module.model.encoder.layer.0.intermediate.dense.weight", "module.model.encoder.layer.0.intermediate.dense.bias", "module.model.encoder.layer.0.output.dense.weight", "module.model.encoder.layer.0.output.dense.bias", "module.model.encoder.layer.0.output.LayerNorm.weight", "module.model.encoder.layer.0.output.LayerNorm.bias", "module.model.encoder.layer.1.attention.self.query.weight", "module.model.encoder.layer.1.attention.self.query.bias", "module.model.encoder.layer.1.attention.self.key.weight", "module.model.encoder.layer.1.attention.self.key.bias", "module.model.encoder.layer.1.attention.self.value.weight", "module.model.encoder.layer.1.attention.self.value.bias", "module.model.encoder.layer.1.attention.output.dense.weight", "module.model.encoder.layer.1.attention.output.dense.bias", "module.model.encoder.layer.1.attention.output.LayerNorm.weight", "module.model.encoder.layer.1.attention.output.LayerNorm.bias", "module.model.encoder.layer.1.intermediate.dense.weight", "module.model.encoder.layer.1.intermediate.dense.bias", 
"module.model.encoder.layer.1.output.dense.weight", "module.model.encoder.layer.1.output.dense.bias", "module.model.encoder.layer.1.output.LayerNorm.weight", "module.model.encoder.layer.1.output.LayerNorm.bias", "module.model.encoder.layer.2.attention.self.query.weight", "module.model.encoder.layer.2.attention.self.query.bias", "module.model.encoder.layer.2.attention.self.key.weight", "module.model.encoder.layer.2.attention.self.key.bias", "module.model.encoder.layer.2.attention.self.value.weight", "module.model.encoder.layer.2.attention.self.value.bias", "module.model.encoder.layer.2.attention.output.dense.weight", "module.model.encoder.layer.2.attention.output.dense.bias", "module.model.encoder.layer.2.attention.output.LayerNorm.weight", "module.model.encoder.layer.2.attention.output.LayerNorm.bias", "module.model.encoder.layer.2.intermediate.dense.weight", "module.model.encoder.layer.2.intermediate.dense.bias", "module.model.encoder.layer.2.output.dense.weight", "module.model.encoder.layer.2.output.dense.bias", "module.model.encoder.layer.2.output.LayerNorm.weight", "module.model.encoder.layer.2.output.LayerNorm.bias", "module.model.encoder.layer.3.attention.self.query.weight", "module.model.encoder.layer.3.attention.self.query.bias", "module.model.encoder.layer.3.attention.self.key.weight", "module.model.encoder.layer.3.attention.self.key.bias", "module.model.encoder.layer.3.attention.self.value.weight", "module.model.encoder.layer.3.attention.self.value.bias", "module.model.encoder.layer.3.attention.output.dense.weight", "module.model.encoder.layer.3.attention.output.dense.bias", "module.model.encoder.layer.3.attention.output.LayerNorm.weight", "module.model.encoder.layer.3.attention.output.LayerNorm.bias", "module.model.encoder.layer.3.intermediate.dense.weight", "module.model.encoder.layer.3.intermediate.dense.bias", "module.model.encoder.layer.3.output.dense.weight", "module.model.encoder.layer.3.output.dense.bias", 
"module.model.encoder.layer.3.output.LayerNorm.weight", "module.model.encoder.layer.3.output.LayerNorm.bias", "module.model.encoder.layer.4.attention.self.query.weight", "module.model.encoder.layer.4.attention.self.query.bias", "module.model.encoder.layer.4.attention.self.key.weight", "module.model.encoder.layer.4.attention.self.key.bias", "module.model.encoder.layer.4.attention.self.value.weight", "module.model.encoder.layer.4.attention.self.value.bias", "module.model.encoder.layer.4.attention.output.dense.weight", "module.model.encoder.layer.4.attention.output.dense.bias", "module.model.encoder.layer.4.attention.output.LayerNorm.weight", "module.model.encoder.layer.4.attention.output.LayerNorm.bias", "module.model.encoder.layer.4.intermediate.dense.weight", "module.model.encoder.layer.4.intermediate.dense.bias", "module.model.encoder.layer.4.output.dense.weight", "module.model.encoder.layer.4.output.dense.bias", "module.model.encoder.layer.4.output.LayerNorm.weight", "module.model.encoder.layer.4.output.LayerNorm.bias", "module.model.encoder.layer.5.attention.self.query.weight", "module.model.encoder.layer.5.attention.self.query.bias", "module.model.encoder.layer.5.attention.self.key.weight", "module.model.encoder.layer.5.attention.self.key.bias", "module.model.encoder.layer.5.attention.self.value.weight", "module.model.encoder.layer.5.attention.self.value.bias", "module.model.encoder.layer.5.attention.output.dense.weight", "module.model.encoder.layer.5.attention.output.dense.bias", "module.model.encoder.layer.5.attention.output.LayerNorm.weight", "module.model.encoder.layer.5.attention.output.LayerNorm.bias", "module.model.encoder.layer.5.intermediate.dense.weight", "module.model.encoder.layer.5.intermediate.dense.bias", "module.model.encoder.layer.5.output.dense.weight", "module.model.encoder.layer.5.output.dense.bias", "module.model.encoder.layer.5.output.LayerNorm.weight", "module.model.encoder.layer.5.output.LayerNorm.bias", 
"module.model.encoder.layer.6.attention.self.query.weight", "module.model.encoder.layer.6.attention.self.query.bias", "module.model.encoder.layer.6.attention.self.key.weight", "module.model.encoder.layer.6.attention.self.key.bias", "module.model.encoder.layer.6.attention.self.value.weight", "module.model.encoder.layer.6.attention.self.value.bias", "module.model.encoder.layer.6.attention.output.dense.weight", "module.model.encoder.layer.6.attention.output.dense.bias", "module.model.encoder.layer.6.attention.output.LayerNorm.weight", "module.model.encoder.layer.6.attention.output.LayerNorm.bias", "module.model.encoder.layer.6.intermediate.dense.weight", "module.model.encoder.layer.6.intermediate.dense.bias", "module.model.encoder.layer.6.output.dense.weight", "module.model.encoder.layer.6.output.dense.bias", "module.model.encoder.layer.6.output.LayerNorm.weight", "module.model.encoder.layer.6.output.LayerNorm.bias", "module.model.encoder.layer.7.attention.self.query.weight", "module.model.encoder.layer.7.attention.self.query.bias", "module.model.encoder.layer.7.attention.self.key.weight", "module.model.encoder.layer.7.attention.self.key.bias", "module.model.encoder.layer.7.attention.self.value.weight", "module.model.encoder.layer.7.attention.self.value.bias", "module.model.encoder.layer.7.attention.output.dense.weight", "module.model.encoder.layer.7.attention.output.dense.bias", "module.model.encoder.layer.7.attention.output.LayerNorm.weight", "module.model.encoder.layer.7.attention.output.LayerNorm.bias", "module.model.encoder.layer.7.intermediate.dense.weight", "module.model.encoder.layer.7.intermediate.dense.bias", "module.model.encoder.layer.7.output.dense.weight", "module.model.encoder.layer.7.output.dense.bias", "module.model.encoder.layer.7.output.LayerNorm.weight", "module.model.encoder.layer.7.output.LayerNorm.bias", "module.model.encoder.layer.8.attention.self.query.weight", "module.model.encoder.layer.8.attention.self.query.bias", 
"module.model.encoder.layer.8.attention.self.key.weight", "module.model.encoder.layer.8.attention.self.key.bias", "module.model.encoder.layer.8.attention.self.value.weight", "module.model.encoder.layer.8.attention.self.value.bias", "module.model.encoder.layer.8.attention.output.dense.weight", "module.model.encoder.layer.8.attention.output.dense.bias", "module.model.encoder.layer.8.attention.output.LayerNorm.weight", "module.model.encoder.layer.8.attention.output.LayerNorm.bias", "module.model.encoder.layer.8.intermediate.dense.weight", "module.model.encoder.layer.8.intermediate.dense.bias", "module.model.encoder.layer.8.output.dense.weight", "module.model.encoder.layer.8.output.dense.bias", "module.model.encoder.layer.8.output.LayerNorm.weight", "module.model.encoder.layer.8.output.LayerNorm.bias", "module.model.encoder.layer.9.attention.self.query.weight", "module.model.encoder.layer.9.attention.self.query.bias", "module.model.encoder.layer.9.attention.self.key.weight", "module.model.encoder.layer.9.attention.self.key.bias", "module.model.encoder.layer.9.attention.self.value.weight", "module.model.encoder.layer.9.attention.self.value.bias", "module.model.encoder.layer.9.attention.output.dense.weight", "module.model.encoder.layer.9.attention.output.dense.bias", "module.model.encoder.layer.9.attention.output.LayerNorm.weight", "module.model.encoder.layer.9.attention.output.LayerNorm.bias", "module.model.encoder.layer.9.intermediate.dense.weight", "module.model.encoder.layer.9.intermediate.dense.bias", "module.model.encoder.layer.9.output.dense.weight", "module.model.encoder.layer.9.output.dense.bias", "module.model.encoder.layer.9.output.LayerNorm.weight", "module.model.encoder.layer.9.output.LayerNorm.bias", "module.model.encoder.layer.10.attention.self.query.weight", "module.model.encoder.layer.10.attention.self.query.bias", "module.model.encoder.layer.10.attention.self.key.weight", "module.model.encoder.layer.10.attention.self.key.bias", 
"module.model.encoder.layer.10.attention.self.value.weight", "module.model.encoder.layer.10.attention.self.value.bias", "module.model.encoder.layer.10.attention.output.dense.weight", "module.model.encoder.layer.10.attention.output.dense.bias", "module.model.encoder.layer.10.attention.output.LayerNorm.weight", "module.model.encoder.layer.10.attention.output.LayerNorm.bias", "module.model.encoder.layer.10.intermediate.dense.weight", "module.model.encoder.layer.10.intermediate.dense.bias", "module.model.encoder.layer.10.output.dense.weight", "module.model.encoder.layer.10.output.dense.bias", "module.model.encoder.layer.10.output.LayerNorm.weight", "module.model.encoder.layer.10.output.LayerNorm.bias", "module.model.encoder.layer.11.attention.self.query.weight", "module.model.encoder.layer.11.attention.self.query.bias", "module.model.encoder.layer.11.attention.self.key.weight", "module.model.encoder.layer.11.attention.self.key.bias", "module.model.encoder.layer.11.attention.self.value.weight", "module.model.encoder.layer.11.attention.self.value.bias", "module.model.encoder.layer.11.attention.output.dense.weight", "module.model.encoder.layer.11.attention.output.dense.bias", "module.model.encoder.layer.11.attention.output.LayerNorm.weight", "module.model.encoder.layer.11.attention.output.LayerNorm.bias", "module.model.encoder.layer.11.intermediate.dense.weight", "module.model.encoder.layer.11.intermediate.dense.bias", "module.model.encoder.layer.11.output.dense.weight", "module.model.encoder.layer.11.output.dense.bias", "module.model.encoder.layer.11.output.LayerNorm.weight", "module.model.encoder.layer.11.output.LayerNorm.bias", "module.model.pooler.dense.weight", "module.model.pooler.dense.bias", "module.linear.weight", "module.linear.bias". 