In [1]:
import numpy as np
from transformers import AutoTokenizer, AutoModel, DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import get_scheduler
from eval_metrics import print_metrics_binary
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

read data

In [2]:
t = pd.read_csv("MP_IN_adm.csv")
id_list = np.array(t['id'].astype(int))  # id, int
text_list = t['text'].astype(str).tolist()   # text, str
label_list = torch.tensor(t['hospital_expire_flag'].astype(int))   # hospital_expipre_flag, int


datasize = 2000
id_list = id_list[:datasize]
text_list = text_list[:datasize]
label_list = label_list[:datasize]

Model, followed by a Classifier for binaryClassification

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# classifier = nn.Sequential(
#     nn.Linear(in_features=768, out_features=2, bias=True),
#     nn.Softmax(dim=1)
# )
# model = nn.Sequential(model, classifier)
Classifier = nn.Sequential(
    nn.Linear(in_features=768, out_features=2, bias=True),
    nn.Softmax(dim=1)
)
model

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [4]:
LR = 1e-5
Epoch = 2
batch_size = 64
criterion = torch.nn.BCELoss(weight=None, size_average=None, reduce=None, reduction='mean')
optimizer = torch.optim.AdamW([{'params': model.parameters()}, {'params': Classifier.parameters()}], lr=LR)

In [5]:
class TextDataset(Dataset):
    def __init__(self, id, tokenized_texts, flag):
        self.id = id
        self.tokenized_texts = tokenized_texts
        self.flag = flag

    def __len__(self):
        return len(self.flag)

    def __getitem__(self, index):
        # inputs for model from "transformers" should be model(input_ids, attention_mask)

        # return "id" of patient, "input_ids" of tokenized texts, "attention_mask" of tokenized texts, "y_true" of patient
        return self.id[index], self.tokenized_texts[index], self.flag[index]

In [6]:
tokenized_texts = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt")
train_dataset = TextDataset(id_list, tokenized_texts["input_ids"], label_list)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

id, input_ids, flag = next(iter(train_loader))
print(id.shape)
print(input_ids.shape)
print(flag.shape)

torch.Size([64])
torch.Size([64, 512])
torch.Size([64])


In [7]:
tokenized_texts = tokenizer(text_list[:100], padding=True, truncation=True, return_tensors="pt")
sequence = tokenizer(["helodo"*100]*100, padding=True, truncation=True, return_tensors="pt")
print(tokenized_texts["input_ids"])
print(sequence["input_ids"])


tensor([[  101,  2708, 12087,  ..., 19842,  2050,   102],
        [  101,  2708, 12087,  ...,  2005, 10882,   102],
        [  101,  2708, 12087,  ...,  9634,  1010,   102],
        ...,
        [  101,  2708, 12087,  ...,     0,     0,     0],
        [  101,  2708, 12087,  ...,  1008,  2902,   102],
        [  101,  2708, 12087,  ..., 16215, 21716,   102]])
tensor([[101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101, 100, 102],
        [101,

In [8]:
pred = model(sequence["input_ids"])
pred = Classifier(pred.pooler_output)
pred

tensor([[0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4834, 0.5166],
        [0.4

In [9]:
pred = model(tokenized_texts["input_ids"])
pred = Classifier(pred.pooler_output)
pred

IndexError: index out of range in self