<a href="https://colab.research.google.com/github/srijayashree/CS598/blob/main/Model_finetuning_2tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import torch


In [21]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [22]:
from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
model = BertForMaskedLM.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
import pandas as pd
import csv



In [24]:
df = pd.read_csv('/content/patient_wICD_shortconcat_4000.csv')

In [25]:
my_list = df['SHORT_CONCAT'].tolist()
#tokenized_text = df.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
tokenized_list = [tokenizer.encode(text, add_special_tokens=True) for text in my_list]

In [26]:
# Find the maximum length of the tokenized sequences
max_len = max(len(tokenized_seq) for tokenized_seq in tokenized_list)

print("Max length:", max_len)

Max length: 86


In [27]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

class TextDataset(Dataset):
    def __init__(self, tokenized_text):
        self.tokenized_text = tokenized_text

    def __len__(self):
        return len(self.tokenized_text)

    def __getitem__(self, idx):
        return torch.tensor(self.tokenized_text[idx])

# pad the tokenized sequences
padded_text = pad_sequence([torch.tensor(x[:max_len]) for x in tokenized_list], 
                           batch_first=True, padding_value=0)

#padded_tensor = torch.tensor(padded_text)

# dataset = TextDataset(tokenized_list)
# create the dataset from the padded sequences
dataset = torch.utils.data.TensorDataset(padded_text)

# create the DataLoader object with batch size and shuffle settings
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)





In [28]:
from transformers import AdamW

#optimizer = AdamW(model.parameters(), lr=4e-5)
optimizer = torch.optim.Adam(model.parameters(),lr=4e-5)

model.train()

for epoch in range(1):
    for batch in dataloader:
        # batch = torch.tensor(batch)
        optimizer.zero_grad()
        outputs = model(batch[0], labels=batch[0])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [11]:
model.save_pretrained('fine_tuned_model')

In [13]:
import torch.nn as nn

# Define the decoder
hidden_size = model.config.hidden_size
hidden_size
num_labels = 2 # heart failure & diabetes
decoder = nn.Linear(hidden_size, num_labels)



In [14]:
hidden_size

768

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

class sEHR_CE(nn.Module):
    def __init__(self, num_classes=2, learning_rate=1e-5):
        super(sEHR_CE, self).__init__()
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.tokenizer = BertTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
        self.sehr = BertForSequenceClassification.from_pretrained('fine_tuned_model', output_hidden_states=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(self.sehr.config.hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        output = self.sehr(input_ids=input_ids, attention_mask=attention_mask)
        #print(output)
        last_hidden_state = output.hidden_states[-1]  # get the last hidden state
        cls_hidden_state = last_hidden_state[:, 0, :]
        dropout = self.dropout(cls_hidden_state)
        linear = self.linear(dropout)
        return self.sigmoid(linear)

    def train_model(self, train_dataloader, val_dataloader, num_epochs=3):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        total_steps = len(train_dataloader) * num_epochs
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=self.learning_rate,
                                                  total_steps=total_steps)

        for epoch in range(num_epochs):
            self.train()
            train_losses = []
            for step, batch in enumerate(train_dataloader):
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                #labels = batch['labels']
                labels = torch.stack([torch.tensor(batch['HF'], dtype=torch.float),
                                      torch.tensor(batch['Diabetes'], dtype=torch.float)], dim=1)

                outputs = self(input_ids=input_ids, attention_mask=attention_mask)
                loss = nn.BCELoss()(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()

                train_losses.append(loss.item())

            train_loss = sum(train_losses) / len(train_losses)

            self.eval()
            val_losses = []
            val_accuracies = []
            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids = batch['input_ids']
                    attention_mask = batch['attention_mask']
                    #labels = batch['labels']
                    labels = torch.stack([torch.tensor(batch['HF'], dtype=torch.float),
                                      torch.tensor(batch['Diabetes'], dtype=torch.float)], dim=1)

                    outputs = self(input_ids=input_ids, attention_mask=attention_mask)
                    loss = nn.BCELoss()(outputs, labels)

                    val_losses.append(loss.item())

                    predicted_labels = (outputs > 0.5).float()
                    accuracy = (predicted_labels == labels).float().mean()
                    val_accuracies.append(accuracy.item())

            val_loss = sum(val_losses) / len(val_losses)
            val_accuracy = sum(val_accuracies) / len(val_accuracies)

            print(f"Epoch {epoch+1}/{num_epochs} - "
                  f"Train loss: {train_loss:.4f} - "
                  f"Val loss: {val_loss:.4f} - "
                  f"Val accuracy: {val_accuracy:.4f}")

    def eval_model(self, test_dataloader):
        self.eval()
        # test_losses = []
        # test_accuracies = []
        # self.model.eval()
        test_loss = 0
        total_preds = []
        total_labels = []

        with torch.no_grad():
            for batch in test_dataloader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                #labels = batch['labels']
                labels = torch.stack([torch.tensor(batch['HF'], dtype=torch.float),
                                      torch.tensor(batch['Diabetes'], dtype=torch.float)], dim=1)

                outputs = self(input_ids=input_ids,attention_mask=attention_mask)
                                 #labels=labels)
                loss = nn.BCELoss()(outputs, labels)
            
                #loss = outputs.loss
                #logits = outputs.logits
                test_loss += loss.item()

                #preds = torch.sigmoid(logits)
                #preds = outputs.numpy()
                #labels = labels.numpy()

                total_preds.append(outputs)
                total_labels.append(labels)

        avg_test_loss = test_loss / len(test_dataloader)
        total_preds = np.concatenate(total_preds, axis=0)
        total_labels = np.concatenate(total_labels, axis=0)

        return avg_test_loss, total_preds, total_labels

In [40]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data['SHORT_CONCAT'][idx]
        #labels = [self.data['HF'][idx], self.data['Diabetes'][idx]]
        HF = self.data['HF'][idx]
        Diabetes = self.data['Diabetes'][idx]
        
        # tokenize the text
        encoded_text = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoded_text['input_ids'][0]
        attention_mask = encoded_text['attention_mask'][0]
        #return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'HF': HF, 'Diabetes': Diabetes}

# load the data from the CSV files
train_df = pd.read_csv('patient_split_shortmask_1_6000.csv')
valid_df = pd.read_csv('patient_split_shortmask_4_4000.csv')
test_df = pd.read_csv('patient_split_shortmask_3_4000.csv')

# create the tokenizer
tokenizer = BertTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

# create the datasets
train_dataset = TextDataset(train_df, tokenizer)
valid_dataset = TextDataset(valid_df, tokenizer)
test_dataset = TextDataset(test_df, tokenizer)

# create the dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [19]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, precision_recall_curve, auc
# number of epochs to train the model
n_epochs = 2

# create an instance of the sEHR_CE class
model1 = sEHR_CE()

# train the model
model1.train_model(train_dataloader, valid_dataloader, n_epochs)

# evaluate the model on the test set
avg_test_loss, total_preds, total_labels = model1.eval_model(test_dataloader)

# calculate recall at 0.5
pred_labels = (total_preds > 0.5).astype(int)
recall = recall_score(total_labels, pred_labels, average=None)

# calculate AUC
auc = roc_auc_score(total_labels, total_preds, average=None)

print("Recall at 0.5:", recall)
print("AUC:", auc)

total_labels_df = pd.DataFrame(total_labels)
total_labels_df.to_csv('/content/total_labels.csv', index=False)

total_preds_df = pd.DataFrame(total_preds)
total_preds_df.to_csv('/content/total_preds_v2.csv', index=False)




# 2nd set of tests with the disease terms removed
test_df2 = pd.read_csv('patient_split_shortremoved_3_4000.csv')
test_dataset2 = TextDataset(test_df2, tokenizer)
test_dataloader2 = DataLoader(test_dataset2, batch_size=32, shuffle=False)

# evaluate the model on the test set
avg_test_loss2, total_preds2, total_labels2 = model1.eval_model(test_dataloader2)

# calculate recall at 0.5
pred_labels2 = (total_preds2 > 0.5).astype(int)
recall2 = recall_score(total_labels2, pred_labels2, average=None)

# calculate AUC
auc2 = roc_auc_score(total_labels2, total_preds2, average=None)

print("Recall2 at 0.5:", recall2)
print("AUC2:", auc2)

total_labels_df2 = pd.DataFrame(total_labels2)
total_labels_df2.to_csv('/content/total_labels_removed.csv', index=False)

total_preds_df2 = pd.DataFrame(total_preds2)
total_preds_df2.to_csv('/content/total_preds_removed.csv', index=False)



Some weights of the model checkpoint at fine_tuned_model were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at fine_tuned_model and are new

Epoch 1/2 - Train loss: 0.4499 - Val loss: 0.2283 - Val accuracy: 0.9107
Epoch 2/2 - Train loss: 0.2370 - Val loss: 0.2139 - Val accuracy: 0.9083


  labels = torch.stack([torch.tensor(batch['HF'], dtype=torch.float),
  torch.tensor(batch['Diabetes'], dtype=torch.float)], dim=1)


Recall at 0.5: [0.4478595  0.96480583]
AUC: [0.84760337 0.99617169]


  labels = torch.stack([torch.tensor(batch['HF'], dtype=torch.float),
  torch.tensor(batch['Diabetes'], dtype=torch.float)], dim=1)


ValueError: ignored

In [36]:
tokenizer

BertTokenizer(name_or_path='microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [29]:
precision = precision_score(total_labels, pred_labels, average=None)
recall = recall_score(total_labels, pred_labels, average=None)

# calculate AUC
auc = roc_auc_score(total_labels, total_preds, average=None)

print("Recall at 0.5:", recall)
print("AUC:", auc)
print("Avg test loss:", avg_test_loss)

Recall at 0.5: [0.4478595  0.96480583]
AUC: [0.84760337 0.99617169]
Avg test loss: 0.21502231754129753


In [30]:
print(total_labels)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 ...
 [0. 1.]
 [0. 0.]
 [0. 0.]]


In [31]:
total_labels_df = pd.DataFrame(total_labels)
total_labels_df.to_csv('/content/total_labels.csv', index=False)

# New Section

In [42]:
print(total_preds)

[[0.1365695  0.00739421]
 [0.21475925 0.01261922]
 [0.21051845 0.00665803]
 ...
 [0.30202112 0.9932414 ]
 [0.03938677 0.01561123]
 [0.1079238  0.00450369]]


In [33]:
total_preds_df = pd.DataFrame(total_preds)
total_preds_df.to_csv('/content/total_preds_v2.csv', index=False)


In [41]:
# 2nd set of tests with the disease terms removed
test_df2 = pd.read_csv('patient_split_shortremoved_3_4000.csv')
test_dataset2 = TextDataset(test_df2, tokenizer)
test_dataloader2 = DataLoader(test_dataset2, batch_size=32, shuffle=False)

# evaluate the model on the test set
avg_test_loss2, total_preds2, total_labels2 = model1.eval_model(test_dataloader2)

# calculate recall at 0.5
pred_labels2 = (total_preds2 > 0.5).astype(int)
recall2 = recall_score(total_labels2, pred_labels2, average=None)

# calculate AUC
auc2 = roc_auc_score(total_labels2, total_preds2, average=None)

print("Recall2 at 0.5:", recall2)
print("AUC2:", auc2)

total_labels_df2 = pd.DataFrame(total_labels2)
total_labels_df2.to_csv('/content/total_labels_removed.csv', index=False)

total_preds_df2 = pd.DataFrame(total_preds2)
total_preds_df2.to_csv('/content/total_preds_removed.csv', index=False)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  labels = torch.stack([torch.tensor(batch['HF'], dtype=torch.float),
  torch.tensor(batch['Diabetes'], dtype=torch.float)], dim=1)


Recall2 at 0.5: [0.54945055 0.00121951]
AUC2: [0.84552347 0.53127876]
