In [1]:
# Import necessary packages
import torch 
print(torch.__version__)
import transformers
print(transformers.__version__)

import numpy as np
import pandas as pd
import os

1.11.0
4.20.0.dev0


In [21]:
from sklearn.model_selection import train_test_split
from torch.utils.data import WeightedRandomSampler


## Load tokenizer

In [3]:
from transformers import AutoTokenizer
# specify where the model docs are, note that you added "model_type":"bert" in line 1 of json file
tokenizer = AutoTokenizer.from_pretrained('../model/UCSF BERT-500k+275k-pytorch/', do_lower_case=False)

## Parragraph/doc level classification

1. Preparing classification dataset

In [4]:
class RadNotes(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)


from torch import nn
from transformers import Trainer


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([class_weights[1], 
                                                            class_weights[0]]))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

1.1 Get train, dev and test set texts and labels. Best way to do this is to load through csv or dataframe.

In [5]:
dataPath = '../data_200/recombined/'
dataList = os.listdir(dataPath)

In [6]:
abFinds_liver = pd.read_csv(dataPath + dataList[0])
abFinds_pancreas = pd.read_csv(dataPath + dataList[2])
# fix labels
abFinds_liver['label'] = [abFinds_liver['label'][j][2:-2]for j,i in enumerate(abFinds_liver['label'])]
abFinds_pancreas['label'] = [abFinds_pancreas['label'][j][2:-2]for j,i in enumerate(abFinds_pancreas['label'])]

# map to ints
binaryMap = {'Present': 1, 'Absent': 0}
abFinds_liver['label'] = abFinds_liver['label'].map(binaryMap)
abFinds_pancreas['label'] = abFinds_pancreas['label'].map(binaryMap)

# concatenate abnormal findings for pancreas + liver
abnormal_findings = pd.concat([abFinds_liver, abFinds_pancreas])
abnormal_findings.head()

Unnamed: 0,idx,sentence,label
0,5e4619235808eee774f5728d,"Distended stomach and proximal duodenum, sim...",0
1,5ebd8129c206f20a8b300bfe,***** of the enteric tube terminates within ...,0
2,5fddef39eaf0903792b27112,Unremarkable Pelvis: Unremarkable \r\n,0
3,5f5aaa30d8d9aa5228b83083,No bowel obstruction. Small fat-containing u...,0
4,5e463dc95808eee774ab89b0,Colonic diverticulosis. Pelvis: Unremarkable...,0


2. Trying the train_test_split method from sklearn with a pandas dataframe passed in.


In [7]:
X_abfinds = abnormal_findings['sentence']
y_abfinds = abnormal_findings['label']

X_train, X_test, y_train, y_test = train_test_split(X_abfinds, y_abfinds, random_state=42)

In [8]:
print("(Present, Absent) -- Training")
sum(y_train[y_train == 1]), len(y_train[y_train != 1])

(Present, Absent) -- Training


(42, 258)

In [9]:
print("(Present, Absent) -- Testing")
sum(y_test[y_test == 1]), len(y_test[y_test != 1])

(Present, Absent) -- Testing


(10, 90)

**Note**
- Important to show this as a key step in presentation.
- A good idea would be to show how the regeneration of the splits changes the performance of the algorithm.
- Keep for now liver and pancreas for all 5 applications
- Analyze all performances and compare. 

2.1 Adding weights to each class to introduce low dist. penalization.
Resources used: 

In [28]:
class_sample_count = np.array(
    [len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in y_train])
samples_weight = torch.from_numpy(samples_weight)
samples_weigth = samples_weight.double()
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

3. Using tokenizer for the data we just split.

In [33]:
# encoding texts with the tokenizer
train_encodings = tokenizer(X_train.tolist(), max_length = 512, truncation=True, padding=True)
val_encodings = tokenizer(X_test.tolist(), max_length=512, truncation=True, padding=True)

4. Create a dataset from the texts

In [34]:
train_dataset = RadNotes(train_encodings, y_train.tolist())
val_dataset = RadNotes(val_encodings, y_test.tolist())

5. Now specify training parameters


In [35]:
batch_size = 32
n_epochs = 3
learning_rate = 5e-5
warmup_steps = 0 # what is this?

6. Now the training classifier

In [36]:
# importing some more dependencies
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# now choosing GPU if available, if not choosing CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')

# now for the model---- load
model = AutoModelForSequenceClassification.from_pretrained('../model/UCSF BERT-500k+275k-pytorch/') 
model.to(device) # model moved to available device
model.train() # set to training mode

train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
num_steps = len(train_loader) // n_epochs #total training in batch div by num of epochs

# optimization
optim =AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
                optim, num_warmup_steps = warmup_steps, num_training_steps = num_steps
            )

# iterate for training
for epoch in range(n_epochs): # for each available epoch
    for batch in train_loader: # for b in train_loader
        optim.zero_grad() # gradient @ zero?
        input_ids = batch['input_ids'].to(device) # keeping track of ids in device
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # save loss from the outputs of the model
        loss=outputs[0]
        
        # now backprop
        loss.backward()
        
        optim.step()
        scheduler.step() # updating learning rate schedule 
        model.zero_grad()

Some weights of the model checkpoint at ../model/UCSF BERT-500k+275k-pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificati

7. Evaluating the model

In [37]:
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
model.eval()

preds, all_labels = None, None

for batch in val_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    
    outputs = model(input_ids, attention_mask = attention_mask)
    logits = outputs.logits
    
    if preds is None:
        preds = logits.detach().cpu().numpy()
        all_labels = batch['labels'].detach().cpu().numpy()
    else:
        preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
        all_labels = np.append(all_labels, batch['labels'].detach().cpu().numpy(), axis=0)

preds = np.argmax(preds, axis=1)

8. Computing evaluation metrics

In [38]:

# Compute evaluation metrics
# Define desirable evaluation metrics

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from transformers.data.metrics import simple_accuracy

def multiclass_acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds, )
    prec = precision_score(y_true=labels, y_pred=preds, )
    recall = recall_score(y_true=labels, y_pred=preds, )
    macro_f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
    macro_weighted_f1 = f1_score(y_true=labels, y_pred=preds, average='weighted')
    macro_precision = precision_score(y_true=labels, y_pred=preds, average='macro')
    macro_weighted_precision = precision_score(y_true=labels, y_pred=preds, average='weighted')
    macro_recall = recall_score(y_true=labels, y_pred=preds, average='macro')
    macro_weighted_recall = recall_score(y_true=labels, y_pred=preds, average='weighted')
    micro_f1 = f1_score(y_true=labels, y_pred=preds, average='micro')
    confusion = confusion_matrix(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "precision": prec,
        "recall": recall,
        'micro_f1': micro_f1,
        "macro_f1": macro_f1,
        "macro_weighted_f1": macro_weighted_f1,
        "macro_precision": macro_precision,
        "macro_weighted_precision": macro_weighted_precision,
        "macro_recall": macro_recall,
        "macro_weighted_recall": macro_weighted_recall,
        "confusion_matrix": confusion,
    }

result = multiclass_acc_and_f1(preds, all_labels)

print("Result: ", result)

Result:  {'acc': 0.83, 'f1': 0.1904761904761905, 'precision': 0.18181818181818182, 'recall': 0.2, 'micro_f1': 0.83, 'macro_f1': 0.5477520617185422, 'macro_weighted_f1': 0.8335727587124235, 'macro_precision': 0.5459652706843718, 'macro_weighted_precision': 0.8372829417773238, 'macro_recall': 0.55, 'macro_weighted_recall': 0.83, 'confusion_matrix': array([[81,  9],
       [ 8,  2]])}




In [39]:
preds

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [40]:
all_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
len(X_test)

In [None]:
pd.DataFrame.from_dict(result, index=[0])

In [None]:
preds, all_labels

1. Testable hypothesis: combining classifiers, does that make them improve or worsen? (separate for each classifier)
    - One model per organ, and compare by combining all classifiers 
    - good to test
    - increased power will outweigh fact of us having less specificity
2. Focus on Previous surgeries, abnormal findings, and disease_location
3. Once we have a classifier
    - LIT or tensorboard
    - trying to understand the word enrichment in one group vs another
        - f1 scores do not inform us much
    - send him the list afterwards for his insight
        - test outputs