In [1]:
# Import necessary packages
import torch 
#print(torch.__version__)
import transformers
#print(transformers.__version__)

import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# importing some more dependencies
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from transformers.data.metrics import simple_accuracy
from torch.utils.data import WeightedRandomSampler

### Load tokenizer
This will be the same tokenizer for everything

In [2]:
tokenizer = AutoTokenizer.from_pretrained('../model/UCSF BERT-500k+275k-pytorch/', do_lower_case=False)

### Classification Class

In [3]:
class RadNotes(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [4]:
def cleanDocs(docList):
    '''
    Clean up labels and load as csv's.
    Labels are changed to binary.
    '''
    dfs = []
    binary_Map = {'Present': 1, 'Absent': 0}
    for doc in docList:
        df = pd.read_csv(doc)
        df['label'] = [df['label'][j][2:-2] for j,i in enumerate(df['label'])]
        df['label'] = df['label'].map(binary_Map)
        
        dfs.append(df)

    return dfs

In [5]:
def getFiles(organName, listDir, prefix=None):
    """
    Generate list of files associa-
    ted with each organ of interest
    in the list of files.
    --------------------------
    Input:
        organ_list: list of organs
                    str
        listDir:    list of files  
                    to search
        prefix:     OS path prefix
    Output:
        lists:      list of files 
                    associated with
                    organ
    """
    # get the files from listDir that contain 'organName'
    matching = [file for file in listDir if organName in file]
    
    # append the path prefix so that files are accessible
    if prefix:
        matchingFixed = [prefix + file for file in matching]
        return matchingFixed
    
    else:
        return matching

In [6]:
def multiclass_acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds, )
    prec = precision_score(y_true=labels, y_pred=preds, )
    recall = recall_score(y_true=labels, y_pred=preds, )
    macro_f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
    macro_weighted_f1 = f1_score(y_true=labels, y_pred=preds, average='weighted')
    macro_precision = precision_score(y_true=labels, y_pred=preds, average='macro')
    macro_weighted_precision = precision_score(y_true=labels, y_pred=preds, average='weighted')
    macro_recall = recall_score(y_true=labels, y_pred=preds, average='macro')
    macro_weighted_recall = recall_score(y_true=labels, y_pred=preds, average='weighted')
    micro_f1 = f1_score(y_true=labels, y_pred=preds, average='micro')
    confusion = confusion_matrix(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "precision": prec,
        "recall": recall,
        'micro_f1': micro_f1,
        "macro_f1": macro_f1,
        "macro_weighted_f1": macro_weighted_f1,
        "macro_precision": macro_precision,
        "macro_weighted_precision": macro_weighted_precision,
        "macro_recall": macro_recall,
        "macro_weighted_recall": macro_weighted_recall,
        "confusion_matrix": confusion,
    }

### Data

In [7]:
dataPath = '../data_200/recombined/'
# should be the list of all files
dataList = os.listdir(dataPath)
dataList = [dataPath + file for file in dataList]

In [8]:
# do task splittings here (all abnormal findings, all disease location, all indeterminate nods, all prev surge)
# model per classifier
abnormal_files = getFiles('Abnormal Findings', dataList)
abnormalDFs = cleanDocs(abnormal_files)
# all in one big csv
abnormalDFs = pd.concat(abnormalDFs)

In [9]:
prevsurgDFs = getFiles("Previous Surgeries", dataList)
prevsurgDFs = cleanDocs(prevsurgDFs)
# al in one big csv
prevsurgDFs = pd.concat(prevsurgDFs)

diseaselocDFs = getFiles("disease_location", dataList)
diseaselocDFs = cleanDocs(diseaselocDFs)
# all in one big csv
diseaselocDFs = pd.concat(diseaselocDFs)

In [10]:
names = ["Previous Surgeries Classifier", "Disease Location Classifier"]
all_ = [abnormalDFs, prevsurgDFs, diseaselocDFs]

In [25]:
abnormalDFs

Unnamed: 0,idx,sentence,label
0,5fddb3bdeaf0903792b63904,Gastrostomy tube in place. No evidence of co...,0
1,6046c8dfc7274cd918d2870b,Extensive multifocal areas of bowel wall thi...,0
2,5e46a9a85808eee774f693f0,Interval placement of duodenal stent in the d...,0
3,5f5b7067d8d9aa5228644e8e,Unremarkable \r\n Vasculature: Portal ve...,0
4,5e4659115808eee77438cd61,Unremarkable Pelvis: Unremarkable \r\n,0
...,...,...,...
195,5e45fdc15808eee7744b92a0,"For chest findings, please see the separatel...",0
196,5e45eb425808eee774049345,"For chest findings, please see the separatel...",0
197,5e4674955808eee774ea5d4e,Right upper lobe subpleural noncalcified pul...,1
198,5e4606e15808eee7748c7160,"For chest findings, please see the separatel...",0


### Training
- Splits should happen here
- How many models to generate?
- How to save the models?
    - Don't save everything, that may take a lot of space.
- Compute metrics so that you can assess the best performing models. 

In [11]:
# specifying training parameters
batch_size = 32
n_epochs = 3
learning_rate = 5e-5
warmup_steps = 0

In [12]:
# 3 different test sets, actual labels, predicted labels
test_sets = []
predicted_labels = []
actual_labels = []

# list of dictionaries of each model (should be 3)
all_results = []

In [13]:
# data splitting: all organs, one task?

for ind, frame in enumerate(all_[1:]):
    
    print("Training model...", names[ind])

    X = frame['sentence']
    y = frame['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    # determine class weights
    class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in y_train])
    samples_weight = torch.from_numpy(samples_weight)
    samples_weigth = samples_weight.double()
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    # encoding texts with tokenizer
    train_encodings = tokenizer(X_train.tolist(), max_length=512, truncation=True, padding=True)
    val_encodings = tokenizer(X_test.tolist(), max_length=512, truncation=True, padding=True)

    # create dataset from texts in dfs
    train_dataset = RadNotes(train_encodings, y_train.tolist())
    val_dataset = RadNotes(val_encodings, y_test.tolist())
    
    # training parameters are specified above

    # select appropriate device 
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')

    # now for model -- load
    model = AutoModelForSequenceClassification.from_pretrained('../model/UCSF BERT-500k+275k-pytorch/') 
    model.to(device) # model moved to available device
    model.train() # set to training mode

    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              sampler=sampler)
    num_steps = len(train_loader) // n_epochs #total training in batch div by num of epochs

    # optimization
    optim =AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(
                    optim, num_warmup_steps = warmup_steps, num_training_steps = num_steps
                 )

    # ---------------------------TRAINING------------------------------------
    for epoch in range(n_epochs): # for each available epoch
        for batch in train_loader: # for b in train_loader
            optim.zero_grad() # gradient @ zero?
            input_ids = batch['input_ids'].to(device) # keeping track of ids in device
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # save loss from the outputs of the model
            loss=outputs[0]

            # now backprop
            loss.backward()

            optim.step()
            scheduler.step() # updating learning rate schedule 
            model.zero_grad()
    print("Finished training", names[ind])
    print("Now evaluating...")

    # ---------------------------EVALUATE MODEL------------------------------
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    model.eval()

    preds, all_labels = None, None

    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask = attention_mask)
        logits = outputs.logits

        if preds is None:
            preds = logits.detach().cpu().numpy()
            all_labels = batch['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            all_labels = np.append(all_labels, batch['labels'].detach().cpu().numpy(), axis=0)

    preds = np.argmax(preds, axis=1)
    print("Done! ... saving...")

    # make dataframe for all outputs (preds, labels, sentences)
    # save each model metric in a pandas dataframe (a list of dictionaries, which you later conver to df)
    # note that you should specify the model that worked best.
    # in the long run we should save the best model. 

    # result per csv split file
    result = multiclass_acc_and_f1(preds, all_labels)
    
    # model comparison
    all_results.append(result)
    
    # result compiling
    test_sets.append(X_test)
    actual_labels.append(all_labels)
    predicted_labels.append(preds)

Training model... Previous Surgeries Classifier


Some weights of the model checkpoint at ../model/UCSF BERT-500k+275k-pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificati

Finished training Previous Surgeries Classifier
Now evaluating...
Done! ... saving...
Training model... Disease Location Classifier


Some weights of the model checkpoint at ../model/UCSF BERT-500k+275k-pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificati

Finished training Disease Location Classifier
Now evaluating...
Done! ... saving...




In [15]:
all_results

[{'acc': 0.7549909255898367,
  'f1': 0.34146341463414637,
  'precision': 0.20833333333333334,
  'recall': 0.9459459459459459,
  'micro_f1': 0.7549909255898368,
  'macro_f1': 0.5954808711966718,
  'macro_weighted_f1': 0.8153834606346476,
  'macro_precision': 0.6015557006092254,
  'macro_weighted_precision': 0.9419678044034187,
  'macro_recall': 0.8435955410663583,
  'macro_weighted_recall': 0.7549909255898367,
  'confusion_matrix': array([[381, 133],
         [  2,  35]])},
 {'acc': 0.8690909090909091,
  'f1': 0.5909090909090909,
  'precision': 0.45217391304347826,
  'recall': 0.8524590163934426,
  'micro_f1': 0.8690909090909091,
  'macro_f1': 0.7564935064935066,
  'macro_weighted_f1': 0.8853482880755609,
  'macro_precision': 0.7157421289355322,
  'macro_weighted_precision': 0.9208461223933487,
  'macro_recall': 0.8618123302826108,
  'macro_weighted_recall': 0.8690909090909091,
  'confusion_matrix': array([[426,  63],
         [  9,  52]])}]

In [18]:
results_0616 = pd.DataFrame(all_results, index=names)
results_0616.to_csv("Classifier results_prevsurg_diseaseloc.csv")

In [19]:
results_0616

Unnamed: 0,acc,f1,precision,recall,micro_f1,macro_f1,macro_weighted_f1,macro_precision,macro_weighted_precision,macro_recall,macro_weighted_recall,confusion_matrix
Previous Surgeries Classifier,0.754991,0.341463,0.208333,0.945946,0.754991,0.595481,0.815383,0.601556,0.941968,0.843596,0.754991,"[[381, 133], [2, 35]]"
Disease Location Classifier,0.869091,0.590909,0.452174,0.852459,0.869091,0.756494,0.885348,0.715742,0.920846,0.861812,0.869091,"[[426, 63], [9, 52]]"


In [21]:
a = pd.read_csv("Classifier results.csv")
a

Unnamed: 0.1,Unnamed: 0,acc,f1,precision,recall,micro_f1,macro_f1,macro_weighted_f1,macro_precision,macro_weighted_precision,macro_recall,macro_weighted_recall,confusion_matrix
0,Abnormal Findings Classifier,0.827273,0.573991,0.771084,0.457143,0.827273,0.732834,0.810811,0.804172,0.820415,0.705401,0.827273,[[391 19]\n [ 76 64]]
1,Previous Surgeries Classifier,0.932849,0.0,0.0,0.0,0.932849,0.482629,0.900441,0.466425,0.870208,0.5,0.932849,[[514 0]\n [ 37 0]]
2,Disease Location Classifier,0.889091,0.0,0.0,0.0,0.889091,0.470645,0.836892,0.444545,0.790483,0.5,0.889091,[[489 0]\n [ 61 0]]


In [None]:
# saving metrics of all classifiers
df_res = pd.DataFrame(all_results, index=names)
os.mkdir("Results")
df_res.to_csv("Classifier results.csv")

Saving texts trained, predicted labels and actua labels

In [None]:
abfinds_res = pd.DataFrame(columns=['Text', 'Predicted', 'Actual'])
abfinds_res['Text'] = test_sets[0]
abfinds_res['Predicted'] = predicted_labels[0]
abfinds_res['Actual'] = actual_labels[0]
abfinds_res.to_csv('Abnormal_Findings_Results.csv')

In [24]:
prev_res = pd.DataFrame(columns=['Text', 'Predicted', 'Actual'])
prev_res['Text'] = test_sets[1]
prev_res['Predicted'] = predicted_labels[1]
prev_res['Actual'] = actual_labels[1]
prev_res.to_csv('Previous_surgeries_Results_weighted.csv')

disease_res = pd.DataFrame(columns=['Text', 'Predicted', 'Actual'])
disease_res['Text'] = test_sets[2]
disease_res['Predicted'] = predicted_labels[2]
disease_res['Actual'] = actual_labels[2]
disease_res.to_csv('Disease_location_Results_weighted.csv')