In [1]:
# Change the working directory to the CheXbert-master checkout.
%cd "/content/drive/MyDrive/FinalProject/CheXbert/CheXbert-master/"

/content/drive/MyDrive/FinalProject/CheXbert/CheXbert-master


In [2]:
# List the checkout's contents to confirm the working directory is the repo root.
!ls

environment.yml  figures  LICENSE.pdf  README.md  requirements.txt  src


In [3]:
!pip install -r requirements.txt

Collecting numpy==1.18.2 (from -r requirements.txt (line 1))
  Downloading numpy-1.18.2.zip (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (pyproject.toml) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See 

In [3]:
# Enter src/ before importing; presumably the repo-local modules imported next
# (utils, models, datasets, constants, bert_tokenizer) live there -- TODO confirm.
%cd "./src"

/content/drive/MyDrive/FinalProject/CheXbert/CheXbert-master/src


In [4]:
import os
import argparse
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import utils
from models.bert_labeler import bert_labeler
from bert_tokenizer import tokenize
from transformers import BertTokenizer
from collections import OrderedDict
from datasets.unlabeled_dataset import UnlabeledDataset
from constants import *
from tqdm import tqdm

In [7]:
def collate_fn_no_labels(sample_list):
    """Collate unlabeled report samples into a single padded batch.

    @param sample_list (List): the samples to batch. Each sample is indexed with the
                               keys 'imp' (a tensor handed to pad_sequence) and 'len'

    @returns batch (dictionary): {'imp': ..., 'len': ...} where 'imp' is the result of
                                 padding every sample's 'imp' with PAD_IDX (batch is the
                                 first dimension) and 'len' is the list of the samples'
                                 'len' entries, in input order
    """
    reports = [sample['imp'] for sample in sample_list]
    return {
        # Right-pad every report with PAD_IDX up to the longest one in this batch.
        'imp': torch.nn.utils.rnn.pad_sequence(reports,
                                               batch_first=True,
                                               padding_value=PAD_IDX),
        'len': [sample['len'] for sample in sample_list],
    }

def load_unlabeled_data(csv_path, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
                        shuffle=False):
    """Build a DataLoader over the reports stored in a csv file.

    @param csv_path (string): path to the csv file containing the reports
    @param batch_size (int): number of reports per batch. As per the BERT repository,
                             the max batch size that fits on a TITAN XP is 6 at the
                             max sequence length of 512
    @param num_workers (int): number of worker processes used to load data
    @param shuffle (bool): whether the reports are shuffled

    @returns loader (dataloader): DataLoader wrapping an UnlabeledDataset and batching
                                  with collate_fn_no_labels
    """
    return torch.utils.data.DataLoader(
        UnlabeledDataset(csv_path),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn_no_labels,
    )

def _strip_data_parallel_prefix(state_dict, prefix='module.'):
    """Return a copy of state_dict whose keys have a leading `prefix` removed.

    Keys that do not start with `prefix` are kept unchanged, so a checkpoint that
    was saved without nn.DataParallel is not corrupted.
    """
    return OrderedDict(
        (key[len(prefix):] if key.startswith(prefix) else key, value)
        for key, value in state_dict.items()
    )


def label(checkpoint_path, csv_path):
    """Labels a dataset of reports
    @param checkpoint_path (string): location of saved model checkpoint; the loaded
                                     object must hold the weights under 'model_state_dict'
    @param csv_path (string): location of csv with reports

    @returns y_pred (List[List[int]]): one list per entry of CONDITIONS holding the
                                       predicted class index of every report
    @returns chexbert_embeddings (List[Tensor]): one tensor per batch, the second value
                                       returned by the model (cls_hid), left on the
                                       device the model produced it on
    """
    ld = load_unlabeled_data(csv_path)

    model = bert_labeler()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 0: #works even if only 1 GPU available
        print("Using", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model) #to utilize multiple GPU's
        model = model.to(device)
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        # The model is not wrapped in DataParallel here, so drop the `module.`
        # prefix from the keys that carry it.
        model.load_state_dict(_strip_data_parallel_prefix(checkpoint['model_state_dict']))

    was_training = model.training
    model.eval()
    y_pred = [[] for _ in range(len(CONDITIONS))]
    chexbert_embeddings = []
    print("\nBegin report impression labeling. The progress bar counts the # of batches completed:")
    print("The batch size is %d" % BATCH_SIZE)
    with torch.no_grad():
        for data in tqdm(ld):
            batch = data['imp'].to(device) #(batch_size, max_len)
            src_len = data['len']
            attn_mask = utils.generate_attention_masks(batch, src_len, device)

            out, cls_hid = model(batch, attn_mask)
            chexbert_embeddings.append(cls_hid)

            for j in range(len(out)):
                curr_y_pred = out[j].argmax(dim=1) #shape is (batch_size)
                y_pred[j].append(curr_y_pred)

    if was_training:
        model.train()

    # torch.cat raises on an empty list, so an input with no batches yields empty
    # per-condition lists instead of crashing.
    y_pred = [torch.cat(preds, dim=0).tolist() if preds else [] for preds in y_pred]
    return y_pred, chexbert_embeddings

def save_preds(y_pred, csv_path, out_path,
               filename='chexbert_labels_for_3re(mds)+zsi+llm_prompt2_for_images_1_500.csv'):
    """Save predictions as out_path/filename
    @param y_pred (List[List[int]]): one list per entry of CONDITIONS holding the
                                     predicted class index of every report
    @param csv_path (string): path to csv containing reports under the
                              'Report Impression' column
    @param out_path (string): path to output directory (created if it does not exist)
    @param filename (string): name of the csv file written inside out_path; the default
                              keeps the name this notebook has always used

    @raises ValueError: if the number of reports differs from the number of predictions
    """
    # y_pred is condition-major; transpose so that each row is one report.
    labels = pd.DataFrame(np.array(y_pred).T, columns=CONDITIONS)
    reports = pd.read_csv(csv_path)['Report Impression']
    if len(reports) != len(labels):
        raise ValueError("Got %d reports in %s but %d predictions"
                         % (len(reports), csv_path, len(labels)))

    # Remap class indices on the label columns only, so the report column can never
    # be altered. The order matters: 0 must become NaN before 2 becomes 0.
    labels = (labels.replace(0, np.nan)   #blank class is NaN
                    .replace(3, -1)       #uncertain class is -1
                    .replace(2, 0))       #negative class is 0
    labels.insert(0, 'Report Impression', reports.tolist())

    if out_path:
        os.makedirs(out_path, exist_ok=True)
    labels.to_csv(os.path.join(out_path, filename), index=False)

In [8]:
# Input csv with the reports to label; save_preds reads its 'Report Impression' column.
# NOTE(review): the input lives under mock_data2 while outputs go to mock_data -- confirm this is intended.
csv_path = "/content/drive/MyDrive/FinalProject/mock_data2/3re(mds)+zsi+llm_prompt2.csv"
# Checkpoint loaded by label() via torch.load; its weights are read from 'model_state_dict'.
checkpoint_path = "/content/drive/MyDrive/FinalProject/CheXbert/chexbert.pth"
# Directory in which save_preds() writes the labeled csv.
out_path = "/content/drive/MyDrive/FinalProject/mock_data/"

In [9]:
# Run the labeler: y_pred holds one list of predicted classes per condition,
# Semb holds the embedding tensor that label() collected for each batch.
y_pred, Semb = label(checkpoint_path, csv_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]


Tokenizing report impressions. All reports are cut off at 512 tokens.


100%|██████████| 500/500 [00:01<00:00, 424.98it/s]
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


Begin report impression labeling. The progress bar counts the # of batches completed:
The batch size is 18


100%|██████████| 28/28 [03:10<00:00,  6.81s/it]


In [10]:
# Number of collected embedding tensors (label() appends one per batch).
len(Semb)

28

In [11]:
# torch is already imported in the notebook's import cell, so it is not re-imported here.
# Concatenate the per-batch tensors along the first axis
combined_semb_embeddings = torch.cat(Semb, dim=0)

# Check the shape of the combined tensor
print("Shape of combined embeddings:", combined_semb_embeddings.shape)

Shape of combined embeddings: torch.Size([500, 768])


In [12]:
# Define the file path
file_path = '/content/drive/MyDrive/FinalProject/mock_data/chexbert_embeddings_for_3re(mds)+zsi+llm_prompt2_for_images_1_500.pt'

# Save a CPU copy of the tensor so the file can be loaded on machines without
# CUDA (.cpu() returns the tensor itself when it already lives on the CPU).
torch.save(combined_semb_embeddings.cpu(), file_path)

print(f"Tensor saved to {file_path}")

Tensor saved to /content/drive/MyDrive/FinalProject/mock_data/chexbert_embeddings_for_3re(mds)+zsi+llm_prompt2_for_images_1_500.pt


In [13]:
# Write the predictions, next to the report text read from csv_path, as a csv inside out_path.
save_preds(y_pred, csv_path, out_path)

In [None]:
# if __name__ == '__main__':
#     parser = argparse.ArgumentParser(description='Label a csv file containing radiology reports')
#     parser.add_argument('-d', '--data', type=str, nargs='?', required=True,
#                         help='path to csv containing reports. The reports should be \
#                               under the \"Report Impression\" column')
#     parser.add_argument('-o', '--output_dir', type=str, nargs='?', required=True,
#                         help='path to intended output folder')
#     parser.add_argument('-c', '--checkpoint', type=str, nargs='?', required=True,
#                         help='path to the pytorch checkpoint')
#     args = parser.parse_args()
#     csv_path = args.data
#     out_path = args.output_dir
#     checkpoint_path = args.checkpoint