In [10]:
import os
import re
import pydicom
import numpy as np
from PIL import Image
import torch
from google.cloud import storage
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
from io import BytesIO

# Set the environment variable for the Google Cloud credentials
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'practicum-429909-522aa693029d.json'

# Initialize the Google Cloud Storage client
def initialize_storage_client():
    print("Initializing Google Cloud Storage client...")
    return storage.Client()


class ChestXRayDataset(Dataset):
    def __init__(self, dicom_blobs, report_blobs, transform=None):
        print("Creating dataset...")
        self.dicom_blobs = dicom_blobs
        self.report_blobs = report_blobs
        self.transform = transform
        self.client = initialize_storage_client()

        # Ensure DICOM and report files are sorted and matched
        self.dicom_blobs.sort(key=lambda x: x.name)
        self.report_blobs.sort(key=lambda x: x.name)

    def __len__(self):
        return len(self.dicom_blobs)

    def __getitem__(self, idx):
        dicom_blob = self.dicom_blobs[idx]
        dicom_data = dicom_blob.download_as_bytes()
        dicom_file = pydicom.dcmread(BytesIO(dicom_data))
        image_array = dicom_file.pixel_array
        image = Image.fromarray((image_array / np.max(image_array) * 255).astype(np.uint8)).convert('RGB')
        if self.transform:
            image = self.transform(image)

        report_blob = self.report_blobs[idx]
        report_data = report_blob.download_as_text()
        report = clean_text(report_data)

        return image, report

# Define the transformations for the images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to clean the text
def clean_text(text):
    text = re.sub(r'\n', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = text.strip()
    return text

# Function to list and sort DICOM and text files from multiple buckets
def list_and_sort_files(bucket_names, folder_name, file_extension, limit_per_bucket):
    client = initialize_storage_client()
    blobs = []

    for bucket_name in bucket_names:
        print(f"Listing files in bucket: {bucket_name} under folder: {folder_name}...")
        bucket = client.bucket(bucket_name)
        bucket_blobs = list(bucket.list_blobs(prefix=folder_name))
        filtered_blobs = [blob for blob in bucket_blobs if blob.name.endswith(file_extension)]
        filtered_blobs.sort(key=lambda x: x.name)  # Sorting the files by name
        blobs.extend(filtered_blobs[:limit_per_bucket])
        print(f"Found {len(filtered_blobs[:limit_per_bucket])} files in bucket: {bucket_name}")

    return blobs

# Define your bucket names
bucket_names = ['practicum_mimic', 'ronit_mimic', 'aarekh_bucket']  # Replace with your actual bucket names

# Define the limit per bucket
limit_per_bucket = 10000

# List and sort DICOM and text files from all buckets
dicom_blobs = list_and_sort_files(bucket_names, 'dcm_record/', '.dcm', limit_per_bucket)
report_blobs = list_and_sort_files(bucket_names, 'txt_record/', '.txt', limit_per_bucket)

# Ensure the lengths of DICOM and text files match
assert len(dicom_blobs) == len(report_blobs), "Mismatch between number of DICOM files and text reports"
print(f"Total DICOM files: {len(dicom_blobs)}, Total text reports: {len(report_blobs)}")

# Create the dataset
dataset = ChestXRayDataset(dicom_blobs=dicom_blobs, report_blobs=report_blobs, transform=transform)

# Define the split sizes
total_size = len(dataset)
train_size = int(0.8 * total_size)
val_size = int(0.12 * total_size)
test_size = total_size - train_size - val_size

# Split the dataset into train, val, and test sets
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
print(f"Dataset split into training set size: {train_size}, validation set size: {val_size}, and test set size: {test_size}")

# Create DataLoaders for training, validation, and testing
batch_size = 16  # Adjust based on available memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
print("DataLoaders for train, val, and test sets created.")

# Example usage of the dataloaders
for images, reports in train_dataloader:
    print(images.shape, reports)  # Print the batch of images and reports
    # Process the batch here


Initializing Google Cloud Storage client...
Listing files in bucket: practicum_mimic under folder: dcm_record/...
Found 1994 files in bucket: practicum_mimic
Listing files in bucket: ronit_mimic under folder: dcm_record/...
Found 9825 files in bucket: ronit_mimic
Listing files in bucket: aarekh_bucket under folder: dcm_record/...
Found 6303 files in bucket: aarekh_bucket
Initializing Google Cloud Storage client...
Listing files in bucket: practicum_mimic under folder: txt_record/...
Found 1994 files in bucket: practicum_mimic
Listing files in bucket: ronit_mimic under folder: txt_record/...
Found 9825 files in bucket: ronit_mimic
Listing files in bucket: aarekh_bucket under folder: txt_record/...
Found 6303 files in bucket: aarekh_bucket
Total DICOM files: 18122, Total text reports: 18122
Creating dataset...
Initializing Google Cloud Storage client...
Dataset split into training set size: 14497, validation set size: 2174, and test set size: 1451
DataLoaders for train, val, and test set

In [2]:
pip install pydicom

Collecting pydicom
  Downloading pydicom-2.4.4-py3-none-any.whl.metadata (7.8 kB)
Downloading pydicom-2.4.4-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.4.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install google-cloud-storage

Collecting google-cloud-storage
  Downloading google_cloud_storage-2.18.0-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting google-auth<3.0dev,>=2.26.1 (from google-cloud-storage)
  Downloading google_auth-2.32.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting google-api-core<3.0.0dev,>=2.15.0 (from google-cloud-storage)
  Downloading google_api_core-2.19.1-py3-none-any.whl.metadata (2.7 kB)
Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage)
  Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting google-resumable-media>=2.6.0 (from google-cloud-storage)
  Downloading google_resumable_media-2.7.1-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-crc32c<2.0dev,>=1.0 (from google-cloud-storage)
  Downloading google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage)
  Downloa

In [11]:
print("done")

done


In [14]:
import torch
from torch import nn, optim
from transformers import BertTokenizer, BertModel
from torchvision.models import resnet50
from torch.nn.utils.rnn import pad_sequence

class ALBEFModel(nn.Module):
    def __init__(self, image_model, text_model, embed_dim):
        super(ALBEFModel, self).__init__()
        self.image_model = image_model
        self.text_model = text_model
        self.image_proj = nn.Linear(image_model.fc.in_features, embed_dim)
        self.text_proj = nn.Linear(text_model.config.hidden_size, embed_dim)

    def forward(self, image, text_input_ids, text_attention_mask):
        image_features = self.image_model(image)
        image_embed = self.image_proj(image_features)

        text_output = self.text_model(input_ids=text_input_ids, attention_mask=text_attention_mask)
        text_features = text_output.last_hidden_state[:, 0, :]
        text_embed = self.text_proj(text_features)

        return image_embed, text_embed

image_model = resnet50(pretrained=True)
num_features = image_model.fc.in_features
image_model.fc = nn.Linear(num_features, num_features)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_model = BertModel.from_pretrained('bert-base-uncased')

embed_dim = 512
model = ALBEFModel(image_model, text_model, embed_dim)


def contrastive_loss(image_embed, text_embed, temperature=0.1):
    similarity_matrix = torch.matmul(image_embed, text_embed.t()) / temperature
    labels = torch.arange(image_embed.size(0)).to(image_embed.device)
    loss_i = nn.CrossEntropyLoss()(similarity_matrix, labels)
    loss_t = nn.CrossEntropyLoss()(similarity_matrix.t(), labels)
    return (loss_i + loss_t) / 2


optimizer = optim.Adam(model.parameters(), lr=1e-4)
epochs = 10
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


patience = 3
min_delta = 0.001
best_val_loss = float('inf')
epochs_no_improve = 0

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for images, reports in train_dataloader:
        images = images.to(device)
        tokenized_reports = [tokenizer(report, return_tensors='pt', padding=True, truncation=True, max_length=512) for report in reports]
        input_ids = pad_sequence([report['input_ids'].squeeze(0) for report in tokenized_reports], batch_first=True, padding_value=tokenizer.pad_token_id).to(device)
        attention_mask = pad_sequence([report['attention_mask'].squeeze(0) for report in tokenized_reports], batch_first=True, padding_value=0).to(device)

        optimizer.zero_grad()
        image_embed, text_embed = model(images, input_ids, attention_mask)
        loss = contrastive_loss(image_embed, text_embed)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}")


    model.eval()
    val_loss = 0
    with torch.no_grad():
        for images, reports in val_dataloader:
            images = images.to(device)
            tokenized_reports = [tokenizer(report, return_tensors='pt', padding=True, truncation=True, max_length=512) for report in reports]
            input_ids = pad_sequence([report['input_ids'].squeeze(0) for report in tokenized_reports], batch_first=True, padding_value=tokenizer.pad_token_id).to(device)
            attention_mask = pad_sequence([report['attention_mask'].squeeze(0) for report in tokenized_reports], batch_first=True, padding_value=0).to(device)

            image_embed, text_embed = model(images, input_ids, attention_mask)
            loss = contrastive_loss(image_embed, text_embed)

            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {avg_val_loss:.4f}")


    if avg_val_loss < best_val_loss - min_delta:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

print("Model training completed.")


model.eval()
test_loss = 0
with torch.no_grad():
    for images, reports in test_dataloader:
        images = images.to(device)
        tokenized_reports = [tokenizer(report, return_tensors='pt', padding=True, truncation=True, max_length=512) for report in reports]
        input_ids = pad_sequence([report['input_ids'].squeeze(0) for report in tokenized_reports], batch_first=True, padding_value=tokenizer.pad_token_id).to(device)
        attention_mask = pad_sequence([report['attention_mask'].squeeze(0) for report in tokenized_reports], batch_first=True, padding_value=0).to(device)

        image_embed, text_embed = model(images, input_ids, attention_mask)
        loss = contrastive_loss(image_embed, text_embed)

        test_loss += loss.item()

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss:.4f}")

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/jovyan/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|█████████████████████████████████████████████████████████████████████████████| 97.8M/97.8M [00:00<00:00, 116MB/s]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Epoch 1/10, Training Loss: 3.1626
Epoch 1/10, Validation Loss: 2.2784
Epoch 2/10, Training Loss: 2.1097
Epoch 2/10, Validation Loss: 1.9610
Epoch 3/10, Training Loss: 1.7583
Epoch 3/10, Validation Loss: 1.8112
Epoch 4/10, Training Loss: 1.4713
Epoch 4/10, Validation Loss: 1.9049
Epoch 5/10, Training Loss: 1.2752
Epoch 5/10, Validation Loss: 2.2475
Epoch 6/10, Training Loss: 1.1187
Epoch 6/10, Validation Loss: 2.0869
Early stopping at epoch 6
Model training completed.
Test Loss: 2.0451


In [20]:
print("done")

done


In [13]:
pip install transformers

Collecting transformers
  Downloading transformers-4.43.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.24.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.43.2-py3-none-any.whl (9.4 MB)
[

In [15]:
# Save the trained model
torch.save(model.state_dict(), 'albef_model.pth')
print("Model saved.")

Model saved.


In [16]:
import pandas as pd


file_path = 'Radiopedia.xlsx'
radiopaedia_data = pd.read_excel(file_path)


def extract_triplets(data):
    triplets = []

    for index, row in data.iterrows():
        disease = row['Disease']
        presentation = row['presentation']
        description = row['description']
        conclusion = row.get('conclusion')


        if pd.notna(presentation):
            triplets.append((disease, 'has symptom', presentation))


        if pd.notna(description):
            triplets.append((disease, 'described by', description))


        if pd.notna(conclusion):
            triplets.append((disease, 'concludes with', conclusion))

    return triplets


triplets = extract_triplets(radiopaedia_data)


for triplet in triplets[:10]:
    print(triplet)


triplets_df = pd.DataFrame(triplets, columns=['Entity1', 'Relation', 'Entity2'])
triplets_df.to_csv('triplets.csv', index=False)

print("Triplet extraction completed.")


('Hair artifact', 'has symptom', 'Shortness of breath.')
('Hair artifact', 'described by', 'Heart size normal. Lungs clear. Pronounced elaborate hair artifact overlying the right supraclavicular fossa and shoulder.\n ')
('Large left upper lobe necrotic lung cancer', 'has symptom', 'Progressive shortness of breath.')
('Large left upper lobe necrotic lung cancer', 'described by', 'An 8 cm mass in the left upper lobe extends to the fissure and pleura peripherally. No chest wall invasion. No lymphadenopathy. Multiple rounded hypodense lesions in the slices of the liver at the bottom of the volume including a single calcified lesion.')
('Congenital lobar overinflation - left lung upper lobe', 'has symptom', 'Respiratory distress.')
('Congenital lobar overinflation - left lung upper lobe', 'described by', 'Left upper lobe hyperinflation and increased translucency deviating the mediastinal structures to the right, in keeping with congenital lobar hyperinflation.\xa0')
('Renal osteodystrophy',

In [18]:
import os
import numpy as np
import pandas as pd
import pydicom
from PIL import Image
from google.cloud import storage
from torchvision import transforms
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
from torch.utils.data import DataLoader
from io import BytesIO


os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'practicum-429909-522aa693029d.json'
storage_client = storage.Client()


class ALBEFModel(nn.Module):
    def __init__(self, image_model, text_model, embed_dim):
        super(ALBEFModel, self).__init__()
        self.image_model = image_model
        self.text_model = text_model
        self.image_proj = nn.Linear(image_model.fc.in_features, embed_dim)
        self.text_proj = nn.Linear(text_model.config.hidden_size, embed_dim)

    def forward(self, image, text_input_ids, text_attention_mask):
        image_features = self.image_model(image)
        image_embed = self.image_proj(image_features)

        text_output = self.text_model(input_ids=text_input_ids, attention_mask=text_attention_mask)
        text_features = text_output.last_hidden_state[:, 0, :]  # Use the [CLS] token
        text_embed = self.text_proj(text_features)

        return image_embed, text_embed


image_model = resnet50(pretrained=True)
num_features = image_model.fc.in_features
image_model.fc = nn.Linear(num_features, num_features) 

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_model = BertModel.from_pretrained('bert-base-uncased')

embed_dim = 512
model = ALBEFModel(image_model, text_model, embed_dim)


model.load_state_dict(torch.load('albef_model.pth'))
model.eval()


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def generate_image_embeddings(dicom_data):
    dicom_file = pydicom.dcmread(BytesIO(dicom_data))
    image_array = dicom_file.pixel_array
    image = Image.fromarray((image_array / np.max(image_array) * 255).astype(np.uint8)).convert('RGB')
    image = image_transforms(image).unsqueeze(0).to(device)
    with torch.no_grad():
        image_embed, _ = model(image, torch.zeros((1, 512), dtype=torch.int64).to(device), torch.zeros((1, 512), dtype=torch.int64).to(device))
    return image_embed.cpu().numpy().flatten()


def generate_text_embeddings(text):
    tokenized_text = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = tokenized_text['input_ids'].to(device)
    attention_mask = tokenized_text['attention_mask'].to(device)
    with torch.no_grad():
        _, text_embed = model(torch.zeros((1, 3, 224, 224)).to(device), input_ids, attention_mask)
    return text_embed.cpu().numpy().flatten()


def list_and_sort_files(bucket_name, folder_name, file_extension, limit):
    print(f"Listing files in bucket: {bucket_name} under folder: {folder_name}...")
    bucket = storage_client.bucket(bucket_name)
    bucket_blobs = list(bucket.list_blobs(prefix=folder_name))
    filtered_blobs = [blob for blob in bucket_blobs if blob.name.endswith(file_extension)]
    filtered_blobs.sort(key=lambda x: x.name)  # Sorting the files by name
    limited_blobs = filtered_blobs[:limit]
    print(f"Found {len(limited_blobs)} files in bucket: {bucket_name}")
    return limited_blobs


def process_dicom_files(bucket_names, folder_name, limits):
    image_embeddings = {}
    for bucket_name, limit in zip(bucket_names, limits):
        blobs = list_and_sort_files(bucket_name, folder_name, '.dcm', limit)
        for blob in blobs:
            dicom_data = blob.download_as_bytes()
            dicom_id = os.path.splitext(os.path.basename(blob.name))[0]
            image_embeddings[dicom_id] = generate_image_embeddings(dicom_data)
    return image_embeddings


def process_texts(bucket_names, folder_name, limits):
    raw_texts = {}
    for bucket_name, limit in zip(bucket_names, limits):
        blobs = list_and_sort_files(bucket_name, folder_name, '.txt', limit)
        for blob in blobs:
            text_data = blob.download_as_text()
            text_id = os.path.splitext(os.path.basename(blob.name))[0]
            raw_texts[text_id] = text_data
    return raw_texts


bucket_names = ['practicum_mimic', 'ronit_mimic', 'aarekh_bucket']
limits = [1994, 9825, 6303]


image_embeddings = process_dicom_files(bucket_names, 'dcm_record/', limits)
raw_texts = process_texts(bucket_names, 'txt_record/', limits)


def clean_text(text):
    text = re.sub(r'\n', ' ', text)  
    text = re.sub(r'\s+', ' ', text) 
    text = text.strip()
    return text


cleaned_texts = {text_id: clean_text(text) for text_id, text in raw_texts.items()}


text_embeddings = {text_id: generate_text_embeddings(text) for text_id, text in cleaned_texts.items()}

np.save('image_embeddings.npy', image_embeddings)
np.save('text_embeddings.npy', text_embeddings)

print("Embedding generation and saving completed.")




Listing files in bucket: practicum_mimic under folder: dcm_record/...
Found 1994 files in bucket: practicum_mimic
Listing files in bucket: ronit_mimic under folder: dcm_record/...
Found 9825 files in bucket: ronit_mimic
Listing files in bucket: aarekh_bucket under folder: dcm_record/...
Found 6303 files in bucket: aarekh_bucket
Listing files in bucket: practicum_mimic under folder: txt_record/...
Found 1994 files in bucket: practicum_mimic
Listing files in bucket: ronit_mimic under folder: txt_record/...
Found 9825 files in bucket: ronit_mimic
Listing files in bucket: aarekh_bucket under folder: txt_record/...
Found 6303 files in bucket: aarekh_bucket
Embedding generation and saving completed.


In [21]:
import os
import re
import pandas as pd
from google.cloud import storage


def list_and_sort_files(bucket_name, folder_name, file_extension):
    print(f"Listing files in bucket: {bucket_name} under folder: {folder_name}...")
    bucket = storage_client.bucket(bucket_name)
    bucket_blobs = list(bucket.list_blobs(prefix=folder_name))
    filtered_blobs = [blob for blob in bucket_blobs if blob.name.endswith(file_extension)]
    filtered_blobs.sort(key=lambda x: x.name)
    print(f"Found {len(filtered_blobs)} files in bucket: {bucket_name}")
    return filtered_blobs

def clean_text(text):
    text = re.sub(r'\n', ' ', text)  
    text = re.sub(r'\s+', ' ', text)  
    text = text.strip()
    return text

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'practicum-429909-522aa693029d.json'
storage_client = storage.Client()


bucket_names = ['practicum_mimic', 'ronit_mimic', 'aarekh_bucket']

all_dicom_blobs = []
for bucket_name in bucket_names:
    all_dicom_blobs.extend(list_and_sort_files(bucket_name, 'dcm_record/', '.dcm'))

image_names = [os.path.basename(blob.name) for blob in all_dicom_blobs]


all_text_blobs = []
for bucket_name in bucket_names:
    text_blobs = list_and_sort_files(bucket_name, 'txt_record/', '.txt')
    all_text_blobs.extend(text_blobs[:len(image_names)])

all_reports = []
for blob in all_text_blobs:
    text_data = blob.download_as_text()
    text_id = os.path.splitext(os.path.basename(blob.name))[0]
    cleaned_report = clean_text(text_data)
    all_reports.append({'report_id': text_id, 'cleaned_report': cleaned_report})

image_df = pd.DataFrame(image_names, columns=['image_id'])
reports_df = pd.DataFrame(all_reports)

if len(reports_df) > len(image_df):
    reports_df = reports_df.iloc[:len(image_df)]

all_reports_df_image = pd.concat([image_df, reports_df], axis=1)

all_reports_df_image.to_csv('all_reports_df_image.csv', index=False)

print(f"Total reports extracted: {len(all_reports_df_image)}")
print(all_reports_df_image.head())


Listing files in bucket: practicum_mimic under folder: dcm_record/...
Found 1994 files in bucket: practicum_mimic
Listing files in bucket: ronit_mimic under folder: dcm_record/...
Found 10460 files in bucket: ronit_mimic
Listing files in bucket: aarekh_bucket under folder: dcm_record/...
Found 6303 files in bucket: aarekh_bucket
Listing files in bucket: practicum_mimic under folder: txt_record/...
Found 1994 files in bucket: practicum_mimic
Listing files in bucket: ronit_mimic under folder: txt_record/...
Found 10460 files in bucket: ronit_mimic
Listing files in bucket: aarekh_bucket under folder: txt_record/...
Found 6303 files in bucket: aarekh_bucket
Total reports extracted: 18757
                                            image_id     report_id  \
0  01_02aa804e-bde0afdd-112c0b34-7bc16630-4e38401...  01_s50414267   
1  02_2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caa...  02_s53189527   
2  03_68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b47271...  03_s53911762   
3  04_ea030e7a-2e3b1346-bc5

In [23]:
import pandas as pd

triplets_path = 'triplets.csv'
triplets_df = pd.read_csv(triplets_path)

print(triplets_df.head())

triplets = list(triplets_df.itertuples(index=False, name=None))


def generate_triplet_embeddings(triplet):
    triplet_text = f"{triplet[0]} {triplet[1]} {triplet[2]}"
    return generate_text_embeddings(triplet_text)

triplet_embeddings = [generate_triplet_embeddings(triplet) for triplet in triplets]

                                             Entity1      Relation  \
0                                      Hair artifact   has symptom   
1                                      Hair artifact  described by   
2         Large left upper lobe necrotic lung cancer   has symptom   
3         Large left upper lobe necrotic lung cancer  described by   
4  Congenital lobar overinflation - left lung upp...   has symptom   

                                             Entity2  
0                               Shortness of breath.  
1  Heart size normal. Lungs clear. Pronounced ela...  
2                   Progressive shortness of breath.  
3  An 8 cm mass in the left upper lobe extends to...  
4                              Respiratory distress.  
