In [None]:
# Notebook-specific dependencies. The install first runs with all output
# discarded; only if it fails is it re-run with stderr visible, so error
# details are shown on failure and nothing but the success message otherwise.
!pip install matplotlib tf-models-official==2.14.0 > /dev/null 2>&1 && echo "Installation successful" || (>&2 pip install matplotlib tf-models-official==2.14.0 > /dev/null)

Installation successful


In [None]:
from typing import Dict, Iterable
import tensorflow as tf
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
EMBEDDINGS_NPZ_FILE_PATH = '/content/drive/MyDrive/FinalProject/mock_data/cxr_img_embeddings_1_500.npz'
# np.load on an .npz archive returns a file-backed object; the `with` block
# closes the underlying file once the 'embeddings' array has been read.
with np.load(EMBEDDINGS_NPZ_FILE_PATH) as npz_file:
    embeddings = npz_file['embeddings']

In [None]:
embeddings.shape

(500, 32, 128)

In [None]:
# Flatten everything after the first axis so each example becomes one row.
# The row count is read from the array instead of being hard-coded, so the
# cell keeps working when a file with a different number of examples is loaded.
reshaped_embeddings = embeddings.reshape(embeddings.shape[0], -1)
reshaped_embeddings.shape

(500, 4096)

In [None]:
# Create a DataFrame with a single column "embeddings"; each cell holds one
# flattened embedding vector.
df = pd.DataFrame({'embeddings': list(reshaped_embeddings)})

# Show the first rows as the cell's value so the notebook renders the frame
# with its rich display instead of plain printed text.
df.head()

                                          embeddings
0  [-0.15371813, 0.013163331, -0.12165936, 0.0924...
1  [-0.15313834, -0.0148269255, -0.15015253, 0.07...
2  [-0.07716902, 0.017584063, -0.111682065, -0.00...
3  [-0.12306771, 0.03343663, -0.09574966, -0.0500...
4  [-0.11729785, -0.017323947, -0.1051671, 0.0825...


In [None]:
def create_tf_dataset_from_embeddings(
    embeddings: Iterable[np.ndarray],
    embeddings_size: int
) -> tf.data.Dataset:
    """Build a `tf.data.Dataset` holding one float32 embedding per element.

    Args:
        embeddings: Iterable of array-likes, one per example.
        embeddings_size: Number of scalar values each embedding must contain.

    Returns:
        The dataset obtained by slicing the collected embeddings.

    Raises:
        AssertionError: If any embedding's element count differs from
            `embeddings_size`.
    """
    # Materialise the iterable, casting every item to a float32 array.
    float_vectors = [
        np.asarray(item, dtype=np.float32) for item in list(embeddings)
    ]

    # Every vector must carry exactly the expected number of values.
    sizes_match = all(
        vector.size == embeddings_size for vector in float_vectors)
    assert sizes_match, \
        "All embeddings must have the size specified by embeddings_size"

    return tf.data.Dataset.from_tensor_slices(float_vectors)

In [None]:
# Wrap the per-row embedding vectors in a dataset; each vector is required
# to contain 32 * 128 values.
test_data = create_tf_dataset_from_embeddings(
    embeddings=df["embeddings"].to_numpy(),
    embeddings_size=32 * 128,
)

In [None]:
len(test_data)

500

In [None]:
import tensorflow as tf
import tensorflow_models as tfm
import numpy as np
from typing import Dict, Iterable
from official.modeling.optimization import lars

def create_model(heads,
                 token_num,
                 embeddings_size,
                 learning_rate=0.1,
                 end_lr_factor=1.0,
                 dropout=0.0,
                 decay_steps=1000,
                 loss_weights=None,
                 hidden_layer_sizes=(512, 256),
                 weight_decay=0.0,
                 seed=None) -> tf.keras.Model:
  """Creates a linear probe or multilayer perceptron using LARS + cosine decay.

  The flat input vector is reshaped to (token_num, embeddings_size), pooled
  with a global average, passed through the optional hidden stack and mapped
  to a single sigmoid output for binary classification.

  Args:
    heads: Not used by this implementation; the output layer always has one
      unit. Kept so existing callers keep working.
    token_num: Number of tokens per example.
    embeddings_size: Number of values per token.
    learning_rate: Initial learning rate of the cosine decay schedule.
    end_lr_factor: Passed as `alpha` to the cosine decay schedule.
    dropout: Rate of the Dropout layer that follows each hidden layer.
    decay_steps: Number of steps over which the learning rate decays.
    loss_weights: Not used by this implementation.
    hidden_layer_sizes: Units of each hidden Dense layer, in order. An empty
      sequence (or None) yields a linear probe.
    weight_decay: L2 factor applied to hidden-layer kernels and biases.
    seed: Seed passed to the kernel initializers and Dropout layers.

  Returns:
    A compiled `tf.keras.Model` with binary cross-entropy loss.
  """
  inputs = tf.keras.Input(shape=(token_num * embeddings_size,))
  inputs_reshape = tf.keras.layers.Reshape((token_num, embeddings_size))(inputs)
  inputs_pooled = tf.keras.layers.GlobalAveragePooling1D(
      data_format='channels_last')(inputs_reshape)

  hidden = inputs_pooled
  # If no hidden_layer_sizes are provided, model will be a linear probe.
  # Layers are created in the same order as before (Dense, BatchNormalization,
  # Dropout per hidden size) so previously saved weights still line up.
  for size in hidden_layer_sizes or ():
    hidden = tf.keras.layers.Dense(
        size,
        activation='relu',
        kernel_initializer=tf.keras.initializers.HeUniform(seed=seed),
        kernel_regularizer=tf.keras.regularizers.l2(l2=weight_decay),
        bias_regularizer=tf.keras.regularizers.l2(l2=weight_decay))(
            hidden)
    hidden = tf.keras.layers.BatchNormalization()(hidden)
    hidden = tf.keras.layers.Dropout(dropout, seed=seed)(hidden)

  output = tf.keras.layers.Dense(
      units=1,  # Single head for binary classification
      activation='sigmoid',
      kernel_initializer=tf.keras.initializers.HeUniform(seed=seed))(
          hidden)

  model = tf.keras.Model(inputs, output)
  # Use the schedules namespace rather than the deprecated
  # tf.keras.experimental alias of the same class.
  learning_rate_fn = tf.keras.optimizers.schedules.CosineDecay(
      tf.cast(learning_rate, tf.float32),
      tf.cast(decay_steps, tf.float32),
      alpha=tf.cast(end_lr_factor, tf.float32))
  model.compile(
      optimizer=tfm.optimization.lars.LARS(
          learning_rate=learning_rate_fn),
      loss='binary_crossentropy',
      weighted_metrics=[
          tf.keras.metrics.FalsePositives(),
          tf.keras.metrics.FalseNegatives(),
          tf.keras.metrics.TruePositives(),
          tf.keras.metrics.TrueNegatives(),
          tf.keras.metrics.AUC(),
          tf.keras.metrics.AUC(curve='PR', name='auc_pr')])
  return model

In [None]:
# Switch the working directory to the models folder: the weights file is
# loaded later through a path relative to the current working directory.
models_dir = "/content/drive/MyDrive/FinalProject/models/"
%cd {models_dir}

/content/drive/MyDrive/FinalProject/models


In [None]:
# Settings for the classifier that is restored for inference.
DIAGNOSIS = "PNEUMOTHORAX"
TOKEN_NUM = 32
EMBEDDINGS_SIZE = 128

# Rebuild the architecture, then restore the trained weights into it.
model = create_model(
    heads=[DIAGNOSIS],
    token_num=TOKEN_NUM,
    embeddings_size=EMBEDDINGS_SIZE)
# Relative path: resolved against the current working directory.
model.load_weights('./pneumothorax_binary_classification.h5')

In [None]:
# @title Organize the output and display a sample of the predictions

prediction_column = f'{DIAGNOSIS}_prediction'
rows = []

# The loop variable is deliberately not named `embeddings`: reusing that name
# would overwrite the NumPy array loaded at the top of the notebook and break
# any re-run of the reshape cell.
for embedding_batch in test_data.batch(1):
  # Each batch holds a single example; inference mode is requested explicitly.
  score = model(embedding_batch, training=False).numpy().flatten()[0]
  rows.append({prediction_column: score})

eval_df = pd.DataFrame(rows)
eval_df.head()

Unnamed: 0,PNEUMOTHORAX_prediction
0,0.519106
1,0.505226
2,0.543802
3,0.523668
4,0.51997


Effusion: 0.4628321  
Airspace opacity: 0.54899263  
Cardiomegaly: 0.4625166    
Pneumothorax: 0.4998752   
Fracture: 0.47517264


In [None]:
# Decision threshold: scores at or above it are labelled positive (1).
threshold = 0.4998  # Set your desired threshold

# Look the score column up through DIAGNOSIS so this cell always matches the
# column name produced by the prediction cell. The scores are compared in
# float64 so they are not compared against a float32-rounded threshold.
prediction_scores = eval_df[f'{DIAGNOSIS}_prediction'].astype('float64')
eval_df['predicted_label'] = (prediction_scores >= threshold).astype(int)

In [None]:
eval_df.head()

Unnamed: 0,PNEUMOTHORAX_prediction,predicted_label
0,0.519106,1
1,0.505226,1
2,0.543802,1
3,0.523668,1
4,0.51997,1


In [None]:
len(eval_df[eval_df['predicted_label'] == 1])

266

In [None]:
# NOTE(review): this and the other per-diagnosis label cells all read the same
# eval_df['predicted_label'], so a single top-to-bottom run gives every
# diagnosis identical labels. Presumably the model/prediction/threshold cells
# are re-run with the fracture model before this cell — confirm.
fracture = eval_df['predicted_label'].values

In [None]:
# Snapshot of the current binary labels. With DIAGNOSIS set to "PNEUMOTHORAX"
# as in this notebook, eval_df holds the pneumothorax predictions.
# NOTE(review): the sibling label cells read this same column; run only the
# cell that matches the model currently loaded.
pneumothorax = eval_df['predicted_label'].values

In [None]:
# NOTE(review): reads the same eval_df['predicted_label'] as the other
# per-diagnosis label cells. Presumably the model/prediction/threshold cells
# are re-run with the cardiomegaly model before this cell — confirm; otherwise
# these are the labels of whichever model ran last.
cardiomegaly = eval_df['predicted_label'].values

In [None]:
# NOTE(review): reads the same eval_df['predicted_label'] as the other
# per-diagnosis label cells. Presumably the model/prediction/threshold cells
# are re-run with the airspace-opacity model before this cell — confirm;
# otherwise these are the labels of whichever model ran last.
airspace = eval_df['predicted_label'].values

In [None]:
# NOTE(review): reads the same eval_df['predicted_label'] as the other
# per-diagnosis label cells. Presumably the model/prediction/threshold cells
# are re-run with the effusion model before this cell — confirm; otherwise
# these are the labels of whichever model ran last.
effusions = eval_df['predicted_label'].values

In [None]:
effusions

In [None]:
# Assemble one column of binary labels per finding, in a fixed column order.
data = dict(
    fracture=fracture,
    pneumothorax=pneumothorax,
    cardiomegaly=cardiomegaly,
    airspace_opacity=airspace,
    effusion=effusions,
)
res_df = pd.DataFrame(data)

In [None]:
res_df.head()

Unnamed: 0,fracture,pneumothorax,cardiomegaly,airspace_opacity,effusion
0,0,1,0,1,1
1,1,1,0,1,1
2,1,1,0,1,0
3,1,1,0,1,1
4,0,1,0,1,1


In [None]:
# Write the per-finding labels to disk as CSV, leaving out the index column.
csv_file_path = '/content/drive/MyDrive/FinalProject/mock_data/binary_classification_results.csv'
res_df.to_csv(path_or_buf=csv_file_path, index=False)

In [None]:
# For every finding: the label array, the phrase used when the label is 1,
# and the phrase used otherwise. Tuple order fixes the phrase order per image.
finding_phrases = (
    (effusions, "presence of pleural effusion", "no evidence of pleural effusion"),
    (airspace, "presence of airspace opacity", "no evidence of airspace disease"),
    (cardiomegaly, "cardiomegaly", "heart size is normal"),
    (pneumothorax, "pneumothorax is seen", "no pneumothorax"),
    (fracture, "fracture is seen", "no evidence of fracture"),
)

# One list of five phrases per image; the image count comes from `effusions`
# and every label array is indexed by position.
features = [
    [
        when_present if labels[idx] == 1 else when_absent
        for labels, when_present, when_absent in finding_phrases
    ]
    for idx in range(len(effusions))
]

In [None]:
import json

# Serialise the per-image phrase lists to JSON and write them to disk.
with open('/content/drive/MyDrive/FinalProject/mock_data/binary_classification_clinical_features_for_images_1_500.json', 'w') as out_file:
    out_file.write(json.dumps(features))

In [None]:
# Preview only the first few entries; displaying the whole list floods the
# notebook output.
features[:5]

[['presence of pleural effusion',
  'presence of airspace opacity',
  'heart size is normal',
  'pneumothorax is seen',
  'no evidence of fracture'],
 ['presence of pleural effusion',
  'presence of airspace opacity',
  'heart size is normal',
  'pneumothorax is seen',
  'fracture is seen'],
 ['no evidence of pleural effusion',
  'presence of airspace opacity',
  'heart size is normal',
  'pneumothorax is seen',
  'fracture is seen'],
 ['presence of pleural effusion',
  'presence of airspace opacity',
  'heart size is normal',
  'pneumothorax is seen',
  'fracture is seen'],
 ['presence of pleural effusion',
  'presence of airspace opacity',
  'heart size is normal',
  'pneumothorax is seen',
  'no evidence of fracture'],
 ['presence of pleural effusion',
  'presence of airspace opacity',
  'heart size is normal',
  'no pneumothorax',
  'no evidence of fracture'],
 ['presence of pleural effusion',
  'no evidence of airspace disease',
  'cardiomegaly',
  'no pneumothorax',
  'fracture i