In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import ast

import re
import matplotlib.pyplot as plt
from skimage import io

import matplotlib.patches as patches

import cv2
from skimage.transform import resize
from skimage.util import img_as_ubyte
from skimage.io import imread
import torch
from sklearn.model_selection import train_test_split

EMBED_ROOT = Path("/vol/biomedic3/data/EMBED")
VINDR_ROOT = Path("/vol/biomedic3/data/VinDR-Mammo")

The following merged_dfcsv file can be recreated by running the merger_scripy.py file.

In [2]:
dicom = pd.read_csv("csv_files/merged_df.csv", low_memory=False)

vindr_findings = pd.read_csv(
    VINDR_ROOT / "finding_annotations.csv", low_memory=False
)

In [3]:
# XCCL shouldn't be converted to CC so manually editing it
dicom.loc[
    (dicom["SeriesDescription"] == "RXCCL") | (dicom["SeriesDescription"] == "LXCCL"),
    "ViewPosition",
] = "XCCL"

# Getting all rows with "ViewPosition" == Nan (but for which SeriesDescription is also not nan, as these are the ones subject to the data entry error)
view_nan = dicom.loc[
    (dicom.ViewPosition.isna()) & (dicom.SeriesDescription.isna() == False)
]

# Drop these rows from
dicom_no_nans = dicom[~dicom.index.isin(view_nan.index)]

view_nan["ViewPosition"] = view_nan["SeriesDescription"].apply(
    lambda x: "CC" if "CC" in x else ("MLO" if "MLO" in x else None)
)

dicom = pd.concat([dicom_no_nans, view_nan], axis=0, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  view_nan["ViewPosition"] = view_nan["SeriesDescription"].apply(


In [4]:
print(len(dicom))
# Remove any duplicated images
dicom = dicom.drop_duplicates(subset="anon_dicom_path")
# Remove spot compressed and magnified images
dicom = dicom[dicom.spot_mag.isna()]
# Remove invalid views
dicom = dicom[dicom.ViewPosition.isin(["CC", "MLO"])]
dicom = dicom[dicom.FinalImageType.isin(["2D"])]
# Remove images from male clients
dicom = dicom[dicom.PatientSex == "F"]
print(len(dicom))

508846
311738


In [5]:
# Remove any unnecessary fields from the DICOM imagewise dataframe (this may need to be updated in the future if other fields are deemed relevant)
dicom = dicom[
    [
        "empi_anon",
        "acc_anon",
        "image_path",
        "ViewPosition",
        "Manufacturer",
        "ManufacturerModelName",
        'ROI_coords',
        'num_roi',
        'PatientOrientation',
        'Rows',
        'Columns',
        'tissueden',
        'massshape',
        'path_severity',
        'asses',
        'side',
        'massmargin', 
        'massdens', 
        'calcfind',
        'calcdistri', 
        'calcnumber', 
        'otherfind', 
        'implanfind', 
        'numfind', 
    ]
]

vindr_findings = vindr_findings[
    [
        'study_id',
        'series_id',
        'image_id',
        'height',
        'width',
        'breast_birads',
        'breast_density',
        'finding_categories',
        'finding_birads',
        'xmin',
        'ymin',
        'xmax',
        'ymax'
    ]
]

In [6]:
vindr_findings['breast_birads'].value_counts()

breast_birads
BI-RADS 1    13406
BI-RADS 2     4676
BI-RADS 4     1005
BI-RADS 3      972
BI-RADS 5      427
Name: count, dtype: int64

In [7]:
vindr_findings['breast_density'].value_counts()

breast_density
DENSITY C    15695
DENSITY D     2717
DENSITY B     1973
DENSITY A      101
Name: count, dtype: int64

In [8]:
# CHECK BOUNDING BOX COUNTS
vindr_findings['finding_categories'].unique()
# Probably want to check if theres better ways to combine these since theyre quite repetitive
vindr_cat_counts = vindr_findings['finding_categories'].value_counts()
vindr_cat_counts

finding_categories
['No Finding']                                                                                      18232
['Mass']                                                                                             1123
['Suspicious Calcification']                                                                          402
['Focal Asymmetry']                                                                                   232
['Architectural Distortion']                                                                           95
['Asymmetry']                                                                                          90
['Suspicious Calcification', 'Mass']                                                                   82
['Suspicious Lymph Node']                                                                              57
['Skin Thickening']                                                                                    38
['Suspicious Calcification'

In [None]:
# Run this for multi-class filtering
def assign_label(finding_categories):
    if finding_categories == "['No Finding']":
        return 5
    elif finding_categories == "['Mass']":
        return 0
    elif finding_categories == "['Suspicious Calcification']":
        return 1
    elif finding_categories == "['Asymmetry']":
        return 2
    elif finding_categories == "['Architectural Distortion']":
        return 3
    else:
        return None
    
vindr_findings['label'] = vindr_findings['finding_categories'].apply(assign_label)

vindr_findings = vindr_findings.dropna(subset='label')
vindr_findings['label'] = vindr_findings['label'].astype(int)
vindr_findings

In [9]:
# Convert the 'finding_categories' column from string representation of list to actual list
vindr_findings['finding_categories'] = vindr_findings['finding_categories'].apply(eval)

# Combine 'xmin', 'ymin', 'xmax', 'ymax' 
vindr_findings['bbox'] = vindr_findings.apply(
    lambda row: [row['ymin'], row['xmin'], row['ymax'], row['xmax']] if pd.notnull(row[['xmin', 'ymin', 'xmax', 'ymax']]).any() else None,
    axis=1
)

# Drop the individual columns 'xmin', 'ymin', 'xmax', 'ymax'
vindr_findings.drop(['xmin', 'ymin', 'xmax', 'ymax'], axis=1, inplace=True)

Run the following cell to filter VinDr by Mass category, otherwise uncomment the last line to proceed with unfiltered data.

In [10]:
# Remove all bounding boxes that don't bound a mass
def contains_mass_or_suspicious_calcification(category_list):
    return 'Mass' in category_list 

def filter_finding_categories(df):
    #Remove unnecessary categories
    df = df[(df['finding_categories'].apply(contains_mass_or_suspicious_calcification)) | (df['finding_categories'].apply(lambda x: x == ['No Finding']))]
    #Remove findings with birads recorded as None
    df = df[(df['finding_birads'].notna()) | (df['finding_categories'].apply(lambda x: x == ['No Finding']))]
    return df

filtered_vindr = filter_finding_categories(vindr_findings)
# filtered_vindr = vindr_findings


In [13]:
# Find duplicate rows
duplicate_rows = filtered_vindr[filtered_vindr.duplicated(subset=['study_id', 'image_id'], keep=False)]

# Function to aggregate the relevant columns
def aggregate_columns(group):
    agg_dict = {}
    for col in ['finding_categories', 'finding_birads']:
        agg_dict[col] = group[col].apply(lambda x: [x] if not isinstance(x, list) else x).sum()
    # For 'bbox', ensure it is a list of lists
    agg_dict['bbox'] = group['bbox'].apply(lambda x: [x] if not isinstance(x, list) else x).tolist()
    # Uncomment the following line if using multi-class labeling
    # agg_dict['label'] = group['label'].apply(lambda x: x).tolist()

    # Include all other columns, taking the first value 
    for col in group.columns:
        if col not in agg_dict and col not in ['study_id', 'image_id']:
            agg_dict[col] = group[col].iloc[0]
    agg_dict['image_path'] = group['study_id'].iloc[0] + '/' + group['image_id'].iloc[0] + '.png'
    return pd.Series(agg_dict)

# Group duplicate rows by 'study_id' and 'image_id' and apply the aggregation function
collapsed_duplicates = duplicate_rows.groupby(['study_id', 'image_id']).apply(aggregate_columns).reset_index(drop=True)

# Find non-duplicate rows
non_duplicate_rows = filtered_vindr[~filtered_vindr.duplicated(subset=['study_id', 'image_id'], keep=False)].copy()

# Add 'image_path' to non-duplicate rows 
non_duplicate_rows.loc[:, 'image_path'] = non_duplicate_rows.apply(lambda row: row['study_id'] + '/' + row['image_id'] + '.png', axis=1)

# Remove 'study_id' and 'image_id' columns 
non_duplicate_rows = non_duplicate_rows.drop(columns=['study_id', 'image_id'])

# Combine collapsed duplicates with non-duplicate rows
vindr_final = pd.concat([collapsed_duplicates, non_duplicate_rows], ignore_index=True)

In [16]:
# Conversion dictionary to standardised naming of various fields in clincial metadata

# Human reader BIRADS density assessment
dens_conversion = {1.0: "A", 2.0: "B", 3.0: "C", 4.0: "D"}

# Remove cases from cases a valid BIRADS density assessment
df = dicom[dicom.tissueden.isin([1.0, 2.0, 3.0, 4.0])]
df.replace({"tissueden": dens_conversion}, inplace=True)
print(len(df))


274979


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace({"tissueden": dens_conversion}, inplace=True)


In [17]:
df['asses'].value_counts()

asses
N    205509
B     31547
A     26027
P      8658
S      2315
K       650
M       256
X        17
Name: count, dtype: int64

In [18]:
df['tissueden'].value_counts()

tissueden
B    117304
C    111797
A     31697
D     14181
Name: count, dtype: int64

In [20]:
# Logic implemented from https://github.com/Emory-HITI/EMBED_Open_Data/blob/main/Sample_Notebook.ipynb
# These variables are binary tags saying whether the finding is mass, asymetry, arch distortion or calcification
df_limited = df.loc[df.num_roi > 0]

df_limited['mass_finding'] = df_limited['massshape'].isin(['G', 'R', 'O', 'X', 'N', 'Y', 'D', 'L']) | (df_limited['massmargin'].isin(['D', 'U', 'M', 'I', 'S'])) | (df_limited['massdens'].isin(['+', '-', '='])).astype(int)
df_limited['assy_finding'] = df_limited['massshape'].isin(['T', 'B', 'S', 'F', 'V']).astype(int)
df_limited['arch_distortion_finding'] = df_limited['massshape'].isin(['Q', 'A']).astype(int)
df_limited['calcdistri'] = ((~df_limited['calcdistri'].isna()) | (~df_limited['calcfind'].isna()) | (df_limited['calcnumber'] != 0)).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_limited['mass_finding'] = df_limited['massshape'].isin(['G', 'R', 'O', 'X', 'N', 'Y', 'D', 'L']) | (df_limited['massmargin'].isin(['D', 'U', 'M', 'I', 'S'])) | (df_limited['massdens'].isin(['+', '-', '='])).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_limited['assy_finding'] = df_limited['massshape'].isin(['T', 'B', 'S', 'F', 'V']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

In [21]:
# Calculate the sum across the specified columns for each row
df_limited[['massshape']].value_counts()

massshape
F            1318
S             945
O             400
Q             243
G             117
R              49
A              26
Y              21
X              18
V              10
T               4
B               2
Name: count, dtype: int64

In [22]:
df_limited['total_findingsinfo_perrow'] = df_limited[['mass_finding', 'assy_finding', 'arch_distortion_finding', 'calcdistri']].sum(axis=1)
# How many images without any finding description?
df_limited[['total_findingsinfo_perrow']].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_limited['total_findingsinfo_perrow'] = df_limited[['mass_finding', 'assy_finding', 'arch_distortion_finding', 'calcdistri']].sum(axis=1)


total_findingsinfo_perrow
1                            4018
0                             823
2                             133
3                               3
Name: count, dtype: int64

In [23]:
pd.crosstab(df_limited['total_findingsinfo_perrow'], df_limited['num_roi'])

num_roi,1,2,3,4
total_findingsinfo_perrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,776,41,6,0
1,3623,359,36,0
2,118,14,0,1
3,3,0,0,0


In [24]:
# APPLY FILTERING TO EMBED DATA

df_limited = df_limited[df_limited['mass_finding'] > 0]

# Put embed data back together
df_no_roi = df.loc[df.num_roi == 0]
df = pd.concat([df_no_roi, df_limited])

In [25]:
# Rename columns for consistency 

vindr_final['numfind'] = vindr_final['bbox'].apply(lambda x: len(x) if x is not None else 0)

vindr_final['patient_id'] = vindr_final['series_id']
vindr_final = vindr_final.drop(columns=['series_id'])

df = df.drop(columns=['numfind'])
df['numfind'] = df['num_roi']
df = df.drop(columns=['num_roi'])

df['patient_id'] = df['empi_anon']
df = df.drop(columns=['empi_anon'])

df['height'] = df['Rows']
df['width'] = df['Columns']
df['bbox'] = df['ROI_coords']
df = df.drop(columns=['Rows', 'Columns', 'ROI_coords'])

In [None]:
# Execute this cell for multi-class labeling

# Assuming 'mass_finding', 'calcdistri', 'assy_finding', and 'arch_distortion_finding' are the columns in your dataframe
conditions = [
    df['mass_finding'] > 0,
    df['calcdistri'] > 0,
    df['assy_finding'] > 0,
    df['arch_distortion_finding'] > 0
]

choices = [0, 1, 2, 3]

df['label'] = np.select(conditions, choices, default=None)

In [29]:
# Uncomment label for multi-class

df = df[
    [
        'image_path',
        'patient_id',
        'height',
        'width',
        'bbox',
        'numfind',
        # 'label'
    ]
]

vindr_final = vindr_final[
    [
        'image_path',
        'patient_id',
        'height',
        'width',
        'bbox',
        'numfind',
        # 'label'
    ]
]

Function for rescaling bounding box coordinates to 1024x768.

In [30]:
def scale_and_flip_bounding_box(orig_coords, orig_height, orig_width, new_height=1024, new_width=768, flip_horizontal = False, flip_vertical = False):
    """
    Transform bounding box coords to fit on the rescaled DICOM image. 

    Bounding box outputs are returned in form [y1, x1, y2, x2], where [y1, x1] is at the top left
    (ie y1 < y2, and x1 < x2) of image.

    """
    height_decrease = new_height / orig_height
    width_decrease = new_width / orig_width

    scale_factor = min(height_decrease, width_decrease)

    # Rescale the box to be relative to full size image.
    coords = (orig_coords * scale_factor).astype("int").tolist()
    y1, x1, y2, x2 = coords
    # Reflect bbox co-ords based on horizontal or vertical flipping from original patient orientation
    # (Remember indexing starts from 0, so subtract 1 from geometric lengths).
    # Assume output transformation co-ords identical to original, and apply each separately
    y1_new, x1_new, y2_new, x2_new = coords
    # Single reflection will yield (y1 > y2) & (x1 > x2), scale by bbox width to get top-left and bottom-right coords
    bbox_width = abs(x2 - x1)
    if flip_horizontal:
        x1_new = new_width - 1 - x1 - bbox_width
        x2_new = new_width - 1 - x2 + bbox_width
    if flip_vertical:
        y1_new = new_height - 1 - y1 - bbox_width
        y2_new = new_height - 1 - y2 + bbox_width

    # Convert to bbox format xywh
    # y0, x0 = y1_new, x1_new
    # width = x2_new - x1_new
    # height = y2_new - y1_new

    return x1_new, y1_new, x2_new, y2_new

def convert_str_bbox_to_numpy(coords, embed):
    if embed:
        return np.array(ast.literal_eval(coords.replace('(', '[').replace(')', ']').replace(' ','')))
    else:
        return np.array(ast.literal_eval(coords.replace(' ','')))

def process_bboxes(bbox_str, image_height, image_width, embed=True):
    bboxes = convert_str_bbox_to_numpy(bbox_str, embed)
    processed_bboxes = []
    for bbox in bboxes:
        x1, y1, x2, y2 = scale_and_flip_bounding_box(bbox, image_height, image_width)
        processed_bboxes.append([x1, y1, x2, y2])
    return processed_bboxes


In [31]:
df_with_finds = df[df['numfind'] > 0]
df_no_finds = df[df['numfind'] == 0]
vindr_with_finds = vindr_final[vindr_final['numfind'] > 0]
vindr_no_finds = vindr_final[vindr_final['numfind'] == 0]

In [32]:
# Add label back in for multi-class

embdata_df = pd.DataFrame(columns=['image_path', 'patient_id', 'numfind', 'bbox', 'height', 'width']) #, 'label'])
vindr_df = pd.DataFrame(columns=['image_path', 'patient_id', 'numfind', 'bbox', 'height', 'width']) #, 'label'])

for _, row in df_with_finds.iterrows():
    
    try:
        bboxes = process_bboxes(row['bbox'], row['height'], row['width'])
    except:
        cleaned_bbox = row['bbox'].replace('[', '').replace(']', '')

    new_row = {
        'image_path': row['image_path'],
        'patient_id': row['patient_id'],
        'numfind': row['numfind'],
        'bbox': bboxes,
        'height': row['height'],
        'width': row['width'],
        # 'label': row['label']
    }

    embdata_df.loc[len(embdata_df)] = new_row


for _, row in vindr_with_finds.iterrows():
    
    try:
        bboxes = process_bboxes(str(row['bbox']), row['height'], row['width'], embed=False)
    except:
        bboxes = process_bboxes(str([row['bbox']]), row['height'], row['width'], embed=False)

    new_row = {
        'image_path': row['image_path'],
        'patient_id': row['patient_id'],
        'numfind': row['numfind'],
        'bbox': bboxes,
        'height': row['height'],
        'width': row['width'],
        # 'label': row['label'],
    }

    vindr_df.loc[len(vindr_df)] = new_row

In [33]:
df_no_finds['image_path'] = str(EMBED_ROOT) + '/' + 'images/png/1024x768/' + df_no_finds['image_path']
embdata_df['image_path'] = str(EMBED_ROOT) + '/' + 'images/png/1024x768/' + embdata_df['image_path']
vindr_no_finds['image_path'] = str(VINDR_ROOT) + '/' + 'pngs/' + vindr_no_finds['image_path']
vindr_df['image_path'] = str(VINDR_ROOT) + '/' + 'pngs/' + vindr_df['image_path']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_finds['image_path'] = str(EMBED_ROOT) + '/' + 'images/png/1024x768/' + df_no_finds['image_path']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vindr_no_finds['image_path'] = str(VINDR_ROOT) + '/' + 'pngs/' + vindr_no_finds['image_path']


In [35]:
# Run to ensure numfind value is correct
embdata_df['numfind'] = embdata_df['bbox'].apply(len)

In [36]:
# Set this value to desired negative-to-positive sample ratio
neg_sam_ratio = 1

sampled_embed_df = df_no_finds.sample(n=len(embdata_df)*neg_sam_ratio, random_state=42)
sampled_vindr_df = vindr_no_finds.sample(n=len(vindr_df)*neg_sam_ratio, random_state=42)

embed_data = pd.concat([sampled_embed_df, embdata_df])
vindr_data = pd.concat([sampled_vindr_df, vindr_df])

In [37]:
# Split data by patient ID
unique_patient_ids_embed = embed_data['patient_id'].unique()
unique_patient_ids_vindr = vindr_data['patient_id'].unique()

# train = 0.7, val = 0.2, test = 0.1
train_ids_embed, val_test_ids_embed = train_test_split(unique_patient_ids_embed, test_size=0.3, random_state=42)
val_ids_embed, test_ids_embed = train_test_split(val_test_ids_embed, test_size=0.33)
train_ids_vindr, val_test_ids_vindr = train_test_split(unique_patient_ids_vindr, test_size=0.3, random_state=42)
val_ids_vindr, test_ids_vindr = train_test_split(val_test_ids_vindr, test_size=0.33)

train_embed = embed_data[embed_data['patient_id'].isin(train_ids_embed)]
val_embed = embed_data[embed_data['patient_id'].isin(val_ids_embed)]
test_embed = embed_data[embed_data['patient_id'].isin(test_ids_embed)]

train_vindr = vindr_data[vindr_data['patient_id'].isin(train_ids_vindr)]
val_vindr= vindr_data[vindr_data['patient_id'].isin(val_ids_vindr)]
test_vindr = vindr_data[vindr_data['patient_id'].isin(test_ids_vindr)]

In [None]:
# Create csv files from our train, val, test dataframes
train_embed.to_csv('../retinanet/csv_files/unfiltered_embvind_1:1/train_em.csv', index=False)
val_embed.to_csv('../retinanet/csv_files/unfiltered_embvind_1:1/val_em.csv', index=False)
test_embed.to_csv('../retinanet/csv_files/unfiltered_embvind_1:1/test_em.csv', index=False)

train_vindr.to_csv('../retinanet/csv_files/unfiltered_embvind_1:1/train_em.csv', index=False)
val_vindr.to_csv('../retinanet/csv_files/unfiltered_embvind_1:1/val_em.csv', index=False)
test_vindr.to_csv('../retinanet/csv_files/unfiltered_embvind_1:1/test_em.csv', index=False)