# DATASET CURATION - MASKED ROI PROJECT


**Objectives**:

To create the following groups:
1. **Positive group**: BIRADS 0 that became BIRADS 3, 4, 5, 6 in the subsequent diagnostic study
2. **Negative group**: BIRADS 1, 2 and BIRADS 0 that became BIRADS 1, 2 in the subsequent diagnostic study


## 1. Prep

In [None]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from IPython.display import display

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 500)

In [7]:
def get_stats(df, suffix=None):
    """Provides a quick summary of a dataframe."""
    try:
        print(f"DF shape: {df.shape}")
        print(f"# Patients: {df.empi_anon.nunique()}")
        print(f"# Cases: {df.acc_anon.nunique()}\n")
        print(f"# Images: {df.anon_dicom_path.nunique()}\n")
    except Exception as e:
        print(e)

In [None]:
# EMBED
magview_path = "/content/EMBED_OpenData_clinical.csv"
metadata_path = "/content/EMBED_OpenData_metadata.csv"


metadata_full = pd.read_csv(metadata_path, dtype=str)
magview_full = pd.read_csv(magview_path, dtype=str)

In [None]:
# Selecting the following columns

meta_cols = [
    "empi_anon",
    "acc_anon",
    "ImageLateralityFinal",
    "ViewPosition",
    "study_date_anon",
    "FinalImageType",
    "anon_dicom_path",
    "png_path",
    "StudyDescription",
    "ProtocolName",
    "match_level",
    "num_roi",
    "ROI_coords",
    "BreastImplantPresent",
]

mag_cols = [
    "empi_anon",
    "acc_anon",
    "study_date_anon",
    "desc",
    "side",
    "asses",
    "path_severity",
    "bside",
    "procdate_anon",
    "pdate_anon",
]

In [None]:
metadata = metadata_full[meta_cols].copy()
magview = magview_full[mag_cols].copy()

In [None]:
metadata.study_date_anon = pd.to_datetime(metadata.study_date_anon, errors="coerce")
magview.study_date_anon = pd.to_datetime(magview.study_date_anon, errors="coerce")

In [None]:
# follow_up_period = metadata.groupby('empi_anon')['study_date_anon'].agg(['min', 'max'])
# follow_up_period['duration_years'] = (follow_up_period['max'] - follow_up_period['min']).dt.days / 365.25
# patients_with_5_years  = follow_up_period[follow_up_period['duration_years'] >= 4 ]

# Step 5: Get the patient IDs (empi_anon) that meet the 5-year criteria
# valid_patients = patients_with_5_years.index

# metadata  = metadata[metadata['empi_anon'].isin(valid_patients)].reset_index(drop=True)
# get_stats(metadata)

In [None]:
metadata["num_roi"] = metadata["num_roi"].fillna(0).astype(int)

metadata.num_roi = metadata.num_roi.astype(int)

## 2. METADATA: 2D MLO & CC

In [None]:
# EMBED 2D (MLO and CC)
meta_2d = metadata.loc[
    (metadata.FinalImageType == "2D") & (metadata.ViewPosition.isin(["MLO", "CC"]))
]
get_stats(meta_2d)

DF shape: (328961, 14)
# Patients: 22455
# Cases: 70861

# Images: 328961



In [None]:
def get_image_stats(df):
    """Provides a quick summary of the number of unique images and the ROIs."""
    temp_df = pd.merge(df, meta_2d, on=["empi_anon", "acc_anon"], how="left")
    temp_df = temp_df.loc[(temp_df.side == temp_df.ImageLateralityFinal)]
    temp_df.drop_duplicates(subset="png_path", inplace=True)
    print(f"# PNG PATH: {int(temp_df.png_path.nunique())}")
    print(f"# ROI: {int(temp_df.num_roi.sum())}")
    print(f"{temp_df.num_roi.value_counts()}")
    del temp_df

## 3. Screening

In [None]:
# SCREENING
screening_magview = magview.loc[magview.desc.str.contains("screen", case=False)].copy()
get_stats(screening_magview)

DF shape: (58888, 10)
# Patients: 20460
# Cases: 55956

'DataFrame' object has no attribute 'png_path'


### 3.1. Creating entries for the negative contralateral breast in bilateral examinations

```
MAGVIEW only has entries if a finding exists.

This means that if an exam is a bilateral exam and only one of the breast has a finding, the contralateral breast (negative) won't have an entry.

This would be problematic at the time when we need to merge with METADATA, because the contralateral breast would be excluded.

Therefore, we would need to create rows for the negative contralateral breast.
```

In [None]:
def get_exam_laterality(row):
    """A convenient function to get the exam laterality to be used with DF.apply() instead of iterating over each row."""
    if "bilat" in row.desc.lower():
        return "B"
    elif "left" in row.desc.lower():
        return "L"
    elif "right" in row.desc.lower():
        return "R"
    else:
        return None

In [None]:
# Applying the get_exam_laterality function
screening_magview["exam_laterality"] = screening_magview.apply(
    get_exam_laterality, axis=1
)

In [None]:
screening_magview.exam_laterality.value_counts(dropna=False)

Unnamed: 0_level_0,count
exam_laterality,Unnamed: 1_level_1
B,56558
L,1180
R,1150


In [None]:
screening_magview.side.value_counts(dropna=False)

Unnamed: 0_level_0,count
side,Unnamed: 1_level_1
,39661
L,8264
R,8082
B,2881


In [None]:
# side == nan --> B
screening_magview.side = screening_magview.side.fillna("B")

In [None]:
# create copy for assigning B to R
screening_magview_r = screening_magview.loc[screening_magview.side == "B"].copy()
screening_magview_r.side = screening_magview.side.str.replace("B", "R")

# assigning B to L
screening_magview.side = screening_magview.side.str.replace("B", "L")

# appending R and L
screening_magview = pd.concat([screening_magview, screening_magview_r])

In [None]:
print(screening_magview.side.value_counts(dropna=False))
print(screening_magview.shape)

side
L    50806
R    50624
Name: count, dtype: int64
(101430, 11)


In [None]:
screening_magview = screening_magview.sort_values(
    ["empi_anon", "acc_anon", "study_date_anon"]
).drop_duplicates()
screening_magview

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
31489,10000879,6992096043050201,2018-02-16,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
31489,10000879,6992096043050201,2018-02-16,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
10198,10009146,4190527469809995,2014-07-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
10198,10009146,4190527469809995,2014-07-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
20474,10015693,1334581155737139,2015-10-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,A,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
21119,99996622,9655172659462321,2016-06-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
25708,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
25708,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
31493,99999564,8832872399780580,2019-02-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B


In [None]:
exam_lat_b = screening_magview.loc[screening_magview.exam_laterality == "B"]
exam_lat_b.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
4944,74149616,9910167524120421,2014-09-28,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
50818,95378407,2439012505154554,2015-03-08,MG Screening Bilateral,R,N,,,,,B


In [None]:
# We want to aggregate all the sides for each bilateral exam so that we can filter those having only a single side.
exam_lat_b_agg = exam_lat_b.groupby("acc_anon")["side"].apply("".join).reset_index()
exam_lat_b_agg.sample(2)

Unnamed: 0,acc_anon,side
26037,5330330563068999,LR
15500,3567917562128811,LR


In [None]:
exam_lat_b_agg.side.value_counts()

Unnamed: 0_level_0,count
side,Unnamed: 1_level_1
LR,42769
L,4938
R,4882
RL,615
LL,123
RR,114
LLR,75
LRR,43
RLR,25
LLRR,20


In [None]:
exam_lat_b_side_r = exam_lat_b_agg.loc[~(exam_lat_b_agg.side.str.contains("L"))].copy()
exam_lat_b_side_l = exam_lat_b_agg.loc[~(exam_lat_b_agg.side.str.contains("R"))].copy()

In [None]:
screening_magview_right_to_left = (
    screening_magview.loc[screening_magview.acc_anon.isin(exam_lat_b_side_r.acc_anon)]
    .copy()
    .drop_duplicates()
)
screening_magview_left_to_right = (
    screening_magview.loc[screening_magview.acc_anon.isin(exam_lat_b_side_l.acc_anon)]
    .copy()
    .drop_duplicates()
)

In [None]:
# Creating the negative Left side
screening_magview_right_to_left.loc[
    screening_magview_right_to_left.side == "R", "side"
] = "L"
screening_magview_right_to_left.loc[
    screening_magview_right_to_left.side == "L", "asses"
] = "N"
screening_magview_right_to_left.loc[
    screening_magview_right_to_left.side == "L", "path_severity"
] = np.nan

screening_magview_right_to_left

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
37325,10033806,1069386741434572,2019-10-05,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
64534,10043985,1960584382049532,2018-04-18,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
59088,10043985,3613575521057039,2017-03-01,MG Screening Bilateral,L,N,,,,,B
42847,10043985,9492972692582499,2014-05-14,MG Screening Bilateral,L,N,,,,,B
8607,10065082,6346759651734606,2015-03-03,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
11436,99853035,2905584160156737,2015-02-15,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
33776,99853035,6677454260490853,2019-02-07,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
67642,99860105,6470240272862407,2018-03-19,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
12820,99871644,5176060292067455,2015-06-26,MG Screening Bilateral,L,N,,,,,B


In [None]:
# Creating the negative Right side
screening_magview_left_to_right.loc[
    screening_magview_left_to_right.side == "L", "side"
] = "R"
screening_magview_left_to_right.loc[
    screening_magview_left_to_right.side == "R", "asses"
] = "N"
screening_magview_left_to_right.loc[
    screening_magview_left_to_right.side == "R", "path_severity"
] = np.nan

screening_magview_left_to_right

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
20474,10015693,1334581155737139,2015-10-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
25070,10023113,5135241747022662,2016-10-05,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
63407,10029585,3189592535497441,2017-06-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
62320,10042753,1955284757719450,2017-06-15,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
37004,10044241,3993319361430024,2019-07-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
80934,99881569,1140879824262422,2021-01-05,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
43901,99881569,3921887412575009,2013-11-02,MG Screening Bilateral w/CAD,R,N,,,,,B
9588,99908618,9288525074493489,2014-08-14,MG Screening Bilateral w/CAD,R,N,,L,2014-08-26,2014-08-27 00:00:00,B
6519,99957941,2224428804635608,2014-06-03,MG Screening Bilateral,R,N,,,,,B


In [None]:
# Merging the original and the two negative contralaterals
screening_magview_with_contralat = (
    pd.concat(
        [
            screening_magview,
            screening_magview_left_to_right,
            screening_magview_right_to_left,
        ]
    )
    .sort_values(["empi_anon", "acc_anon", "study_date_anon"])
    .drop_duplicates()
)
screening_magview_with_contralat.sample(2)
screening_magview_with_contralat.to_csv(
    "/content/EMBED_OpenData_magview_with_controlateral.csv", index=False
)

In [None]:
get_stats(meta_2d)

DF shape: (328961, 14)
# Patients: 22455
# Cases: 70861

# Images: 328961



In [None]:
get_stats(screening_magview_with_contralat)
display(screening_magview_with_contralat)

DF shape: (110396, 11)
# Patients: 20460
# Cases: 55956

'DataFrame' object has no attribute 'png_path'


Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality
31489,10000879,6992096043050201,2018-02-16,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
31489,10000879,6992096043050201,2018-02-16,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
10198,10009146,4190527469809995,2014-07-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
10198,10009146,4190527469809995,2014-07-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
20474,10015693,1334581155737139,2015-10-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,A,,,,,B
...,...,...,...,...,...,...,...,...,...,...,...
21119,99996622,9655172659462321,2016-06-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
25708,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B
25708,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B
31493,99999564,8832872399780580,2019-02-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B


In [None]:
get_image_stats(screening_magview_with_contralat)

# PNG PATH: 258269
# ROI: 5255
num_roi
0.0    253498
1.0      4330
2.0       399
3.0        41
4.0         1
Name: count, dtype: int64


### 3.2. BIRADS 0

In [None]:
b0 = screening_magview_with_contralat.loc[
    screening_magview_with_contralat.asses.isin(["A"])
]

get_stats(b0)
get_image_stats(b0)

DF shape: (10876, 11)
# Patients: 7747
# Cases: 8829

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 24387
# ROI: 5131
num_roi
0.0    19725
1.0     4232
2.0      392
3.0       37
4.0        1
Name: count, dtype: int64


### 3.3. BIRADS 1, 2

In [None]:
b12 = screening_magview_with_contralat.loc[
    screening_magview_with_contralat.asses.isin(["B", "N"])
]

get_stats(b12)
get_image_stats(b12)

DF shape: (99482, 11)
# Patients: 19665
# Cases: 54081

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 234362
# ROI: 192
num_roi
0.0    234192
1.0       153
2.0        12
3.0         5
Name: count, dtype: int64


## 4. Diagnostic

In [None]:
diag_magview = magview.loc[magview.desc.str.contains("diag", case=False)]

get_stats(diag_magview)
print()
print(f"Asses Counts:\n{diag_magview.asses.value_counts()}")

DF shape: (22888, 10)
# Patients: 9656
# Cases: 16814

'DataFrame' object has no attribute 'png_path'

Asses Counts:
asses
B    8794
P    5563
N    4193
S    3063
A     580
K     386
M     284
X      25
Name: count, dtype: int64


## 5. Screening BIRADS 0 and Diagnostic

In [None]:
b0_dx = pd.merge(b0, diag_magview, on="empi_anon", suffixes=[None, "_dx"])
b0_dx = b0_dx.loc[
    (b0_dx.side == b0_dx.side_dx) | (b0_dx.side_dx == "B") | (b0_dx.side_dx.isna())
]

In [None]:
# Getting only subsequent diagnostic studies within 3 months
b0_dx["delta_date_dx"] = (b0_dx.study_date_anon_dx - b0_dx.study_date_anon).dt.days
b0_dx_3mo = b0_dx.loc[b0_dx.delta_date_dx.isin(range(0, 91))]
b0_dx_3mo.sample(1)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx
14786,74214316,1279925568980544,2013-03-27,MG Screening Bilateral w/CAD,L,A,2.0,L,2013-04-17,2013-04-17 00:00:00,B,5095691490562933,2013-04-11,MG Diagnostic Left,L,S,2.0,L,2013-04-17,2013-04-17 00:00:00,15


### 5.1. BIRADS 0 (Screening) --> BIRADS 1, 2 (Diagnostic)

In [None]:
b0_12dx = b0_dx_3mo.loc[b0_dx_3mo.asses_dx.isin(["N", "B"])].copy()
get_stats(b0_12dx)
get_image_stats(b0_12dx)

DF shape: (3755, 21)
# Patients: 2924
# Cases: 3169

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 7695
# ROI: 1676
num_roi
0.0    6212
1.0    1306
2.0     162
3.0      14
4.0       1
Name: count, dtype: int64


### 5.2. BIRADS 0 (Screening) --> BIRADS 3, 4, 5, 6 (Diagnostic)

In [None]:
b0_3456dx = magview.loc[
    magview.asses.isin(["K"]) | magview.path_severity.isin([0, 1])
].copy()

get_stats(b0_3456dx)
get_image_stats(b0_3456dx)
display(b0_3456dx)

DF shape: (395, 10)
# Patients: 216
# Cases: 313

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 806
# ROI: 9
num_roi
0.0    798
1.0      7
2.0      1
Name: count, dtype: int64


Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon
248,41737961,3103295389430403,2013-01-10,MG Diagnostic Mammo Bilateral,R,K,,,,
249,41737961,3103295389430403,2013-01-10,MG Diagnostic Mammo Bilateral,R,K,,,,
787,82818555,9133832708678084,2013-11-10,MG Diagnostic Mammo Bilateral,R,K,,,,
1151,12628486,8298312705312234,2013-09-05,MG Diagnostic Right,R,K,0.0,R,2013-10-10,2013-10-10 00:00:00
1372,75428728,3608864100750486,2014-01-30,MG Diagnostic Left,L,K,,,,
...,...,...,...,...,...,...,...,...,...,...
81451,87877516,2183052495313832,2021-01-09,MG Diagnostic Bilateral w/Tomo/CAD,L,K,0.0,L,2021-02-15,2021-02-22 00:00:00
81452,87877516,2183052495313832,2021-01-09,MG Diagnostic Bilateral w/Tomo/CAD,L,K,0.0,L,2021-02-15,2021-02-22 00:00:00
81453,87877516,2183052495313832,2021-01-09,MG Diagnostic Bilateral w/Tomo/CAD,L,K,0.0,L,2021-02-15,2021-02-22 00:00:00
81558,99703607,1965937622916299,2019-12-08,MG Diagnostic Right w/CAD,R,K,0.0,R,2019-12-29,2020-01-01 00:00:00


## 6. Negative group

In [None]:
# Negative group = BIRADS_12 + BIRADS_0_12dx
neg_group = pd.concat([b12, b0_12dx])
neg_group.drop_duplicates(inplace=True)

get_stats(neg_group)
get_image_stats(neg_group)

DF shape: (103023, 21)
# Patients: 19877
# Cases: 54639

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 241885
# ROI: 1860
num_roi
0.0    240240
1.0      1451
2.0       174
3.0        19
4.0         1
Name: count, dtype: int64


In [None]:
# Include only ones with negative follow-up after 1 year
neg_group_b12 = pd.merge(neg_group, b12, on=["empi_anon"], suffixes=(None, "_1yrfu"))

neg_group_b12 = neg_group_b12.loc[(neg_group_b12.side == neg_group_b12.side_1yrfu)]

neg_group_b12["delta_date_1yrfu"] = (
    neg_group_b12.study_date_anon_1yrfu - neg_group_b12.study_date_anon
).dt.days

get_stats(neg_group_b12)
get_image_stats(neg_group_b12)

neg_group_b12.sample(2)

DF shape: (419008, 32)
# Patients: 19665
# Cases: 54427

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 239895
# ROI: 1287
num_roi
0.0    238741
1.0      1034
2.0       108
3.0        11
4.0         1
Name: count, dtype: int64


Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu
389836,53477169,4444852259333890,2017-07-18,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,8036531341828254,2019-01-02,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,533
679030,85642901,5319540478526708,2019-10-09,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,,NaT,,,,,,,,,4564119800495999,2015-05-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,-1598


In [None]:
neg_group_1yrfu = neg_group_b12.loc[(neg_group_b12.delta_date_1yrfu > 360)]
get_stats(neg_group_1yrfu)
get_image_stats(neg_group_1yrfu)

DF shape: (158091, 32)
# Patients: 11590
# Cases: 34180

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 150887
# ROI: 690
num_roi
0.0    150266
1.0       563
2.0        48
3.0         9
4.0         1
Name: count, dtype: int64


In [None]:
neg_group_1yrfu_first_study = neg_group_1yrfu.sort_values(
    ["empi_anon", "acc_anon", "study_date_anon_1yrfu"]
).drop_duplicates(subset=["acc_anon", "side"])  # to only get the first followup study
get_stats(neg_group_1yrfu_first_study)
get_image_stats(neg_group_1yrfu_first_study)

DF shape: (63835, 32)
# Patients: 11590
# Cases: 34180

'DataFrame' object has no attribute 'png_path'
# PNG PATH: 150887
# ROI: 690
num_roi
0.0    150266
1.0       563
2.0        48
3.0         9
4.0         1
Name: count, dtype: int64


In [None]:
neg_group_1yrfu_first_study.path_severity.value_counts()

Unnamed: 0_level_0,count
path_severity,Unnamed: 1_level_1
4.0,25
2.0,7
0.0,4
3.0,1


In [None]:
# Exclude any patient with any biopsy result
neg_group_1yrfu_first_study_no_biopsy = neg_group_1yrfu_first_study.loc[
    neg_group_1yrfu_first_study.path_severity.isna()
].copy()

In [None]:
# Merging with METADATA to get the images
neg_group_1yrfu_first_study_no_biopsy_images = pd.merge(
    neg_group_1yrfu_first_study_no_biopsy,
    meta_2d,
    on=["empi_anon", "acc_anon", "study_date_anon"],
)
neg_group_1yrfu_first_study_no_biopsy_images = (
    neg_group_1yrfu_first_study_no_biopsy_images.loc[
        (
            neg_group_1yrfu_first_study_no_biopsy_images.side
            == neg_group_1yrfu_first_study_no_biopsy_images.ImageLateralityFinal
        )
    ]
)
neg_group_1yrfu_first_study_no_biopsy_images.drop_duplicates(
    subset="png_path", inplace=True
)
get_stats(neg_group_1yrfu_first_study_no_biopsy_images)

DF shape: (149727, 43)
# Patients: 11243
# Cases: 33180

# Images: 149727



In [None]:
print(f"ROIs = {neg_group_1yrfu_first_study_no_biopsy_images.num_roi.sum()}")
print(neg_group_1yrfu_first_study_no_biopsy_images.num_roi.value_counts())

ROIs = 678
num_roi
0    149116
1       555
2        46
3         9
4         1
Name: count, dtype: int64


## 7. Positive Group

In [None]:
pos_group_images = pd.merge(
    b0_3456dx, meta_2d, on=["empi_anon", "acc_anon", "study_date_anon"]
)
pos_group_images = pos_group_images.loc[
    (pos_group_images.side == pos_group_images.ImageLateralityFinal)
]
pos_group_images.drop_duplicates(subset="anon_dicom_path", inplace=True)
get_stats(pos_group_images)
display(pos_group_images)

DF shape: (806, 21)
# Patients: 209
# Cases: 303

# Images: 806



Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,ImageLateralityFinal,ViewPosition,FinalImageType,anon_dicom_path,png_path,StudyDescription,ProtocolName,match_level,num_roi,ROI_coords,BreastImplantPresent
1,41737961,3103295389430403,2013-01-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/41737961/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RCC,[],0,(),NO
4,41737961,3103295389430403,2013-01-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/41737961/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RMLO,[],0,(),NO
5,41737961,3103295389430403,2013-01-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/41737961/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RMLO,[],0,(),NO
12,82818555,9133832708678084,2013-11-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/82818555/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RCC,[],0,(),NO
13,82818555,9133832708678084,2013-11-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/82818555/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RMCC,[],0,(),NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1492,87877516,2183052495313832,2021-01-09,MG Diagnostic Bilateral w/Tomo/CAD,L,K,0.0,L,2021-02-15,2021-02-22 00:00:00,L,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/87877516/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Diagnostic Bilateral w/Tomo/CAD,L MLO ComboHD,[],0,(),NO
1509,99703607,1965937622916299,2019-12-08,MG Diagnostic Right w/CAD,R,K,0.0,R,2019-12-29,2020-01-01 00:00:00,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/99703607/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Diagnostic Right w/CAD,R CC,[],0,(),NO
1510,99703607,1965937622916299,2019-12-08,MG Diagnostic Right w/CAD,R,K,0.0,R,2019-12-29,2020-01-01 00:00:00,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/99703607/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Diagnostic Right w/CAD,R MLO,[],0,(),NO
1511,85936969,4786838232916669,2019-09-10,MG Diagnostic Right w/CAD,R,K,0.0,R,2019-10-11,2019-10-15 00:00:00,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/85936969/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Diagnostic Right w/CAD,3D_ROUTINE+2D_ROUTINE,[],0,(),NO


In [None]:
print(f"ROIs  = {pos_group_images.num_roi.sum()}")
print(pos_group_images.num_roi.value_counts())

ROIs  = 9
num_roi
0    798
1      7
2      1
Name: count, dtype: int64


## 8. Excluding Images from the Negative Group that are found in the Positive Group using acc_anon and side

In [None]:
# Merge negatives and positive groups
neg_pos = pd.merge(
    neg_group_1yrfu_first_study_no_biopsy_images,
    pos_group_images,
    on=["empi_anon", "acc_anon", "side"],
    suffixes=["_neg", "_pos"],
)

In [None]:
# Create new KeyID of acc_anon + side on negative group and negative+positive group
neg_pos["acc_anon_side"] = neg_pos.acc_anon + neg_pos.side

In [None]:
neg_group_1yrfu_first_study_no_biopsy_images["acc_anon_side"] = (
    neg_group_1yrfu_first_study_no_biopsy_images.acc_anon
    + neg_group_1yrfu_first_study_no_biopsy_images.side
)
neg_group_1yrfu_first_study_no_biopsy_images.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,anon_dicom_path,png_path,StudyDescription,ProtocolName,match_level,num_roi,ROI_coords,BreastImplantPresent,acc_anon_side
191619,68257398,6851799473747062,2017-04-04,MG Screening Right w/Tomo/CAD,R,N,,,,,R,,NaT,,,,,,,,,2344311523037299,2018-11-03,MG Screening Right w/Tomo/CAD,R,N,,,,,R,578,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/68257398/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Right w/Tomo/CAD,R MLO ComboHD,[],0,(),NO,6851799473747062R
103681,41789598,7175903704021652,2019-02-15,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,4013836104088884,2020-05-01,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,441,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/41789598/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R MLO ComboHD,[],0,(),NO,7175903704021652R


In [None]:
# Removing any images that are found in the positive group from the negative group using the created KeyID (acc_anon+side)
neg_group_final = neg_group_1yrfu_first_study_no_biopsy_images.loc[
    ~neg_group_1yrfu_first_study_no_biopsy_images.acc_anon_side.isin(
        neg_pos.acc_anon_side
    )
]
neg_group_final.sample(2)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,anon_dicom_path,png_path,StudyDescription,ProtocolName,match_level,num_roi,ROI_coords,BreastImplantPresent,acc_anon_side
63891,29634697,2990999232173615,2016-07-22,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,,NaT,,,,,,,,,6070072192773336,2017-07-29,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,372,L,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/29634697/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L MLO ComboHD,[],0,(),NO,2990999232173615L
109261,43449548,2461020309035025,2016-12-09,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,R,2016-12-26,2016-12-31 00:00:00,B,,NaT,,,,,,,,,8862669239623990,2018-07-28,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,596,L,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/43449548/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L CC ComboHD,[],0,(),NO,2461020309035025L


In [None]:
get_stats(neg_group_final)

print(f"ROIs  = {neg_group_final.num_roi.sum()}")
print(neg_group_final.num_roi.value_counts())

DF shape: (149727, 44)
# Patients: 11243
# Cases: 33180

# Images: 149727

ROIs  = 678
num_roi
0    149116
1       555
2        46
3         9
4         1
Name: count, dtype: int64


## 9. Saving and Exporting

In [None]:
columns_to_save = [
    "empi_anon",
    "acc_anon",
    "anon_dicom_path",
    "desc",
    "ProtocolName",
    "asses",
    "path_severity",
    "study_date_anon",
    "side",
    "ImageLateralityFinal",
    "bside",
    "ViewPosition",
    "match_level",
    "num_roi",
    "ROI_coords",
]

In [None]:
get_stats(neg_group_final)
get_stats(pos_group_images)
has_overlap = neg_group_final["empi_anon"].isin(pos_group_images["empi_anon"]).any()
get_stats(pos_group_images)
if has_overlap:
    print("There are common patients.")
neg_group_final_filtered = neg_group_final[
    ~neg_group_final["empi_anon"].isin(pos_group_images["empi_anon"])
]
get_stats(neg_group_final_filtered)
get_stats(pos_group_images)
display(pos_group_images)

DF shape: (149727, 44)
# Patients: 11243
# Cases: 33180

# Images: 149727

DF shape: (806, 21)
# Patients: 209
# Cases: 303

# Images: 806

DF shape: (806, 21)
# Patients: 209
# Cases: 303

# Images: 806

There are common patients.
DF shape: (149438, 44)
# Patients: 11204
# Cases: 33094

# Images: 149438

DF shape: (806, 21)
# Patients: 209
# Cases: 303

# Images: 806



Unnamed: 0,empi_anon,acc_anon,cancer_diagnosis_date,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,ImageLateralityFinal,ViewPosition,FinalImageType,anon_dicom_path,png_path,StudyDescription,ProtocolName,match_level,num_roi,ROI_coords,BreastImplantPresent
1,41737961,3103295389430403,2013-01-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/41737961/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RCC,[],0,(),NO
4,41737961,3103295389430403,2013-01-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/41737961/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RMLO,[],0,(),NO
5,41737961,3103295389430403,2013-01-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/41737961/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RMLO,[],0,(),NO
12,82818555,9133832708678084,2013-11-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/82818555/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RCC,[],0,(),NO
13,82818555,9133832708678084,2013-11-10,MG Diagnostic Mammo Bilateral,R,K,,,,,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/82818555/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Diagnostic Mammo Bilateral,RMCC,[],0,(),NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1492,87877516,2183052495313832,2021-01-09,MG Diagnostic Bilateral w/Tomo/CAD,L,K,0.0,L,2021-02-15,2021-02-22 00:00:00,L,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/87877516/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Diagnostic Bilateral w/Tomo/CAD,L MLO ComboHD,[],0,(),NO
1509,99703607,1965937622916299,2019-12-08,MG Diagnostic Right w/CAD,R,K,0.0,R,2019-12-29,2020-01-01 00:00:00,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/99703607/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Diagnostic Right w/CAD,R CC,[],0,(),NO
1510,99703607,1965937622916299,2019-12-08,MG Diagnostic Right w/CAD,R,K,0.0,R,2019-12-29,2020-01-01 00:00:00,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/99703607/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Diagnostic Right w/CAD,R MLO,[],0,(),NO
1511,85936969,4786838232916669,2019-09-10,MG Diagnostic Right w/CAD,R,K,0.0,R,2019-10-11,2019-10-15 00:00:00,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/85936969/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Diagnostic Right w/CAD,3D_ROUTINE+2D_ROUTINE,[],0,(),NO


In [None]:
# remove patients with breast implantat
# Find the patient IDs (empi_anon) with a breast implant
patients_with_implants = neg_group_final_filtered[
    neg_group_final_filtered["BreastImplantPresent"] == "YES"
]["empi_anon"].unique()

# Remove all rows for these patients
neg_group_final_filtered_final = neg_group_final_filtered[
    ~neg_group_final_filtered["empi_anon"].isin(patients_with_implants)
]
neg_group_final_filtered_final = neg_group_final_filtered_final[
    ~neg_group_final_filtered_final["ProtocolName"].str.contains(
        "SCC|SMLO|RMLOACIMF|RMLOAC|CCID|MLOID|MLOIMF|MCC|MLOAC|CCAC|MLOIDIMF|MLONP|CCNP|MLOAX|MLOAXIMF|TAN|CCRM|CCAX|CEDM|CESM|CCRL|MLOACNP|LMLOAC",
        case=False,
        na=False,
    )
    | neg_group_final_filtered_final["ProtocolName"].isna()
]
get_stats(neg_group_final_filtered_final)
display(neg_group_final_filtered_final)

DF shape: (135712, 44)
# Patients: 10796
# Cases: 31901

# Images: 135712



Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,anon_dicom_path,png_path,StudyDescription,ProtocolName,match_level,num_roi,ROI_coords,BreastImplantPresent,acc_anon_side
0,10015693,1334581155737139,2015-10-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,2281263876413228,2018-01-06,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,818,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/10015693/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R MLO ComboHD,[],0,(),NO,1334581155737139R
1,10015693,1334581155737139,2015-10-11,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,2281263876413228,2018-01-06,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,818,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/10015693/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R CC ComboHD,[],0,(),NO,1334581155737139R
4,10028836,7010941703189486,2014-09-03,MG Screening Bilateral,L,N,,,,,B,,NaT,,,,,,,,,7270377379697338,2015-12-30,MG Screening Bilateral,L,N,,,,,B,483,L,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/10028836/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral,L CC,[],0,(),NO,7010941703189486L
7,10028836,7010941703189486,2014-09-03,MG Screening Bilateral,L,N,,,,,B,,NaT,,,,,,,,,7270377379697338,2015-12-30,MG Screening Bilateral,L,N,,,,,B,483,L,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/10028836/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral,L MLO,[],0,(),NO,7010941703189486L
9,10028836,7010941703189486,2014-09-03,MG Screening Bilateral,R,N,,,,,B,,NaT,,,,,,,,,7270377379697338,2015-12-30,MG Screening Bilateral,R,N,,,,,B,483,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/10028836/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral,R MLO,[],0,(),NO,7010941703189486R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295917,99996622,5582628875236699,2014-07-24,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,B,,,,,B,,NaT,,,,,,,,,9655172659462321,2016-06-04,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,681,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/99996622/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R CC Combo,[],0,(),NO,5582628875236699R
295919,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,,NaT,,,,,,,,,8832872399780580,2019-02-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,673,L,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/99999564/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L CC Combo,[],0,(),NO,4369225803558884L
295921,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,,NaT,,,,,,,,,8832872399780580,2019-02-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,673,L,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/99999564/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L MLO Combo,[],0,(),NO,4369225803558884L
295922,99999564,4369225803558884,2017-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,8832872399780580,2019-02-27,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,673,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/99999564/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R CC Combo,[],0,(),NO,4369225803558884R


In [None]:
# keep only patients of the negative group with enough 5 years follow up
neg_group_final_filtered_final.study_date_anon = pd.to_datetime(
    neg_group_final_filtered_final.study_date_anon, errors="coerce"
)

follow_up_period = neg_group_final_filtered_final.groupby("empi_anon")[
    "study_date_anon"
].agg(["min", "max"])
follow_up_period["duration_years"] = (
    follow_up_period["max"].dt.year - follow_up_period["min"].dt.year
)
patients_with_5_years = follow_up_period[follow_up_period["duration_years"] >= 5]
valid_patients = patients_with_5_years.index

neg_group_final_5_years = neg_group_final_filtered_final[
    neg_group_final_filtered_final["empi_anon"].isin(valid_patients)
].reset_index(drop=True)
get_stats(neg_group_final_5_years)
display(neg_group_final_5_years)

DF shape: (56233, 44)
# Patients: 2409
# Cases: 13011

# Images: 56233



Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,anon_dicom_path,png_path,StudyDescription,ProtocolName,match_level,num_roi,ROI_coords,BreastImplantPresent,acc_anon_side
0,10093833,2030506250163251,2013-06-21,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,2500827897014911,2014-07-02,MG Screening Bilateral w/CAD,L,N,,,,,B,376,L,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/10093833/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,L CC,[],0,(),NO,2030506250163251L
1,10093833,2030506250163251,2013-06-21,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,2500827897014911,2014-07-02,MG Screening Bilateral w/CAD,L,N,,,,,B,376,L,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/10093833/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,L MLO,[],0,(),NO,2030506250163251L
2,10093833,2030506250163251,2013-06-21,MG Screening Bilateral w/CAD,R,N,,,,,B,,NaT,,,,,,,,,6488770689649000,2015-07-24,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,763,R,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/10093833/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,R MLO,[],0,(),NO,2030506250163251R
3,10093833,2030506250163251,2013-06-21,MG Screening Bilateral w/CAD,R,N,,,,,B,,NaT,,,,,,,,,6488770689649000,2015-07-24,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,763,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/10093833/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,R CC,[],0,(),NO,2030506250163251R
4,10093833,2500827897014911,2014-07-02,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,6488770689649000,2015-07-24,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,387,L,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_2/10093833/1...,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,L CC,[],0,(),NO,2500827897014911L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56228,99986224,8107409307566891,2018-05-22,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,8848125344172315,2019-08-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,460,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/99986224/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R CC ComboHD,[],0,(),NO,8107409307566891R
56229,99986224,9061973132112039,2013-04-10,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,1388973192449589,2015-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,745,L,MLO,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/99986224/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral w/CAD,L MLO,[],0,(),NO,9061973132112039L
56230,99986224,9061973132112039,2013-04-10,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,1388973192449589,2015-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,745,L,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/99986224/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral w/CAD,L CC,[],0,(),NO,9061973132112039L
56231,99986224,9061973132112039,2013-04-10,MG Screening Bilateral w/CAD,R,N,,,,,B,,NaT,,,,,,,,,1388973192449589,2015-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,745,R,CC,2D,/mnt/NAS2/mammo/anon_dicom/cohort_1/99986224/1...,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral w/CAD,R CC,[],0,(),NO,9061973132112039R


In [None]:
def anon_dicom_path_fix(DICOMPathStr):
    return DICOMPathStr.replace("/mnt/NAS2/mammo/anon_dicom", "/storage2/images")


result_df_neg_group_final_new_path = neg_group_final_5_years.copy()
result_df_neg_group_final_new_path["anon_dicom_path_local"] = (
    result_df_neg_group_final_new_path["anon_dicom_path"].apply(anon_dicom_path_fix)
)
result_df_neg_group_final_new_path = result_df_neg_group_final_new_path.drop(
    columns=["anon_dicom_path"]
)
result_df_neg_group_final_new_path = result_df_neg_group_final_new_path.rename(
    columns={"anon_dicom_path_local": "anon_dicom_path"}
)

display(result_df_neg_group_final_new_path)

Unnamed: 0,empi_anon,acc_anon,study_date_anon,desc,side,asses,path_severity,bside,procdate_anon,pdate_anon,exam_laterality,acc_anon_dx,study_date_anon_dx,desc_dx,side_dx,asses_dx,path_severity_dx,bside_dx,procdate_anon_dx,pdate_anon_dx,delta_date_dx,acc_anon_1yrfu,study_date_anon_1yrfu,desc_1yrfu,side_1yrfu,asses_1yrfu,path_severity_1yrfu,bside_1yrfu,procdate_anon_1yrfu,pdate_anon_1yrfu,exam_laterality_1yrfu,delta_date_1yrfu,ImageLateralityFinal,ViewPosition,FinalImageType,png_path,StudyDescription,ProtocolName,match_level,num_roi,ROI_coords,BreastImplantPresent,acc_anon_side,anon_dicom_path
0,10093833,2030506250163251,2013-06-21,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,2500827897014911,2014-07-02,MG Screening Bilateral w/CAD,L,N,,,,,B,376,L,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,L CC,[],0,(),NO,2030506250163251L,/storage2/images/cohort_2/10093833/1.2.846.113...
1,10093833,2030506250163251,2013-06-21,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,2500827897014911,2014-07-02,MG Screening Bilateral w/CAD,L,N,,,,,B,376,L,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,L MLO,[],0,(),NO,2030506250163251L,/storage2/images/cohort_2/10093833/1.2.846.113...
2,10093833,2030506250163251,2013-06-21,MG Screening Bilateral w/CAD,R,N,,,,,B,,NaT,,,,,,,,,6488770689649000,2015-07-24,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,763,R,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,R MLO,[],0,(),NO,2030506250163251R,/storage2/images/cohort_2/10093833/1.2.846.113...
3,10093833,2030506250163251,2013-06-21,MG Screening Bilateral w/CAD,R,N,,,,,B,,NaT,,,,,,,,,6488770689649000,2015-07-24,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,763,R,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,R CC,[],0,(),NO,2030506250163251R,/storage2/images/cohort_2/10093833/1.2.846.113...
4,10093833,2500827897014911,2014-07-02,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,6488770689649000,2015-07-24,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,387,L,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_2/extracted-im...,MG Screening Bilateral w/CAD,L CC,[],0,(),NO,2500827897014911L,/storage2/images/cohort_2/10093833/1.2.842.113...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56228,99986224,8107409307566891,2018-05-22,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,,NaT,,,,,,,,,8848125344172315,2019-08-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,460,R,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R CC ComboHD,[],0,(),NO,8107409307566891R,/storage2/images/cohort_1/99986224/1.2.843.113...
56229,99986224,9061973132112039,2013-04-10,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,1388973192449589,2015-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,745,L,MLO,2D,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral w/CAD,L MLO,[],0,(),NO,9061973132112039L,/storage2/images/cohort_1/99986224/1.2.843.113...
56230,99986224,9061973132112039,2013-04-10,MG Screening Bilateral w/CAD,L,N,,,,,B,,NaT,,,,,,,,,1388973192449589,2015-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,L,N,,,,,B,745,L,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral w/CAD,L CC,[],0,(),NO,9061973132112039L,/storage2/images/cohort_1/99986224/1.2.843.113...
56231,99986224,9061973132112039,2013-04-10,MG Screening Bilateral w/CAD,R,N,,,,,B,,NaT,,,,,,,,,1388973192449589,2015-04-25,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R,N,,,,,B,745,R,CC,2D,/mnt/PACS_NAS1/mammo/png/cohort_1/extracted-im...,MG Screening Bilateral w/CAD,R CC,[],0,(),NO,9061973132112039R,/storage2/images/cohort_1/99986224/1.2.843.113...


In [None]:
result_df_neg_group_final_new_path[columns_to_save].to_csv(
    "NEGATIVE_GROUP_FINAL.csv", index=False
)

In [9]:
import pandas as pd

neg_path = "/content/NEGATIVE_GROUP_FINAL.csv"
df_neg_group = pd.read_csv(neg_path)
get_stats(df_neg_group)

DF shape: (56233, 15)
# Patients: 2409
# Cases: 13011

# Images: 56233



In [11]:
# Step 1: Define a helper function to determine rows to keep
def filter_images(group):
    # Count protocol frequencies within the examination
    protocol_counts = group["ProtocolName"].value_counts()

    # Add protocol frequency column
    group["ProtocolFrequency"] = group["ProtocolName"].map(protocol_counts)

    # Sort by ViewPosition, ProtocolFrequency, and keep first occurrence if there's a tie
    group = group.sort_values(
        by=["ViewPosition", "ProtocolFrequency"], ascending=[True, False]
    )
    group = group.drop_duplicates(
        subset=["ImageLateralityFinal", "ViewPosition"], keep="last"
    )
    # Drop the helper column before returning
    group = group.drop(columns=["ProtocolFrequency"])
    return group


# Step 2: Apply the helper function group-wise
result_df_neg_group = df_neg_group.groupby(
    ["empi_anon", "acc_anon"], group_keys=False
).apply(filter_images)
# Step 3: Reset index if necessary
result_df_neg_group = result_df_neg_group.reset_index(drop=True)

  result_df_neg_group = df_neg_group.groupby(["empi_anon", "acc_anon"], group_keys=False).apply(filter_images)


In [12]:
get_stats(result_df_neg_group)
display(result_df_neg_group)

DF shape: (49355, 15)
# Patients: 2409
# Cases: 13011

# Images: 49355



Unnamed: 0,empi_anon,acc_anon,anon_dicom_path,desc,ProtocolName,asses,path_severity,study_date_anon,side,ImageLateralityFinal,bside,ViewPosition,match_level,num_roi,ROI_coords
0,10093833,2030506250163251,/storage2/images/cohort_2/10093833/1.2.846.113...,MG Screening Bilateral w/CAD,L CC,N,,2013-06-21,L,L,,CC,[],0,()
1,10093833,2030506250163251,/storage2/images/cohort_2/10093833/1.2.846.113...,MG Screening Bilateral w/CAD,R CC,N,,2013-06-21,R,R,,CC,[],0,()
2,10093833,2030506250163251,/storage2/images/cohort_2/10093833/1.2.846.113...,MG Screening Bilateral w/CAD,L MLO,N,,2013-06-21,L,L,,MLO,[],0,()
3,10093833,2030506250163251,/storage2/images/cohort_2/10093833/1.2.846.113...,MG Screening Bilateral w/CAD,R MLO,N,,2013-06-21,R,R,,MLO,[],0,()
4,10093833,2500827897014911,/storage2/images/cohort_2/10093833/1.2.842.113...,MG Screening Bilateral w/CAD,L CC,N,,2014-07-02,L,L,,CC,[],0,()
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49350,99986224,8107409307566891,/storage2/images/cohort_1/99986224/1.2.843.113...,MG Screen Bilat w/Tomo/CAD Stnd Protocol,R MLO ComboHD,N,,2018-05-22,R,R,,MLO,[],0,()
49351,99986224,9061973132112039,/storage2/images/cohort_1/99986224/1.2.843.113...,MG Screening Bilateral w/CAD,L CC,N,,2013-04-10,L,L,,CC,[],0,()
49352,99986224,9061973132112039,/storage2/images/cohort_1/99986224/1.2.843.113...,MG Screening Bilateral w/CAD,R CC,N,,2013-04-10,R,R,,CC,[],0,()
49353,99986224,9061973132112039,/storage2/images/cohort_1/99986224/1.2.843.113...,MG Screening Bilateral w/CAD,L MLO,N,,2013-04-10,L,L,,MLO,[],0,()


In [13]:
result_df_neg_group.to_csv("NEGATIVE_GROUP_FINAL_2.csv", index=False)

# END