In [1]:
# Importing libraries
import numpy as np 
import pandas as pd 
import shutil
import os
import cv2
import glob
from PIL import Image
!pip install "/kaggle/input/dicomsdl-offline-installer/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl"
import dicomsdl
import pydicom as dicom
import matplotlib.pyplot as plt
import random
import csv

import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator


Processing /kaggle/input/dicomsdl-offline-installer/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: dicomsdl
Successfully installed dicomsdl-0.109.1
[0m

## Initial exploration

First, I am going to import the datasets and perform an initial exploration of them. The competition requires classifying images in a binary fashion, using the F1 score as the evaluation criterion. Therefore, it is very important to have a balanced model that is not biased towards either class.

In [2]:
# Load the competition metadata tables
train_df = pd.read_csv(r"../input/rsna-breast-cancer-detection/train.csv")
test_df = pd.read_csv(r"../input/rsna-breast-cancer-detection/test.csv")

# Peek at the first training rows
print(train_df.head())
#print(test_df.head())

# How many distinct patients does each table contain?
for split_label, frame in (("training", train_df), ("testing", test_df)):
    print("The number of unique patients on " + split_label + " set: " + str(len(frame.patient_id.unique())))

# Overall class balance of the training labels
print("The distribution of training classes")
print(train_df["cancer"].value_counts())

# Per-class row counts, broken down by several metadata columns
print("The distribution of training classes by machine ID")
cancer_df = train_df[train_df['cancer'] == 1]
nocancer_df = train_df[train_df['cancer'] == 0]

for column in ("machine_id", "biopsy", "invasive", "implant"):
    # same two-part report for every column: cancer rows first, then non-cancer rows
    for title, subset in (("cancer cases", cancer_df), ("no cancer cases", nocancer_df)):
        print(title)
        print(subset.groupby([column])[column].count())


   site_id  patient_id    image_id laterality view   age  cancer  biopsy  \
0        2       10006   462822612          L   CC  61.0       0       0   
1        2       10006  1459541791          L  MLO  61.0       0       0   
2        2       10006  1864590858          R  MLO  61.0       0       0   
3        2       10006  1874946579          R   CC  61.0       0       0   
4        2       10011   220375232          L   CC  55.0       0       0   

   invasive  BIRADS  implant density  machine_id  difficult_negative_case  
0         0     NaN        0     NaN          29                    False  
1         0     NaN        0     NaN          29                    False  
2         0     NaN        0     NaN          29                    False  
3         0     NaN        0     NaN          29                    False  
4         0     0.0        0     NaN          21                     True  
The number of unique patients on training set: 11913
The number of unique patients on t

The vast majority of images do not contain cancer, as I was expecting. This is a severely imbalanced dataset, which can cause the model to gravitate towards the non cancer class. For this, I am going to use an undersampling strategy at the beginning, just for creating the early version of the model.

Importantly, images with and without cancer have been taken by the same machines on a similar proportion, which should not create any spurious correlation based on machine id. 

I am a bit concerned about leaky validation, since the same patient can have several images. Some of these images may end up in both training and validation, which would leak information between the two sets. I will try to keep only 1 image per patient for the non_cancer class. In addition, I will name the images following the pattern patient_id+img_name+file_extension, so I can split them just by ordering them by name.

Importantly, 100% of cancer images do have biopsy, whereas a small percentage of non_cancer images have biopsy. We need to make sure that we have enough biopsy non cancer images in the training set. I will go for 50% biopsy, 50% non_biopsy.

Similarly, we need to include around 10-20 implant images in the non cancer bucket, so we make sure implant images are included. 


In [None]:
# Output folders, one per class ("1" = cancer, "0" = no cancer), and the DICOM source folder
out_cancer = r"../working/1"
out_nocancer = r"../working/0"
input_path = r"../input/rsna-breast-cancer-detection/train_images"
os.makedirs(out_nocancer, exist_ok=True)
os.makedirs(out_cancer, exist_ok=True)


def _copy_dicom(patient_id, image_id, out_dir):
    """Copy one training DICOM into out_dir as '<patient_id>_<image_id>.dcm'."""
    img_name = str(image_id) + ".dcm"
    shutil.copy(os.path.join(input_path, patient_id, img_name),
                os.path.join(out_dir, patient_id + "_" + img_name))


# Undersampling target: at most as many non-cancer images as there are cancer images,
# derived from the data instead of a hard-coded count.
max_no_cancer = int((train_df["cancer"] == 1).sum())
# Each of the two non-cancer groups (biopsy / no biopsy) may fill roughly half of that.
max_per_biopsy_group = max_no_cancer // 2 + 1

# Patients that already had a non-cancer row considered (a set gives O(1) lookups)
patient_used = set()
number_no_cancer = 0          # non-cancer images actually copied
number_no_cancer_bio = 0      # non-cancer biopsy rows considered
number_no_cancer_no_bio = 0   # non-cancer non-biopsy rows considered

# Shuffle the rows; the fixed seed makes the selected subset reproducible across runs
train_df_sh = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

for index, row in train_df_sh.iterrows():
    patient_id = str(row["patient_id"])

    # Every cancer image is kept
    if row["cancer"] == 1:
        _copy_dicom(patient_id, row["image_id"], out_cancer)
        continue

    # Non-cancer: only the first row seen for each patient is considered
    if patient_id in patient_used:
        continue
    patient_used.add(patient_id)

    # Quota reached: keep iterating (cancer rows still need copying) but skip this row
    if number_no_cancer >= max_no_cancer:
        continue

    # Compare numerically so the check does not depend on how the value is stringified
    if int(row["biopsy"]) == 1:
        number_no_cancer_bio += 1
        if number_no_cancer_bio > max_per_biopsy_group:
            continue
    else:
        number_no_cancer_no_bio += 1
        if number_no_cancer_no_bio > max_per_biopsy_group:
            continue

    _copy_dicom(patient_id, row["image_id"], out_nocancer)
    number_no_cancer += 1


Let's take a look on how many images we have collected for each class following the previous criteria

In [None]:
# Report how many files landed in each class folder
for class_label, class_dir in (("no cancer", r"../working/0"), ("cancer", r"../working/1")):
    print("Number of " + class_label + " images: " + str(len(os.listdir(class_dir))))

The dataset is somewhat imbalanced towards cancer images. However, this is something we can fix using class weights on the loss function. For this early version of the model, I will proceed with this dataset.

## Training and validation set

For creating these, I will follow the typical Keras folder schema:

- training
    - class A
        - img1
        - img2
        - ...
    - class B
        - img 1
        - img 2
        - ...
- val
    - class A
        - img 1
        - img 2
        - ...
    - class B
        - img 1
        - img 2
        - ...

As there are not too many images (2.1k in total), I am going to split the sets 80-20 for training and validation.

Given the small amount of data I have for training and validation, I should be using a k-fold validation strategy. However, as I want to train the first version of the model, I will proceed with a hold-out validation strategy.

In [None]:
print("Creating the training and validation directories")
# Split into training and validation sets at PATIENT level: every image of a
# given patient (from either class folder) goes to the same split, so the same
# patient never appears in both train and val (which would be a leaky validation).

# Per-class source folders; the keys are also the class sub-folder names
_class_dirs = {"0": r"../working/0", "1": r"../working/1"}

# Keras-style layout: ../working/<split>/<class>; exist_ok makes the cell re-runnable
for _split in ("training", "val"):
    for _label in _class_dirs:
        os.makedirs(os.path.join("../working", _split, _label), exist_ok=True)

# defining number of images variables
number_cancer = len(os.listdir(r"../working/1"))
number_no_cancer = len(os.listdir(r"../working/0"))


def _patient_of(file_name):
    """Return the patient id prefix of a '<patient_id>_<image_id>.dcm' file name."""
    return file_name.split("_", 1)[0]


# os.listdir returns entries in arbitrary order, so sort first and then shuffle
# with a fixed seed to get a reproducible 80/20 assignment of patients.
_patients = sorted({_patient_of(f) for d in _class_dirs.values() for f in os.listdir(d)})
random.Random(42).shuffle(_patients)
_train_patients = set(_patients[:int(len(_patients) * 0.8)])

# Move every image to the split its patient was assigned to
for _label, _src in _class_dirs.items():
    for item in sorted(os.listdir(_src)):
        _split = "training" if _patient_of(item) in _train_patients else "val"
        shutil.move(os.path.join(_src, item), os.path.join("../working", _split, _label, item))

print("The training and val directories have been created")

In [None]:
print("Removing original images directories")
# Delete the per-class staging folders used before the train/val split
for legacy_dir in (r"../working/1", r"../working/0"):
    shutil.rmtree(legacy_dir)

### Pre-processing images

The images are in .dcm format, hence I will be transforming them into .png, which is a more commonly used format for images. Alternatively, I could transform them into .jpg, but this format applies a lossy compression that I prefer to avoid.

First, I am going to do a visual analysis of those images.

In [None]:
def printing_random_images(input_dir,n_images=4):
    """
    Plot up to `n_images` distinct, randomly chosen DICOM files from `input_dir` in one row.

    Parameters
    ----------
    input_dir : str
        Folder whose entries are read with dicomsdl.
    n_images : int
        Number of subplot columns; fewer images are drawn when the folder
        holds fewer files.

    Returns
    -------
    None. A message is printed for an empty folder or for a file whose pixel
    data cannot be displayed.
    """
    candidates = sorted(os.listdir(input_dir))
    if not candidates or n_images <= 0:
        print("No images to display in: " + str(input_dir))
        return
    # sample WITHOUT replacement so the same image is not shown twice
    images_selected = random.sample(candidates, k=min(n_images, len(candidates)))
    plt.figure(figsize = (22,5))
    for i, file_ in enumerate(images_selected):
        plt.subplot(1, n_images, i+1)
        plt.axis('off')
        dataset = dicomsdl.open(os.path.join(input_dir,file_))
        try:
            plt.imshow(dataset.pixelData(storedvalue=True), cmap=plt.cm.bone)
        except Exception as error:
            # keep plotting the remaining images, but say which one failed and why
            print("Could not display " + str(file_) + ": " + str(error))

# Preview a few random training images from each class folder
train_dir_nocancer = r"../working/training/0"
train_dir_cancer = r"../working/training/1"

for class_dir in (train_dir_nocancer, train_dir_cancer):
    printing_random_images(input_dir=class_dir)



Both groups (cancer/no cancer) seem to contain images with white and black backgrounds. The orientation of both groups of images is similar. There is some text in the images (R-MO L-MO) which might be beneficial to remove.

### Turning images into png format

DICOM images are highly informative - these images contain information about the patient such as name, gender, doctor's name, etc. I will be turning these 2.1k images into .png format, which is a very typical image format. In addition, I will be saving such additional information into a .csv file

First, I will define a function for converting the image

In [None]:
def converting_img_2(img_path,png_ext=True):
    """
    Convert one DICOM file to PNG/JPG, writing the result next to the source file.

    Parameters
    ----------
    img_path : str
        Path of the DICOM file to read.
    png_ext : bool
        True saves '<name>.png', False saves '<name>.jpg'.

    Returns
    -------
    tuple
        (True, img_path) on success, (False, img_path) if anything failed;
        the error is printed rather than raised.
    """
    try:
        dir_path = os.path.dirname(img_path)
        # swap only the real file extension (str.replace would also rewrite a
        # '.dcm' appearing in the middle of the name)
        stem = os.path.splitext(os.path.basename(img_path))[0]
        image_name = stem + ('.png' if png_ext else '.jpg')
        # DICOM -> PIL image -> file on disk
        img_pil = dicomsdl.open(img_path).toPilImage()
        img_pil.save(os.path.join(dir_path,image_name))
        return(True,img_path)
    except Exception as e:
        print(e)
        return(False,img_path)

def converting_img_3(img_path,out_path,png_ext=True):
    """
    Convert one DICOM file to PNG/JPG, writing the result into `out_path`.

    Parameters
    ----------
    img_path : str
        Path of the DICOM file to read.
    out_path : str
        Folder that receives the converted image.
    png_ext : bool
        True saves '<name>.png', False saves '<name>.jpg'.

    Returns
    -------
    tuple
        (True, img_path) on success, (False, img_path) if anything failed;
        the error is printed rather than raised.
    """
    try:
        # swap only the real file extension (str.replace would also rewrite a
        # '.dcm' appearing in the middle of the name)
        stem = os.path.splitext(os.path.basename(img_path))[0]
        image_name = stem + ('.png' if png_ext else '.jpg')
        # DICOM -> PIL image -> file on disk
        img_pil = dicomsdl.open(img_path).toPilImage()
        img_pil.save(os.path.join(out_path,image_name))
        return(True,img_path)
    except Exception as e:
        print(e)
        return(False,img_path)

In [None]:
# Sample DICOM path kept around for ad-hoc conversion checks
img_path = r"../working/test/2514_464708482.dcm"

def dcm_to_png(img_path,out_path,file_ext=".png"):
    """
    Read the DICOM at `img_path` and save it in `out_path` under the fixed
    name 'test_transformed' followed by `file_ext`.
    """
    # decode the DICOM straight into a PIL image
    pil_image = dicomsdl.open(img_path).toPilImage()
    # the output name does not depend on the input name
    destination = os.path.join(out_path, "test_transformed" + file_ext)
    pil_image.save(destination)


In [None]:
print("Converting all images into .png")
# Convert every .dcm in the four split/class folders to .png; the same steps
# apply to each folder, so they are run in one loop instead of four copies.
for split_dir in (r"../working/training/0",
                  r"../working/training/1",
                  r"../working/val/0",
                  r"../working/val/1"):
    # DICOMs converted successfully in this folder; deleted once the folder is done
    remove_dcm = []
    for item in os.listdir(split_dir):
        if item.endswith(".dcm"):
            response = converting_img_2(img_path=os.path.join(split_dir,item),png_ext=True)
            if not response[0]:
                # a failed file keeps its .dcm so it can be inspected later
                print("Error while processing image: "+str(response[1]))
            else:
                remove_dcm.append(os.path.join(split_dir,item))
    for item in remove_dcm:
        os.remove(item)

print("All images have been converted! ")

## Training and validating

All the data is ready for training - it has been preprocessed and grouped in training and validation. This task is a binary classification task. Importantly, there are only ca. 2k images for training, which is a relatively small number of images. For this initial trial, I will use a pretrained model and fine-tune it on our task. 

### Further data-preprocessing and augmentation

In order to make the most of our few training examples, we will "augment" them via a number of random transformations, so that our model would never see twice the exact same picture. This helps prevent overfitting and helps the model generalize better.

Adding the augmentation and pre-processing:

In [None]:
print("Defining the image data generator")
# Folders holding the prepared PNGs (one sub-folder per class)
train_dir = r"../working/training"
validation_dir = r"../working/val"
# Mini-batch size and the size every image is resized to
batch_size = 16
target_size=(224, 224)

# Both generators are built without arguments: nothing is configured here,
# the generators only load and resize the files.
train_datagen = ImageDataGenerator()
test_datagen = ImageDataGenerator()

# Options shared by the training and the validation iterator
flow_options = dict(target_size=target_size, batch_size=batch_size, class_mode='binary')
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)
validation_generator = test_datagen.flow_from_directory(validation_dir, **flow_options)

# Show the folder-name -> label-index mapping of each iterator
for generator in (train_generator, validation_generator):
    print(generator.class_indices)

As seen above, there are only 400 images for validation. Hence, this strategy can have a high variance - depending on the validation split selected, the performance of the model can differ. This is something to improve in the next training.

## Defining the model

I will be using Resnet50 for this problem. As we do not have much data, I will fine tune the last few layers from a pre-trained Resnet 50. Let's see how much performance we can obtain using this simple approach.

In [None]:
print("Defining the convolutional base of the ML")
# Convolutional base: ResNet50 without its classification head, initialised
# from a local weights file and fixed to 224x224 RGB inputs.
conv_base = tf.keras.applications.resnet50.ResNet50(
    weights=r'../input/tf-keras-pretrained-model-weights/No Top/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
    include_top=False,
    input_shape=(224, 224, 3))

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
conv_base.summary()

In [None]:
print("Defining the ML model architecture")
# MODEL 1
# Functional model: uint8 image -> random flip / contrast -> float32 ->
# ResNet50 preprocessing -> conv_base -> global average pooling -> 1 sigmoid unit.

# 224x224x3 input declared as uint8
input_layer = tf.keras.layers.Input([224, 224, 3], dtype = tf.uint8)
# augmentation layers are part of the model graph itself
x = tf.keras.layers.RandomFlip(mode='horizontal')(input_layer)
x = tf.keras.layers.RandomContrast(factor=[0.95,1.05])(x)
# NOTE(review): the augmentation layers above run before this cast, i.e. on the
# uint8 tensor - confirm they behave as intended on integer input.
x = tf.cast(x, tf.float32)
x = tf.keras.applications.resnet50.preprocess_input(x)
x = conv_base(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
# single sigmoid output for the binary cancer / no-cancer label
x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=[input_layer], outputs=[x])

# Earlier Sequential draft of MODEL 1, kept commented out:
#model = tf.keras.Sequential()
#model.add(tf.keras.layers.RandomFlip(mode='horizontal'))
#model.add(tf.keras.layers.RandomContrast(factor=[0.95,1.05]))
#model.add(tf.keras.layers.Rescaling(1./1))
#model.add(tf.keras.applications.resnet50.preprocess_input())
#model.add(conv_base)
#model.add(tf.keras.layers.GlobalAveragePooling2D())
#model.add(tf.keras.layers.Dense(1, activation='sigmoid',activity_regularizer=tf.keras.regularizers.l1(0.01)))

# MODEL 2
# Sequential alternative: conv_base -> Flatten -> Dropout -> Dense(64, relu, L2)
# -> Dropout -> 1 sigmoid unit. It reuses the SAME conv_base object as MODEL 1.
# NOTE(review): model2 is not compiled or trained in the cells visible here.

model2 = tf.keras.Sequential()
model2.add(conv_base)
model2.add(tf.keras.layers.Flatten())
model2.add(tf.keras.layers.Dropout(0.5))
model2.add(tf.keras.layers.Dense(64, activation='relu',kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model2.add(tf.keras.layers.Dropout(0.5))
model2.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# NOTE(review): summary() prints the table itself; print() adds a trailing "None"
print(model.summary())

print(model2.summary())

In [None]:
# Lock every layer of the pretrained backbone
for backbone_layer in conv_base.layers:
    backbone_layer.trainable = False

# List index, name and trainable flag of each backbone layer
for position, backbone_layer in enumerate(conv_base.layers):
    print(position, backbone_layer.name, backbone_layer.trainable)


In [None]:
print("Defining callbacks")
# Stop when validation accuracy has not improved for 10 epochs
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', mode="max", patience=10)

# Start this stage from an empty checkpoint folder
head_ckpt_dir = r"../working/training_model/model/conv_base"
if os.path.isdir(head_ckpt_dir):
    shutil.rmtree(head_ckpt_dir)
os.makedirs(head_ckpt_dir)

# Save weights only, and only when val_accuracy reaches a new maximum
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=head_ckpt_dir,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True)

# Callbacks handed to fit()
my_callbacks = [early_stopping_callback,model_checkpoint_callback]

# Compile `model` (the functional MODEL 1) with SGD + momentum
stage1_optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
model.compile(optimizer=stage1_optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy',tf.keras.metrics.BinaryAccuracy()])

print("model compiled")
print(model.summary())

### Note
It’s necessary to freeze the convolutional base in order to be able to train a randomly initialized classifier on top using a high learning rate. Otherwise, the error signal propagating through the network during training will be too large, and the representations previously learned by the layers being fine-tuned will be destroyed.

#### Strategy
1) Train just the classifier with predefined lr - conv_base frozen 

2) Unfreeze the last conv block of the model, recompile and train everything with a LOW learning rate (lr=1e-4)

In [None]:
print("Training!")
# Stage 1: fit for at most 100 epochs with the callbacks defined above.
# Only the new top layers are meant to train here - this relies on the
# backbone having been frozen before the model was compiled.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=100,
    callbacks=my_callbacks,
)

In [None]:
# Progress marker printed once the first training stage has returned
print("Done! I have got to the end of the training the convolutional base!")

### Fine Tuning the model

Now, I am going to fine tune a few more layers from the Resnet50 so I can get some more performance

In [None]:
# Restore the weights checkpointed at this path during the previous stage
model.load_weights(r"../working/training_model/model/conv_base")

# Backbone layers 0-164 stay frozen; layers from index 165 onwards become trainable
for position, backbone_layer in enumerate(conv_base.layers):
    backbone_layer.trainable = position >= 165

print('Last block of the conv_base is now trainable')

In [None]:
# List index, name and trainable flag of each backbone layer after unfreezing
for position, backbone_layer in enumerate(conv_base.layers):
    print(position, backbone_layer.name, backbone_layer.trainable)

In [None]:
print("Defining callbacks")
# Stop when validation accuracy has not improved for 15 epochs
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', mode="max", patience=15)

# Start the fine-tuning stage from an empty checkpoint folder
finetune_ckpt_dir = r"../working/training_model/model/fine_tuned"
if os.path.isdir(finetune_ckpt_dir):
    shutil.rmtree(finetune_ckpt_dir)
os.makedirs(finetune_ckpt_dir)

# Save weights only, and only when val_accuracy reaches a new maximum
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=finetune_ckpt_dir,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True)

# Callbacks handed to fit()
my_callbacks = [early_stopping_callback,model_checkpoint_callback]

# Recompile `model` with a 10x smaller learning rate than the first stage
# so the changed trainable flags take effect
stage2_optimizer = tf.keras.optimizers.SGD(learning_rate=0.0001, momentum=0.9)
model.compile(optimizer=stage2_optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy',tf.keras.metrics.BinaryAccuracy()])

print("model compiled")
print(model.summary())

In [None]:
print("Training!")
# Stage 2 (fine-tuning): fit for at most 200 epochs with the callbacks
# defined in the previous cell.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=200,
    callbacks=my_callbacks,
)

### Final Validation

I will reconstruct the model and deploy it on the validation set, so I can obtain the metrics of the best version

In [None]:
# Loading best weights
# (the path is the one the fine-tuning ModelCheckpoint callback writes to)
model.load_weights(r"../working/training_model/model/fine_tuned")

# Validating
# evaluate() runs the model over the validation generator; with the metrics
# passed to compile(), `results` is presumably [loss, accuracy, binary_accuracy]
# - confirm the order against model.metrics_names.
results = model.evaluate(validation_generator)
print(results)

In [None]:
# Persist the full model to a single HDF5 file in the working directory
saved_model_file = 'RSNA_screening_Resnet50.h5'
model.save(saved_model_file)
print(saved_model_file + " was saved")

## Submission

Finally, I will prepare the .csv file for submission.

In [None]:
print("Preparing for submission")
# loading the model
new_model = tf.keras.models.load_model(r'RSNA_screening_Resnet50.h5')

# Folder that receives the PNG version of every test DICOM
out_temp = os.path.join(r"../working/","output")
os.makedirs(out_temp,exist_ok=True)

# Walk the test folder tree and convert each .dcm; failures are reported
# individually, successes are only counted to keep the output short.
n_converted = 0
for root, dirs, files in os.walk(r"../input/rsna-breast-cancer-detection/test_images", topdown=False):
    for name in files:
        if name.endswith(".dcm"):
            response = converting_img_3(img_path=os.path.join(root,name),out_path=out_temp,png_ext=True)
            if not response[0]:
                print("Error while processing image: "+str(response[1]))
            else:
                n_converted += 1
print("Converted " + str(n_converted) + " test images into " + out_temp)

# defining the batch size
batch_size = 1
# defining the image target size
target_size=(224, 224)

# no augmentation/preprocessing is configured on the generator
test_datagen = ImageDataGenerator()

# flow_from_directory only finds images inside sub-folders of `directory`, so
# point it at the parent folder and name the PNG folder as the single "class".
# class_mode=None yields image batches without labels (the test set has none)
# and shuffle=False keeps the file order stable.
test_generator = test_datagen.flow_from_directory(
    os.path.dirname(out_temp),
    classes=[os.path.basename(out_temp)],
    target_size=target_size,
    batch_size=batch_size,
    class_mode=None,
    shuffle=False)

print(test_generator.class_indices)

In [3]:
# Placeholder scores: rows whose image could not be scored keep a random value
test_df['cancer']=np.random.rand(test_df.shape[0])

# image ids present in the test table, as strings, for O(1) lookups
known_image_ids = set(test_df["image_id"].astype(str))

# Score every converted image once and remember the result by image id
scores_by_image_id = {}
for item in sorted(os.listdir(out_temp)):
    # file names are '<image_id>.<ext>'
    image_id = os.path.splitext(item)[0]
    if image_id not in known_image_ids:
        continue
    # defining image path
    img_path = os.path.join(out_temp,item)
    # load + resize, then add the batch axis expected by predict()
    img = tf.keras.preprocessing.image.load_img(img_path, target_size=target_size)
    img_array = tf.keras.preprocessing.image.img_to_array(img)
    img_array = tf.expand_dims(img_array, 0)
    # single sigmoid output -> one probability per image
    predictions = new_model.predict(img_array, verbose=0)
    scores_by_image_id[image_id] = float(np.ravel(predictions)[0])

# Write the scores back into the dataframe. Assigning to the `row` yielded by
# iterrows() only changes a copy, so the column is updated in one vectorised step.
predicted = test_df["image_id"].astype(str).map(scores_by_image_id)
test_df["cancer"] = predicted.fillna(test_df["cancer"]).astype(float)


NameError: name 'out_temp' is not defined

In [None]:
# preparing submission dataframe
# One row per prediction_id: average the scores of all images sharing the id
# instead of keeping only the first image's score and discarding the others.
# sort=False keeps the ids in order of first appearance.
submission = (
    test_df.groupby('prediction_id', sort=False, as_index=False)['cancer']
    .mean()[['prediction_id', 'cancer']]
)


In [None]:
# submitting
# writes the submission table to the current working directory, without the index column
submission.to_csv('submission.csv', index=False)

   site_id  patient_id    image_id laterality view  age  implant  machine_id  \
0        2       10008   736471439          L  MLO   81        0          21   
1        2       10008  1591370361          L   CC   81        0          21   
2        2       10008    68070693          R  MLO   81        0          21   
3        2       10008   361203119          R   CC   81        0          21   

  prediction_id    cancer  
0       10008_L  0.658385  
1       10008_L  0.553174  
2       10008_R  0.826566  
3       10008_R  0.766603  
