In [1]:
#Use a trained model to predict Breast Cancer
#Make sure that the parent folder of Initial Model is within the same directory as the Data Folder with all the images
"""
Example:
+ Data
+ DashApp
  |--- eda
        |--- BREAST_CANCER
"""
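# Comment out the GPU-disabling line to run on the GPU.
# NOTE(review): no such line follows in this cell (only `import os` remains) - confirm whether it was removed.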
import os

## Import and Prepare the Data

In [2]:
import os
# Global configuration shared by the cells below
# Root folder of the image data, expressed relative to the working directory
DATA_PATH = os.path.join("..", "..", "..", "Data")
# Seed used to make random operations repeatable
RANDOM_SEED = 42
# Fraction of the samples held out for testing (original note: chosen for initial tests)
TEST_FRACTION = 0.2
# Fraction of the non-test samples used to build the validation set
# (the rest of the non-test samples presumably form the training set - TODO confirm)
VALIDATION_FRACTION = 0.2
# Input patches are 50x50
IMAGE_WIDTH = IMAGE_LENGTH = 50

In [5]:
import numpy as np
# Seed NumPy's global random generator with the notebook-wide seed so that
# NumPy-based random operations give the same results on every run
np.random.seed(RANDOM_SEED)
import pandas as pd
from sklearn.model_selection import train_test_split
import glob

In [7]:
# Collect the path of every patch image found (recursively) under DATA_PATH;
# only files whose name contains "class" and ends in ".png" are matched.
# glob.glob already returns a list, so no manual copy loop is needed.
# Sorting makes the order independent of the file system's listing order,
# which keeps everything derived from this list reproducible.
patches = sorted(glob.glob(os.path.join(DATA_PATH, '**/*class*.png'), recursive=True))

In [63]:
def createDataFrame(paths=None):
    """Build the patch table: one row per image patch, parsed from its path.

    Each path is expected to look like
    ``<root>/<patient id>/<class>/<patient id>_idx<N>_x<X>_y<Y>_class<C>.png``.

    Parameters
    ----------
    paths : iterable of str, optional
        Patch file paths. When omitted, the notebook-level ``patches`` list
        is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``path``, ``id`` (name of the patient folder), ``x`` and
        ``y`` (patch coordinates, kept as text) and ``class`` (int, 0 or 1).
    """
    if paths is None:
        paths = patches
    # Keep info within a Pandas dataframe
    data = pd.DataFrame({"path": list(paths)})
    path_parts = data["path"].str.split(os.sep)
    # Patient id = folder two levels above the file. Counting from the end
    # keeps this correct however deep the data root is, and the value is read
    # from `data` itself rather than from a notebook-level frame.
    data['id'] = path_parts.str[-3]
    # File-name fields after "idx": ["<N>", "x<X>", "y<Y>", "class<C>.png"]
    series_info = path_parts.str[-1].str.split("idx").str[1].str.split("_")
    data['x'] = series_info.str[1].str[1:]  # drop the leading "x"
    data['y'] = series_info.str[2].str[1:]  # drop the leading "y"
    # Map the file-name suffix to a numeric label
    data['class'] = series_info.str[3].map({"class0.png": 0, "class1.png": 1}).astype(int)
    return data
# Build the patch table (one row per patch image) and preview its first rows
df = createDataFrame()
df.head()

Unnamed: 0,path,id,x,y,class
0,..\..\..\Data\10253\0\10253_idx5_x1001_y1001_c...,10253,1001,1001,0
1,..\..\..\Data\10253\0\10253_idx5_x1001_y1051_c...,10253,1001,1051,0
2,..\..\..\Data\10253\0\10253_idx5_x1001_y1101_c...,10253,1001,1101,0
3,..\..\..\Data\10253\0\10253_idx5_x1001_y1151_c...,10253,1001,1151,0
4,..\..\..\Data\10253\0\10253_idx5_x1001_y1201_c...,10253,1001,1201,0


In [74]:
# Per-patient summary: one row per patient id.
# 'class' is 1 for a cancer patch and 0 otherwise, so its sum is the number of
# cancer patches and its count is the total number of patches.
df_patients = (
    df.groupby('id')['class']
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'cancer_patches', 'count': 'total_patches'})
    .reset_index()
)
# Share of a patient's patches that contain cancer
df_patients['cancer_proportion'] = df_patients['cancer_patches'].div(df_patients['total_patches'])
# Display only (the sorted frame is not stored): highest proportion first
df_patients.sort_values(by='cancer_proportion', ascending=False)

Unnamed: 0,id,cancer_patches,total_patches,cancer_proportion
154,14209,309,342,0.903509
259,9262,80,94,0.851064
65,12873,232,281,0.825623
230,9077,1263,1623,0.778189
40,12241,115,152,0.756579
...,...,...,...,...
45,12750,21,1434,0.014644
180,16085,24,1937,0.012390
13,10272,25,2175,0.011494
11,10268,23,2109,0.010906


In [129]:
# Export the (small) per-patient summary to patients.csv in the working
# directory; index=False leaves the row index out of the file
df_patients.to_csv('patients.csv', index=False)

In [128]:
# Given a patient, obtain their images stored in AWS S3
from boto3 import client

def is_image_of_patient(patient_id, key):
    """Tell whether an S3 object key belongs to the given patient.

    The patient is identified by the folder that directly contains the
    object, i.e. the second-to-last "/"-separated component of ``key``.

    Parameters
    ----------
    patient_id : int or str
        Patient identifier. It is compared as text, so integer ids match
        the folder name as well.
    key : str
        Object key of the form ``".../<patient id>/<file name>"``.

    Returns
    -------
    bool
        True when the containing folder equals ``patient_id``; False
        otherwise, including for keys that have no containing folder.
    """
    parts = key.split("/")
    if len(parts) < 2:
        # A bare file name has no folder component to compare against
        return False
    return parts[-2] == str(patient_id)

def obtain_patient_images(patient_id, tile_size=500, bucket_name='breast-cancer-data'):
    """List the S3 keys of the tiles stored for one patient.

    Parameters
    ----------
    patient_id : int or str
        Patient identifier; it is interpolated into the key prefix.
    tile_size : int, default 500
        Tile edge length; selects the ``Tiles-<size>/Tiles(<size>, <size>)``
        folder.
    bucket_name : str, default 'breast-cancer-data'
        Name of the S3 bucket to query.

    Returns
    -------
    list of str
        Keys of all objects under the patient's prefix; empty when none
        exist.
    """
    # Imported here because the notebook only runs `from boto3 import client`,
    # which never binds the name `boto3` used below.
    import boto3

    bucket = boto3.resource('s3').Bucket(bucket_name)
    prefix = f"Tiles/Tiles-{tile_size}/Tiles({tile_size}, {tile_size})/{patient_id}/"
    return [object_summary.key for object_summary in bucket.objects.filter(Prefix=prefix)]
                
# Example call: list the keys of the 500x500 tiles stored for patient id 1
obtain_patient_images(1, 500)

[]

In [135]:
def obtain_patients_by_cancer_proportion(df, cancer_proportion, K = 5):
    """Return the ids of the K patients whose cancer proportion is nearest a target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``id`` column whose values convert to int and a numeric
        ``cancer_proportion`` column. Any other columns are ignored.
    cancer_proportion : float
        Target proportion to search around.
    K : int, default 5
        Maximum number of ids to return.

    Returns
    -------
    list of int
        Patient ids ordered from nearest to farthest from the target;
        equally distant patients keep their row order in ``df``.
    """
    distance = (df['cancer_proportion'] - cancer_proportion).abs()
    # A stable sort gives a deterministic order among equally distant patients
    nearest_positions = distance.argsort(kind="stable").to_numpy()[:K]
    # Convert only the id column: casting the whole frame to int would truncate
    # the proportions and fail on any non-numeric column
    return df['id'].iloc[nearest_positions].astype(int).tolist()
# NOTE(review): the target passed here is 5, while cancer_proportion is a ratio
# of patch counts (at most 1), so this call returns the patients with the
# highest proportions - confirm whether a value such as 0.5 was intended
obtain_patients_by_cancer_proportion(df_patients, 5)

[14209, 9262, 12873, 9077, 12241]