In [1]:
import os
import glob

from skimage.io import imread
from skimage.transform import resize
import numpy as np

from tensorflow.keras.utils import normalize
from sklearn.model_selection import train_test_split

In [8]:
# gethering image paths across folders and related data information from filename
# each element contain patiend_id, idx5, patch X coord, patch Y coord, class of patch ( 0 - non-IDK, 1 - IDK )

def list_of_data_images(img_folder='IDC_regular_ps50_idx5'):
    
    path = os.path.join(os.pardir, os.pardir, img_folder,'*','*','*')
    paths = glob.glob(path)

    str_preproc=[]
    for img_path in paths:
        img_name = os.path.basename(img_path)
        img_name = img_name.split('_')

        for i in range(2,len(img_name)):
            img_name[i] = int(''.join(filter(str.isdigit, img_name[i])))
        img_name.append(os.path.abspath(img_path))
        str_preproc.append(img_name)

    print(f"Number of data elements in list: {len(str_preproc)}")
    return str_preproc

In [9]:
list_data_img = list_of_data_images()
print(list_data_img[:10])

Number of data elements in list: 277524
[['10253', 'idx5', 1001, 1001, 0, 'c:\\Users\\Igor\\DSI\\workspace\\team_project2\\IDC_regular_ps50_idx5\\10253\\0\\10253_idx5_x1001_y1001_class0.png'], ['10253', 'idx5', 1001, 1051, 0, 'c:\\Users\\Igor\\DSI\\workspace\\team_project2\\IDC_regular_ps50_idx5\\10253\\0\\10253_idx5_x1001_y1051_class0.png'], ['10253', 'idx5', 1001, 1101, 0, 'c:\\Users\\Igor\\DSI\\workspace\\team_project2\\IDC_regular_ps50_idx5\\10253\\0\\10253_idx5_x1001_y1101_class0.png'], ['10253', 'idx5', 1001, 1151, 0, 'c:\\Users\\Igor\\DSI\\workspace\\team_project2\\IDC_regular_ps50_idx5\\10253\\0\\10253_idx5_x1001_y1151_class0.png'], ['10253', 'idx5', 1001, 1201, 0, 'c:\\Users\\Igor\\DSI\\workspace\\team_project2\\IDC_regular_ps50_idx5\\10253\\0\\10253_idx5_x1001_y1201_class0.png'], ['10253', 'idx5', 1001, 1251, 0, 'c:\\Users\\Igor\\DSI\\workspace\\team_project2\\IDC_regular_ps50_idx5\\10253\\0\\10253_idx5_x1001_y1251_class0.png'], ['10253', 'idx5', 1001, 1301, 0, 'c:\\Users\\Ig

In [30]:
# convert images to 3 dim  numpy array

img_size = ( 50, 50 )
images_dataset = []
for img in list_data_img:    
    image = np.asarray(imread(img[len(img)-1] ), dtype='uint8')
    if image.shape != (50, 50, 3):
        image = resize( image, img_size, mode='reflect', preserve_range=True ) # resize images to given size and dimmensions
    images_dataset.append( image )

images_dataset = np.asarray( images_dataset, dtype='float32' )                     # convert to NumPy array
images_dataset = normalize( images_dataset )
print(f"\n shape: {images_dataset.shape}")


 shape: (277524, 50, 50, 3)


### Optinal step. 
### Save and load data to pkl file.

In [2]:
import pickle

# Save the array to a pickle file
# images_dataset - NumPy dataset
# parts - split to save in separated files. More parts less memory need 
def SaveNpToPkl( filename, images_dataset, parts = 1 ):
    start_ind = 0
    step = images_dataset.shape[0] * parts//100
    for i in range(0, parts):
        filename_pkl = filename + str(i) +'.pkl'
        start_ind = step*i
        if i != parts-1:
            end_ind = step*(i+1) 
        else:
            end_ind = images_dataset.shape[0] 
    
        print( f" Dataset with indexes {start_ind}, {end_ind} saved to file {filename_pkl}" )
        with open(filename_pkl, 'wb') as f:
            pickle.dump(images_dataset[start_ind : end_ind], f)

# Load the array from the pickle file
# parts - parts to load
def LoadPklToNp( filename, parts=1 ):
    total_list = []
    paths = glob.glob(filename)
    for path in paths[:parts]:
        print(f"Loading file {path}")
        with open(path, 'rb') as f:
            part_array = pickle.load(f)
        total_list += list(part_array)
        #total_list = np.concatenate(( total_array, array), axis=0 )
    return total_list


In [25]:
# call numpy to pkl save func.

# Specify relative path and the filename
parts = 10
filename = os.path.join( os.pardir, 'data', 'processed', 'img_dataset')
SaveNpToPkl( filename, images_dataset, parts )

filename = os.path.join( os.pardir, 'data', 'processed', 'lables_data')
SaveNpToPkl( filename, np.asarray(list_data_img), parts )

 Dataset with indexes 0, 27752 saved to file ..\data\processed\img_dataset0.pkl
 Dataset with indexes 27752, 55504 saved to file ..\data\processed\img_dataset1.pkl
 Dataset with indexes 55504, 83256 saved to file ..\data\processed\img_dataset2.pkl
 Dataset with indexes 83256, 111008 saved to file ..\data\processed\img_dataset3.pkl
 Dataset with indexes 111008, 138760 saved to file ..\data\processed\img_dataset4.pkl
 Dataset with indexes 138760, 166512 saved to file ..\data\processed\img_dataset5.pkl
 Dataset with indexes 166512, 194264 saved to file ..\data\processed\img_dataset6.pkl
 Dataset with indexes 194264, 222016 saved to file ..\data\processed\img_dataset7.pkl
 Dataset with indexes 222016, 249768 saved to file ..\data\processed\img_dataset8.pkl
 Dataset with indexes 249768, 277524 saved to file ..\data\processed\img_dataset9.pkl


In [68]:
# call pkl loader
parts_load = 10
Load_path = os.path.join( os.pardir, 'data', 'processed', 'img_dataset*' )
images_dataset = np.asarray ( LoadPklToNp( Load_path, parts_load ), dtype='float32')

Load_path = os.path.join( os.pardir, 'data', 'processed', 'lables_data*' )
labels_dataset = np.asarray(np.asarray (LoadPklToNp( Load_path, parts_load ) )[:, 4:5], dtype='i1' )                   # [:, 4:5] - retrive only label values 

Loading file ..\data\processed\img_dataset0.pkl
Loading file ..\data\processed\img_dataset1.pkl
Loading file ..\data\processed\img_dataset2.pkl
Loading file ..\data\processed\img_dataset3.pkl
Loading file ..\data\processed\img_dataset4.pkl
Loading file ..\data\processed\img_dataset5.pkl
Loading file ..\data\processed\img_dataset6.pkl
Loading file ..\data\processed\img_dataset7.pkl
Loading file ..\data\processed\img_dataset8.pkl
Loading file ..\data\processed\img_dataset9.pkl
Loading file ..\data\processed\lables_data0.pkl
Loading file ..\data\processed\lables_data1.pkl
Loading file ..\data\processed\lables_data2.pkl
Loading file ..\data\processed\lables_data3.pkl
Loading file ..\data\processed\lables_data4.pkl
Loading file ..\data\processed\lables_data5.pkl
Loading file ..\data\processed\lables_data6.pkl
Loading file ..\data\processed\lables_data7.pkl
Loading file ..\data\processed\lables_data8.pkl
Loading file ..\data\processed\lables_data9.pkl


In [69]:
x_train, x_test, y_train, y_test = train_test_split( images_dataset, labels_dataset, test_size=0.15, stratify=labels_dataset, random_state=42)
x_train, x_val, y_train, y_val = train_test_split( x_train, y_train, test_size=0.15, stratify=y_train, random_state=42)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((88806, 50, 50, 3), (22202, 50, 50, 3), (88806, 1), (22202, 1))

In [62]:
train_true = np.where ( y_train==1 )
print(train_true[0][:100])
test_true = np.where ( y_test==1 )
print(test_true[0][:100])

[  1   6   7  18  19  21  25  30  33  40  46  47  48  49  51  53  56  60
  63  64  66  69  77  78  79  81  82  83  88  89  94  97 101 107 110 124
 131 135 136 138 141 143 150 152 156 157 158 164 166 170 171 173 174 178
 179 180 187 191 192 195 201 204 205 208 209 211 217 218 219 222 224 226
 227 230 231 233 238 242 248 253 255 256 259 260 263 266 274 275 277 283
 294 299 303 304 315 320 329 330 331 338]
[  1   4  14  17  18  20  21  26  28  30  33  36  37  39  41  43  59  61
  63  65  67  68  76  78  80  81  82  83  86  88  89  94  98 101 104 105
 106 107 108 110 117 121 122 125 131 135 136 140 141 144 148 149 153 154
 156 157 160 166 168 169 171 172 173 177 179 183 190 192 193 195 196 198
 201 202 205 214 219 220 223 228 232 233 239 242 245 246 247 248 249 252
 253 256 258 262 263 266 269 270 278 283]


In [79]:
from keras import optimizers
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, accuracy_score

model_base = Sequential([
    Conv2D( filters=32, kernel_size=(3, 3), strides=2 ),
    Conv2D( filters=256, kernel_size=(3, 3), padding="same" ),
    MaxPooling2D(),
    Flatten(),
    Dense( 1, activation='softmax' )
])


model_base.compile( loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [81]:

history = model_base.fit( x_train[:100000], y_train[:100000], epochs=10, batch_size=128, validation_data=(x_val, y_val) )

Epoch 1/10

KeyboardInterrupt: 

In [None]:
pred = model_base.predict(x_test)



In [80]:
pred

array([[0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.999

In [75]:
accuracy_score(pred, y_test[:100])

ValueError: Classification metrics can't handle a mix of continuous and binary targets