In [23]:
import os
import cv2
import glob
import shutil
import urllib
import numpy as np
import pandas as pd
from PIL import Image
import seaborn as sns
from time import time
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras as keras
from tensorflow.keras import optimizers
import tensorflow.keras.models as models
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
import tensorflow.keras.estimator as estimator
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications import vgg16, vgg19, resnet, inception_v3
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, InputLayer
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img

# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(123)

# Target image size shared by the loading/resizing code further down.
# NOTE(review): this tuple is handed to both Keras `load_img(target_size=...)`,
# which reads it as (height, width), and OpenCV `cv2.resize(..., dsize)`, which
# reads it as (width, height) -- confirm the orientation at each use.
img_dim = (300, 200)

# Build locations from the current user's home directory instead of baking one
# machine's absolute path into the notebook. The trailing separator on data_dir
# is kept because other cells concatenate file names directly onto it.
home_dir = os.path.expanduser('~')
data_dir = os.path.join(home_dir, 'data', '')
model_zoo = os.path.join(
    home_dir, 'models', 'research', 'object_detection', 'g3doc', 'tf2_detection_zoo.md')

In [24]:
## Write label-map pbtxt file
# One entry per class, keyed by a unique integer id. A dict literal that
# repeats a key silently keeps only the last value, so each class needs its
# own id; 1/2 matches the class ids written into the TFRecord examples.
label_map = {1: 'benign', 2: 'malignant'}

def write_label_map(data_dir, mapping):
    """Write a label-map ``label_map.pbtxt`` file with one ``item`` per class.

    Args:
        data_dir: Prefix for the output file. It is concatenated directly
            with ``label_map.pbtxt``, so it must end with a path separator.
        mapping: Dict of integer class id -> class name.

    The file is opened in write mode so that re-running the cell replaces the
    label map instead of appending a second copy of every ``item`` block.
    """
    out_path = "{}label_map.pbtxt".format(data_dir)
    with open(out_path, 'w') as the_file:
        for i, lab in mapping.items():
            the_file.write('item\n')
            the_file.write('{\n')
            the_file.write('name: "{}"\n'.format(lab))
            the_file.write('id: {}\n'.format(i))
            the_file.write('display_name: "{}"\n'.format(lab))
            the_file.write('}\n')
    print('Saved to {}'.format(out_path))

In [25]:
# Write the label map under data_dir; the function prints the destination path.
write_label_map(data_dir, label_map)

Saved to /Users/jacksimonson/data/label_map.pbtxt


In [26]:
def extract_bboxes(mask_path, img_dim):
    """Compute the bounding box of the non-zero region of a mask image.

    Args:
        mask_path: Path of the mask image file to load.
        img_dim: Size the mask is resized to while loading; forwarded to
            ``load_img`` as ``target_size`` (Keras reads it as
            ``(height, width)``).

    Returns:
        ``[x1, y1, x2, y2]`` in pixels of the resized mask, where x indexes
        columns and y indexes rows. ``x2`` and ``y2`` are exclusive (one past
        the last non-zero column/row). All four values are 0 when the first
        channel of the mask has no non-zero pixel.
    """
    mask = img_to_array(load_img(mask_path, target_size=img_dim))

    # Only the first channel is inspected; any non-zero value counts as mask.
    # NOTE(review): callers pass '.jpg' masks; lossy compression may leave
    # faint non-zero pixels outside the lesion, which would enlarge the box --
    # consider thresholding instead of `!= 0`.
    m = mask[:, :, 0] != 0

    # Columns / rows that contain at least one mask pixel.
    horizontal_indicies = np.where(np.any(m, axis=0))[0]
    vertical_indicies = np.where(np.any(m, axis=1))[0]

    if horizontal_indicies.shape[0] == 0:
        # No mask for this instance. Might happen due to resizing or
        # cropping. Set bbox to zeros.
        return [0, 0, 0, 0]

    x1, x2 = horizontal_indicies[[0, -1]]
    y1, y2 = vertical_indicies[[0, -1]]

    # x2 and y2 should not be part of the box, so move them one past the edge.
    return [x1, y1, x2 + 1, y2 + 1]

In [27]:
# for img in mask_train:
#     file = img.split('/')[-1].replace('.png', '.jpg')
#     img_path = os.path.join(os.path.join(data_dir, train_mask_path), img)
#     print(img_path)
#     im = cv2.imread(img_path, cv2.IMREAD_COLOR)
#     print(im.shape)
#     cv2.imwrite(os.path.join(os.path.join(data_dir, train_mask_jpg), file), im)

In [28]:
# for img in mask_test:
#     file = img.split('/')[-1].replace('.png', '.jpg')
#     img_path = os.path.join(os.path.join(data_dir, test_mask_path), img)
#     print(img_path)
#     im = cv2.imread(img_path, cv2.IMREAD_COLOR)
#     print(im.shape)
#     cv2.imwrite(os.path.join(os.path.join(data_dir, test_mask_jpg), file), im)

In [29]:
# Sub-directory names under data_dir. The '-jpg' folders are the ones actually
# read below (presumably JPEG copies of the originals -- confirm).
train_image_path, train_jpg = 'training-image', 'training-image-jpg'
test_image_path, test_jpg = 'test-image', 'test-image-jpg'
train_mask_path, train_mask_jpg = 'training-mask', 'training-mask-jpg'
test_mask_path, test_mask_jpg = 'test-mask', 'test-mask-jpg'


def _jpg_names(subdir):
    """Return the '.jpg' file names found directly inside data_dir/subdir."""
    folder = os.path.join(data_dir, subdir)
    return [name for name in os.listdir(folder) if name.endswith('.jpg')]


# File names (not full paths) of the mammograms and masks present on disk.
train_images = _jpg_names(train_jpg)
test_images = _jpg_names(test_jpg)
mask_train = _jpg_names(train_mask_jpg)
mask_test = _jpg_names(test_mask_jpg)

# files = train_images, train_image_path, train_jpg
# for img in train_images:
#     file = img.split('/')[-1].replace('.png', '.jpg')
#     img_path = os.path.join(os.path.join(data_dir, train_image_path), img)
#     im = cv2.imread(img_path, cv2.IMREAD_COLOR)
#     cv2.imwrite(os.path.join(os.path.join(data_dir, train_jpg), file), im)

# for img in test_images:
#     file = img.split('/')[-1].replace('.png', '.jpg')
#     img_path = os.path.join(os.path.join(data_dir, test_image_path), img)
#     im = cv2.imread(img_path, cv2.IMREAD_COLOR)
#     cv2.imwrite(os.path.join(os.path.join(data_dir, test_jpg), file), im)
    
# for img in mask_train:
#     file = img.split('/')[-1].replace('.png', '.jpg')
#     img_path = os.path.join(os.path.join(data_dir, train_mask_path), img)
#     im = cv2.imread(img_path, cv2.IMREAD_COLOR)
#     cv2.imwrite(os.path.join(os.path.join(data_dir, train_mask_jpg), file), im)
    
# for img in mask_test:
#     file = img.split('/')[-1].replace('.png', '.jpg')
#     img_path = os.path.join(os.path.join(data_dir, test_mask_path), img)
#     im = cv2.imread(img_path, cv2.IMREAD_COLOR)
#     cv2.imwrite(os.path.join(os.path.join(data_dir, test_mask_jpg), file), im)


def _load_resized_images(paths, size_hw):
    """Read each image as 3-channel colour and resize it to ``size_hw``.

    Args:
        paths: Iterable of image file paths.
        size_hw: Target size as ``(height, width)``.

    Returns:
        ``np.ndarray`` stacking the resized images in the order of ``paths``.

    Raises:
        ValueError: If OpenCV cannot read a file (``cv2.imread`` returns
            ``None`` rather than raising).
    """
    target_h, target_w = size_hw
    resized = []
    for path in paths:
        im = cv2.imread(path, cv2.IMREAD_COLOR)
        if im is None:
            raise ValueError('Could not read image: {}'.format(path))
        # cv2.resize takes dsize as (width, height). img_dim is also given to
        # Keras load_img as target_size, i.e. (height, width), when the mask
        # bounding boxes are computed, so swap here to keep the images in the
        # same pixel frame as those boxes.
        resized.append(cv2.resize(im, (target_w, target_h)))
    return np.array(resized)


# find intersection of mammograms and masks: keep only names that have both
train_file_names = sorted(set(train_images) & set(mask_train))
test_file_names = sorted(set(test_images) & set(mask_test))

# Full paths of the mammograms, in the same (sorted) order as the names.
train_files = [os.path.join(data_dir, train_jpg, name) for name in train_file_names]
test_files = [os.path.join(data_dir, test_jpg, name) for name in test_file_names]

# NOTE: train_images / test_images are rebound here from lists of file names
# to image arrays, so this cell cannot be re-run without re-running the
# directory listing first.
train_images = _load_resized_images(train_files, img_dim)
test_images = _load_resized_images(test_files, img_dim)

def _label_from_path(path):
    """Return the text after the last '_' of the file name, without '.jpg', lower-cased."""
    file_name = path.split('/')[-1]
    last_token = file_name.split('_')[-1]
    return last_token.replace('.jpg', '').lower().strip()


# encode labels: the encoder is fitted on the training labels only and the
# same mapping is then applied to the test labels
le = LabelEncoder()

train_labels = list(map(_label_from_path, train_files))
le.fit(train_labels)
train_labels = le.transform(train_labels)

test_labels = le.transform(list(map(_label_from_path, test_files)))

# Mask paths are built from the same file-name lists as the mammogram paths,
# so entry i of each mask list belongs to entry i of train_files / test_files.
mask_train_images = [os.path.join(data_dir, train_mask_jpg, img_name) for img_name in train_file_names]
mask_test_images = [os.path.join(data_dir, test_mask_jpg, img_name) for img_name in test_file_names]

# get bounding boxes: one [x1, y1, x2, y2] row per mask
train_bound_boxes = np.array([extract_bboxes(img, img_dim) for img in mask_train_images if not img.endswith('.DS_Store')])
# The test boxes must come from the test masks; they were previously computed
# from mask_train_images, which paired test images with training boxes.
test_bound_boxes = np.array([extract_bboxes(img, img_dim) for img in mask_test_images if not img.endswith('.DS_Store')])

In [37]:
# Persist the prepared arrays. File names are joined onto data_dir so the
# cell follows the configured data location instead of one absolute path.
np.save(os.path.join(data_dir, 'training-labels.npy'), train_labels)
np.save(os.path.join(data_dir, 'test-labels.npy'), test_labels)
np.save(os.path.join(data_dir, 'test-images-array.npy'), test_images)
np.save(os.path.join(data_dir, 'training-images-array.npy'), train_images)
np.save(os.path.join(data_dir, 'training_bound_boxes.npy'), train_bound_boxes)
np.save(os.path.join(data_dir, 'test_bound_boxes.npy'), test_bound_boxes)

In [56]:
# Display the training image paths; train_labels is derived from this list in the same order.
train_files

['/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00005_RIGHT_CC_MALIGNANT.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00005_RIGHT_MLO_MALIGNANT.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00007_LEFT_CC_BENIGN.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00007_LEFT_MLO_BENIGN.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00011_LEFT_CC_BENIGN.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00011_LEFT_MLO_BENIGN.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00012_LEFT_CC_MALIGNANT.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00012_LEFT_MLO_MALIGNANT.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00013_RIGHT_MLO_BENIGN.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00014_LEFT_CC_MALIGNANT.jpg',
 '/Users/jacksimonson/data/training-image-jpg/Calc-Training_P_00014_LEFT_MLO_MALIGNANT.

In [50]:
# Reload the saved training labels from data_dir as a round-trip check.
labels = np.load(os.path.join(data_dir, 'training-labels.npy'))

In [51]:
# Peek at the first five encoded labels.
labels[:5]

array([1, 1, 0, 0, 0])

In [53]:
# Reload the saved training image array from data_dir.
imgs = np.load(os.path.join(data_dir, 'training-images-array.npy'))

In [57]:
# Save one example image per class. Indices 0 and 2 are hard-coded from the
# train_files listing shown earlier: entry 0 is a MALIGNANT file and entry 2
# a BENIGN one. They must be revisited if the file list changes.
np.save(os.path.join(data_dir, 'single_imgs_malignant.npy'), imgs[0])
np.save(os.path.join(data_dir, 'single_imgs_benign.npy'), imgs[2])

In [19]:
# train_labels = np.load('/Users/jacksimonson/data/training-labels.npy')
# test_labels = np.load('/Users/jacksimonson/data/test-labels.npy')

In [38]:
# Class name -> 1-based class id. Note this is a different encoding from the
# LabelEncoder labels displayed earlier, which contain 0 and 1.
class_map = {'benign':1, 'malignant':2}

In [21]:
# train_images = np.load('/Users/jacksimonson/data/training-images-array.npy')
# test_images = np.load('/Users/jacksimonson/data/test-images-array.npy')
# train_images.shape

In [39]:
import tensorflow as tf


def int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding an ``Int64List``."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)


def int64_list_feature(value):
    """Wrap an iterable of integers in a ``tf.train.Feature`` holding an ``Int64List``."""
    int_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int_list)


def bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` holding a ``BytesList``."""
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)


def bytes_list_feature(value):
    """Wrap an iterable of bytes values in a ``tf.train.Feature`` holding a ``BytesList``."""
    byte_list = tf.train.BytesList(value=value)
    return tf.train.Feature(bytes_list=byte_list)


def float_list_feature(value):
    """Wrap an iterable of floats in a ``tf.train.Feature`` holding a ``FloatList``."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)


def read_examples_list(path):
    """Read list of training or validation examples.

    The file is assumed to contain a single example per line where the first
    token in the line is an identifier that allows us to find the image and
    annotation xml for that example.

    For example, the line:
    xyz 3
    would allow us to find files xyz.jpg and xyz.xml (the 3 would be ignored).

    Args:
    path: absolute path to examples list file.

    Returns:
    list of example identifiers (strings): the text before the first space of
    each line, in file order.
    """
    # tf.gfile was removed from the top-level namespace in TensorFlow 2;
    # tf.io.gfile.GFile is the equivalent file API there.
    with tf.io.gfile.GFile(path) as fid:
        lines = fid.readlines()
    return [line.strip().split(' ')[0] for line in lines]


def recursive_parse_xml_to_dict(xml):
    """Recursively parses XML contents to python dict.

    We assume that `object` tags are the only ones that can appear
    multiple times at the same level of a tree.

    Args:
    xml: xml tree obtained by parsing XML file contents using lxml.etree

    Returns:
    Python dictionary holding XML contents. A leaf element maps its tag to its
    text; any other element maps its tag to a dict of its children, where all
    `object` children are collected into a list under the 'object' key.
    """
    # Count children explicitly rather than relying on element truthiness.
    if len(xml) == 0:
        return {xml.tag: xml.text}
    result = {}
    for child in xml:
        child_result = recursive_parse_xml_to_dict(child)
        if child.tag != 'object':
            result[child.tag] = child_result[child.tag]
        else:
            # Only repeated `object` entries are accumulated in a list. The
            # append has to stay inside this branch: for other tags the stored
            # value is a str/dict/None and has no append().
            result.setdefault(child.tag, []).append(child_result[child.tag])
    return {xml.tag: result}

In [40]:
def create_cat_tf_example(img_data):
    """Creates a tf.Example proto for one image and its single bounding box.

    Args:
    img_data: (img, box) pair. `img` is the path of the JPEG file; `box` is
        (x1, y1, x2, y2) in pixels. The box is normalised by a 200-wide,
        300-high frame, so it is assumed to have been measured on an image
        of that size.

    Returns:
    example: The created tf.Example.

    Raises:
    ValueError: if OpenCV cannot read the image file.
    """
    img, box = img_data

    im = cv2.imread(img, cv2.IMREAD_COLOR)
    if im is None:
        # cv2.imread signals failure by returning None; fail with the path.
        raise ValueError('Could not read image: {}'.format(img))
    # dsize is (width, height): the resized frame is 300 rows by 200 columns.
    im = cv2.resize(im, (200, 300))
    height = im.shape[0]
    width = im.shape[1]

    # Read the file's bytes, closing the handle even if the read fails.
    # NOTE(review): these are the bytes of the file on disk, not of the
    # resized frame, while image/height and image/width describe the resized
    # frame -- confirm that consumers do not need the two to agree.
    with open(img, 'rb') as fp:
        encoded_image_data = fp.read()

    x1, y1, x2, y2 = box

    file_name = img.split('/')[-1]
    filename = tf.compat.as_bytes(file_name)
    image_format = b'jpg'

    # Box corners as fractions of the frame size.
    xmins = [x1 / width]
    xmaxs = [x2 / width]
    ymins = [y1 / height]
    ymaxs = [y2 / height]

    # Decide the class from the file name only, so a directory whose name
    # happens to contain 'benign' cannot change the label.
    is_benign = 'benign' in file_name.lower()
    classes_text = [tf.compat.as_bytes('benign' if is_benign else 'malignant')]
    classes = [1 if is_benign else 2]

    tf_example = tf.train.Example(features=tf.train.Features(feature={
      'image/height': int64_feature(height),
      'image/width': int64_feature(width),
      'image/filename': bytes_feature(filename),
      'image/source_id': bytes_feature(filename),
      'image/encoded': bytes_feature(encoded_image_data),
      'image/format': bytes_feature(image_format),
      'image/object/bbox/xmin': float_list_feature(xmins),
      'image/object/bbox/xmax': float_list_feature(xmaxs),
      'image/object/bbox/ymin': float_list_feature(ymins),
      'image/object/bbox/ymax': float_list_feature(ymaxs),
      'image/object/class/text': bytes_list_feature(classes_text),
      'image/object/class/label': int64_list_feature(classes)}))

    return tf_example

In [44]:
# Serialise one tf.Example per (image path, bounding box) pair into
# train.tfrecord. The context manager closes the writer even if building an
# example raises, so a failure does not leave the file handle open.
with tf.io.TFRecordWriter(os.path.join(data_dir, 'train.tfrecord')) as writer:
    for img_data in zip(train_files, train_bound_boxes):
        tf_example = create_cat_tf_example(img_data)
        writer.write(tf_example.SerializeToString())

In [46]:
# Serialise one tf.Example per (image path, bounding box) pair into
# test.tfrecord. The context manager closes the writer even if building an
# example raises, so a failure does not leave the file handle open.
with tf.io.TFRecordWriter(os.path.join(data_dir, 'test.tfrecord')) as writer:
    for img_data in zip(test_files, test_bound_boxes):
        tf_example = create_cat_tf_example(img_data)
        writer.write(tf_example.SerializeToString())

In [47]:
import tensorflow as tf

# Re-read train.tfrecord and split it into `shards` files named
# train-<i>.tfrecord, where file i holds raw_dataset.shard(shards, i).
train_record_path = os.path.join(data_dir, 'train.tfrecord')
raw_dataset = tf.data.TFRecordDataset(train_record_path)

shards = 50
train_shard_prefix = os.path.join(data_dir, 'train')

for i in range(shards):
    shard_path = "{}-{}.tfrecord".format(train_shard_prefix, i)
    writer = tf.data.experimental.TFRecordWriter(shard_path)
    writer.write(raw_dataset.shard(shards, i))

In [49]:
# Re-read test.tfrecord and split it into `shards` files named
# test-<i>.tfrecord, where file i holds raw_dataset.shard(shards, i).
test_record_path = os.path.join(data_dir, 'test.tfrecord')
raw_dataset = tf.data.TFRecordDataset(test_record_path)

shards = 12
test_shard_prefix = os.path.join(data_dir, 'test')

for i in range(shards):
    shard_path = "{}-{}.tfrecord".format(test_shard_prefix, i)
    writer = tf.data.experimental.TFRecordWriter(shard_path)
    writer.write(raw_dataset.shard(shards, i))

In [None]:
# Push individual shards instead of one big one
# Update config file, push to repo
# Set up Paperspace project using this structure instead