In [1]:
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.models import load_model
import h5py
from tensorflow.keras import __version__ as keras_version
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Flatten, Dense, Activation, Lambda, Cropping2D, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import GlobalMaxPooling2D

In [2]:
import os
import csv
import cv2
import sys
import glob
import random
import collections
import pandas as pd
import itertools
import numpy as np
import seaborn as sns
from tqdm import tqdm
from google.colab import drive

from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn import model_selection

import xml.etree.ElementTree as ET

In [3]:
# List the current working directory to see what the fresh runtime contains.
!ls

sample_data


In [4]:
# Move one level up; per the cell output this lands in the filesystem root (`/`).
%cd ..

/


In [5]:
# List the new working directory to confirm the change of directory took effect.
!ls

bin	 datalab  home	 lib64	opt	    root  srv		     tmp    var
boot	 dev	  lib	 media	proc	    run   sys		     tools
content  etc	  lib32  mnt	python-apt  sbin  tensorflow-1.15.2  usr


In [6]:
# Mount Google Drive at an absolute path so the cell does not depend on the
# kernel's current working directory (the relative 'content/gdrive' only
# resolved correctly after a prior `%cd ..` to `/`).
drive.mount('/content/gdrive')

Mounted at content/gdrive


In [7]:
!ln -s /content/gdrive/My\ Drive/ /mydrive

In [8]:
# Check that the /mydrive alias resolves by listing the Drive root.
# NOTE(review): the captured output lists personal file names; clear this
# cell's output before sharing or committing the notebook.
!ls /mydrive

'CMPE 255 Social Distance Detection_Final.pptx'
'Colab Notebooks'
'Copy of Lesion Detection for ESD in Colon.gslides'
 Indeed_Report.gdoc
'My Class Schedule.pdf'
 MyResume.pdf
 PolypsSet
 Research_Paper_Final.pptx
'SJSU Self-Service Transcript.pdf'
 SocialDistanceDetection.docx
'SPARSHA_Mentor Family Agreement.docx.docx'
 SparshaRamakrishna_Resume.pdf
'Untitled document.gdoc'
'Untitled presentation.gslides'


In [9]:
# List the dataset folder; the output shows the train/val/test splits.
!ls /mydrive/PolypsSet/

test2019  train2019  val2019


In [10]:
def get_filepaths(basepath, remove_ext=False):
    """List the entries of a directory as parallel path / name lists.

    Entries are returned sorted by (stem, extension). ``os.listdir`` gives no
    ordering guarantee, and callers pair the results of separate calls (for
    example images and their annotations) by position, so a deterministic,
    stem-based order is required for those pairs to line up.

    Args:
        basepath: Directory to list.
        remove_ext: Only the literal ``True`` strips the final extension from
            each name. Any other value (including truthy strings such as
            ``".jpg"``, which existing callers pass positionally) leaves the
            names untouched.

    Returns:
        ``(files, filenames)`` where ``files[i]`` is
        ``os.path.join(basepath, filenames[i])``.
    """
    strip_ext = remove_ext is True
    files = []
    filenames = []
    for filename in sorted(os.listdir(basepath), key=os.path.splitext):
        # splitext drops only the last suffix, so names with several dots (or
        # none at all) are handled instead of raising on tuple unpacking.
        name = os.path.splitext(filename)[0] if strip_ext else filename
        files.append(os.path.join(basepath, name))
        filenames.append(name)
    return files, filenames

In [11]:
def get_filepaths_videonum_dirs_xy(X_basepath, y_basepath):
    """Collect index-aligned image / annotation paths from per-video folders.

    Sub-directories of ``X_basepath`` and ``y_basepath`` are paired
    positionally after sorting their names (``os.listdir`` order is
    arbitrary). Within each pair, only file stems present in BOTH directories
    are kept, in sorted order, so the i-th image always corresponds to the
    i-th annotation even when one side has extra files.

    Images are assumed to end in ``.jpg`` and annotations in ``.xml``; those
    suffixes are re-attached to the stems returned by ``get_filepaths``.

    Args:
        X_basepath: Directory containing one sub-directory of images per video.
        y_basepath: Directory containing one sub-directory of annotations per
            video.

    Returns:
        ``(X_filepaths, X_filenames, y_filepaths, y_filenames)``, four lists
        of equal length.
    """
    videonum_X_filepaths = []
    videonum_X_filenames = []
    videonum_y_filepaths = []
    videonum_y_filenames = []

    X_dirs = sorted(os.listdir(X_basepath))
    y_dirs = sorted(os.listdir(y_basepath))
    for videonum_X_dir, videonum_y_dir in zip(X_dirs, y_dirs):
        videonum_X_dirpath = os.path.join(X_basepath, videonum_X_dir)
        videonum_y_dirpath = os.path.join(y_basepath, videonum_y_dir)

        _, X_stems = get_filepaths(videonum_X_dirpath, remove_ext=True)
        _, y_stems = get_filepaths(videonum_y_dirpath, remove_ext=True)

        # Set intersection keeps only stems that have both an image and an
        # annotation, and avoids the quadratic list-membership scan.
        for stem in sorted(set(X_stems) & set(y_stems)):
            X_name = stem + ".jpg"
            y_name = stem + ".xml"
            videonum_X_filenames.append(X_name)
            videonum_X_filepaths.append(os.path.join(videonum_X_dirpath, X_name))
            videonum_y_filenames.append(y_name)
            videonum_y_filepaths.append(os.path.join(videonum_y_dirpath, y_name))

    return videonum_X_filepaths, videonum_X_filenames, videonum_y_filepaths, videonum_y_filenames

In [12]:
def read_img(img_path, flag = cv2.IMREAD_COLOR):
    """Read an image from disk with OpenCV.

    Args:
        img_path: Path of the image file.
        flag: ``cv2.imread`` read mode. With the default ``IMREAD_COLOR`` the
            result has shape (height, width, 3).

    Returns:
        The decoded image array.

    Raises:
        OSError: If OpenCV could not read or decode the file. ``cv2.imread``
            reports that by returning ``None`` rather than raising, which
            would otherwise surface later as an obscure error.
    """
    image = cv2.imread(img_path, flag)
    if image is None:
        raise OSError("cv2.imread could not read image: %r" % (img_path,))
    return image


In [13]:
def get_xml_label_names(xml_files):
    """Extract one class label per XML annotation file.

    Only the first ``<object>`` element of each file is consulted; its
    ``<name>`` text is the label.

    Args:
        xml_files: Iterable of paths to XML annotation files.

    Returns:
        A list with one entry per file, in input order. Files without an
        ``<object>`` (or whose first object has no usable ``<name>``) yield
        the sentinel ``"Not Specified"``.
    """
    label_names = []
    for xml_file in tqdm(xml_files):
        root = ET.parse(xml_file).getroot()
        first_object = root.find("object")
        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truthiness test would be wrong here.
        name_node = first_object.find("name") if first_object is not None else None
        if name_node is None or name_node.text is None:
            label_names.append("Not Specified")
        else:
            label_names.append(name_node.text)
    return label_names

def get_xml_boundboxes(xml_files):
    """Extract the first bounding box from each XML annotation file.

    Only the first ``<object>`` element of each file is consulted.

    Args:
        xml_files: Iterable of paths to XML annotation files.

    Returns:
        A list with one entry per file, in input order. Each entry is either
        a tuple ``(xmin, ymin, xmax, ymax)`` of the raw element texts (strings,
        not numbers) or ``False`` when the file has no ``<object>`` or that
        object has no ``<bndbox>``.
    """
    corner_tags = ("xmin", "ymin", "xmax", "ymax")
    bound_boxes = []
    for xml_file in tqdm(xml_files):
        root = ET.parse(xml_file).getroot()
        first_object = root.find("object")
        # Explicit None checks: childless Elements are falsy.
        bndbox = first_object.find("bndbox") if first_object is not None else None
        if bndbox is None:
            bound_boxes.append(False)
        else:
            bound_boxes.append(tuple(bndbox.find(tag).text for tag in corner_tags))
    return bound_boxes

In [14]:
def resize_image(image, size):
    """Resize an image to a square of ``size`` x ``size`` pixels.

    Args:
        image: Image array accepted by ``cv2.resize``.
        size: Target width and height in pixels.

    Returns:
        The resized image. (The previous version returned the function object
        ``resize_image`` itself because of a name typo.)
    """
    resized_image = cv2.resize(image, (size, size))
    return resized_image

def get_images(image_group, size, flag = cv2.IMREAD_COLOR):
    """Load every image in ``image_group`` and resize it to a square.

    Args:
        image_group: Iterable of image file paths.
        size: Target width and height in pixels.
        flag: ``cv2.imread`` read mode forwarded to ``read_img``.

    Returns:
        A list of resized image arrays, in input order.

    Raises:
        OSError: If an image could not be read. Without this check a failed
            read (``None``) would only surface as an opaque assertion error
            inside ``cv2.resize``.
    """
    images = []
    for image_path in tqdm(image_group):
        image = read_img(image_path, flag)
        if image is None:
            raise OSError("could not read image: %r" % (image_path,))
        images.append(cv2.resize(image, (size, size)))
    return images

def remove_non_labeled_polyps(img_filepaths, img_filenames, polyp_names):
    """Drop every sample whose label is the "Not Specified" sentinel.

    The three inputs are walked in lockstep (stopping at the shortest one);
    a sample survives only if its label differs from "Not Specified".

    Returns:
        A tuple of three lists: the surviving file paths, file names and
        labels, still aligned by index.
    """
    labeled = [
        sample
        for sample in zip(img_filepaths, img_filenames, polyp_names)
        if sample[2] != "Not Specified"
    ]
    kept_paths = [path for path, _, _ in labeled]
    kept_names = [name for _, name, _ in labeled]
    kept_labels = [label for _, _, label in labeled]
    return kept_paths, kept_names, kept_labels

In [15]:
# Annotation XML files for the training split.
train_y_basepath = "/mydrive/PolypsSet/train2019/Annotation"
file_type = ".xml"
# get_filepaths' second parameter is `remove_ext`, not an extension filter;
# passing file_type there was silently ignored, so it is no longer passed.
train_y_filepaths, train_y_filenames = get_filepaths(train_y_basepath)

In [16]:
# Image files for the training split.
train_X_basepath = "/mydrive/PolypsSet/train2019/Image"
file_type = ".jpg"
# get_filepaths' second parameter is `remove_ext`, not an extension filter;
# passing file_type there was silently ignored, so it is no longer passed.
train_X_filepaths, train_X_filenames = get_filepaths(train_X_basepath)

In [17]:
# Read the class label from each annotation file ("Not Specified" when the
# file contains no <object>).
train_y_polyp_names = get_xml_label_names(train_y_filepaths)

100%|██████████| 121/121 [00:01<00:00, 89.24it/s]


In [18]:

# Keep only the samples that carry a real label.
# NOTE(review): images and labels are matched purely by list position, so this
# relies on train_X_* and train_y_polyp_names being in the same order — verify.
cln_train_X_filepaths, cln_train_X_filenames, cln_train_y_polyp_names = remove_non_labeled_polyps(
    train_X_filepaths, train_X_filenames, train_y_polyp_names)

In [19]:
# Load the labelled training images, resized to 192x192 (the CNN input size
# used further down).
train_X_images = get_images(cln_train_X_filepaths, 192)

100%|██████████| 114/114 [00:38<00:00,  2.94it/s]


In [20]:
# Stack the list of equally-sized images into a single array.
X_train = np.array(train_X_images)

In [21]:
# Scale 8-bit pixel values to [0, 1]. Recomputed from the source image list so
# that re-running this cell never divides an already-scaled array again.
X_train = np.array(train_X_images) / 255

In [22]:
# Encode the string labels of the original (non-augmented) images as integers.
label_enc = LabelEncoder()
train_y = label_enc.fit_transform(cln_train_y_polyp_names)
# Take the class names from the fitted encoder so that index i in `classes`
# is guaranteed to be the label encoded as integer i.
classes = label_enc.classes_.tolist()

In [23]:
import albumentations as A

In [24]:
# Number of augmented images (and matching label files) to write to disk.
images_to_generate = 2000

In [31]:
# Output folders for the augmented dataset.
img_augmented_path="/mydrive/PolypsSet/train2019/aug_img_192/"  # augmented .jpg images
label_augmented_path="/mydrive/PolypsSet/train2019/aug_label_192/" # one .txt class-label file per augmented image

In [32]:
# Augmentation pipeline; transforms run in the listed order and `p` is the
# probability that each one is applied.
aug = A.Compose([
    # Applied to roughly half of the samples.
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(p=0.5),
    # p=1: applied to every sample.
    A.HorizontalFlip(p=1),
    A.Transpose(p=1),
    A.GridDistortion(p=1)
])

In [33]:
# Write `images_to_generate` augmented images, each with a .txt file holding
# the class label of the source image it was derived from.

# Create the output folders if needed so the first run does not fail.
os.makedirs(img_augmented_path, exist_ok=True)
os.makedirs(label_augmented_path, exist_ok=True)

# Make the choice of source images repeatable across runs.
# NOTE(review): albumentations may use its own random state depending on the
# installed version; seed it separately if exact reproducibility is needed.
random.seed(42)

# Only the first 100 labelled images are used as augmentation sources.
num_source_images = min(100, len(train_X_images))

for i in range(1, images_to_generate + 1):
    number = random.randrange(num_source_images)
    orig_image = train_X_images[number]
    orig_label = cln_train_y_polyp_names[number]

    transformed_image = aug(image=orig_image)["image"]

    # Images were loaded with OpenCV (BGR); convert to RGB before saving with
    # matplotlib so the colours on disk are correct.
    transformed_image = cv2.cvtColor(transformed_image, cv2.COLOR_BGR2RGB)

    new_image_path = os.path.join(img_augmented_path, "aug_image_%s.jpg" % i)
    new_label_path = os.path.join(label_augmented_path, "aug_label_%s.txt" % i)

    plt.imsave(new_image_path, transformed_image)
    # The context manager closes the file even if the write fails.
    with open(new_label_path, "w") as label_file:
        label_file.write(orig_label)


In [35]:
# Augmented training images written by the augmentation loop.
aug_train_X_basepath = "/mydrive/PolypsSet/train2019/aug_img_192"
file_type = ".jpg"
# get_filepaths' second parameter is `remove_ext`, not an extension filter;
# passing file_type there was silently ignored, so it is no longer passed.
aug_train_X_filepaths, aug_train_X_filenames = get_filepaths(aug_train_X_basepath)

In [36]:
# Load the augmented images back from disk at 192x192.
aug_train_X_images = get_images(aug_train_X_filepaths, 192)

100%|██████████| 2000/2000 [00:09<00:00, 218.24it/s]


In [37]:
# Stack the augmented images into a single array for training.
aug_X_train = np.array(aug_train_X_images)

In [38]:
# Scale 8-bit pixel values to [0, 1]. Recomputed from the source image list so
# that re-running this cell never divides an already-scaled array again.
aug_X_train = np.array(aug_train_X_images) / 255

In [40]:
aug_train_label_basepath = "/mydrive/PolypsSet/train2019/aug_label_192"
file_type = ".txt"
# Derive each label path from its image name (aug_image_N.jpg -> aug_label_N.txt)
# instead of listing the label folder separately: two independent directory
# listings are not guaranteed to come back in the same order, which would
# silently pair images with the wrong labels.
aug_train_label_filenames = [
    os.path.splitext(image_name)[0].replace("aug_image_", "aug_label_", 1) + file_type
    for image_name in aug_train_X_filenames
]
aug_train_label_filepaths = [
    os.path.join(aug_train_label_basepath, label_name)
    for label_name in aug_train_label_filenames
]

In [41]:
def get_txt_label_names(txt_files):
    """Read the class label stored on the first line of each text file.

    Args:
        txt_files: Iterable of paths to label files.

    Returns:
        A list with one label per file, in input order. Surrounding
        whitespace (including a trailing newline) is stripped so that
        "adenomatous" and "adenomatous\\n" are not treated as different
        classes.
    """
    label_names = []
    for txt_file in tqdm(txt_files):
        # Read-only mode: the file is never modified, and "r+" would fail on
        # files without write permission.
        with open(txt_file, "r") as label_file:
            label_names.append(label_file.readline().strip())
    return label_names

In [42]:
# Read the string label of every augmented image from its .txt file.
aug_y_train = get_txt_label_names(aug_train_label_filepaths)

100%|██████████| 2000/2000 [00:02<00:00, 847.19it/s]


In [43]:
# Encode the augmented-set labels as integers; these are the training targets.
label_enc = LabelEncoder()
train_y = label_enc.fit_transform(aug_y_train)
# Take the class names from the fitted encoder so that index i in `classes`
# is guaranteed to be the label encoded as integer i.
classes = label_enc.classes_.tolist()
# The model below ends in a single sigmoid unit, so exactly two classes are
# required; fail loudly here rather than train on malformed labels.
assert len(classes) == 2, "expected exactly 2 classes, got %r" % (classes,)

In [44]:
# Binary image classifier, assembled layer by layer:
# two convolution + max-pooling stages followed by a small dense head.
cnn = models.Sequential()

# Stage 1: 32 filters of 3x3 over 192x192 three-channel inputs.
cnn.add(layers.Conv2D(filters=32, kernel_size=(3,3), activation="relu", input_shape=(192,192,3)))
cnn.add(layers.MaxPooling2D((2,2)))

# Stage 2: 64 filters of 3x3.
cnn.add(layers.Conv2D(filters=64, kernel_size=(3,3), activation="relu"))
cnn.add(layers.MaxPooling2D((2,2)))

# Dense head: flatten, one hidden layer, then a single sigmoid output unit.
cnn.add(layers.Flatten())
cnn.add(layers.Dense(64, activation="relu"))
cnn.add(layers.Dense(1, activation="sigmoid"))

In [45]:
# Binary-classification training setup: Adam optimiser, binary cross-entropy
# loss (matches the single sigmoid output), accuracy as the reported metric.
cnn.compile(optimizer="adam",
            loss="binary_crossentropy",
            metrics=["accuracy"])

In [47]:
# Train for 5 epochs on the augmented set, holding out 20% for validation.
# NOTE(review): every augmented image is derived from the same pool of source
# images (see the augmentation loop), so the held-out 20% contains variants of
# images also seen in training; validation accuracy likely overstates
# generalisation — confirm on a separate test set.
cnn.fit(aug_X_train, train_y, validation_split = 0.2, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc2fb603d50>

In [48]:
# TODO: upload part of the test set to Google Drive (as was done for the
# training set), load it, and evaluate on that data instead.
# NOTE(review): the call below scores the first 100 samples of the augmented
# *training* arrays, so the reported loss/accuracy is not a test-set result.
cnn.evaluate(aug_X_train[:100], train_y[:100])




[0.39161378145217896, 0.8500000238418579]