# Classify Polyps for Colon Cancer

## Import Libraries

In [None]:
import os
import csv
import cv2
import sys
import collections
import pandas as pd
import itertools
import numpy as np
import seaborn as sns
from tqdm import tqdm

from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from sklearn.preprocessing import LabelEncoder

import xml.etree.ElementTree as ET

In [None]:
%matplotlib inline

## Helper Functions

In [None]:
def get_filepaths(basepath, file_type):
    """Collect the entries of ``basepath`` whose names end with ``file_type``.

    Args:
        basepath: Directory to scan (not recursive).
        file_type: Filename suffix to keep, e.g. ``".xml"``. Anything accepted
            by ``str.endswith`` works, including a tuple of suffixes.

    Returns:
        A ``(files, filenames)`` pair of equal-length lists in which
        ``files[i] == os.path.join(basepath, filenames[i])``. Both lists are
        sorted by filename.
    """
    # os.listdir gives no ordering guarantee. Sorting makes the result
    # reproducible, so two directories with matching stems (images and their
    # annotations) come back in the same relative order.
    filenames = sorted(
        name for name in os.listdir(basepath) if name.endswith(file_type)
    )
    # Build the paths from the already-filtered names so the two lists can
    # never drift out of step with each other.
    files = [os.path.join(basepath, name) for name in filenames]
    return files, filenames

def read_img(img_path):
    """Load the image at ``img_path`` with ``matplotlib.image.imread`` and return it."""
    return mpimg.imread(img_path)

def get_xml_label_names(xml_files):
    """Read the polyp class name from each annotation XML file.

    Args:
        xml_files: Iterable of paths to annotation XML files.

    Returns:
        A list with one label per file, in input order: the text of the
        ``<name>`` child of the first ``<object>`` element, with surrounding
        whitespace removed, or ``"Not Specified"`` when the file has no
        ``<object>``, no ``<name>``, or an empty name.

    Raises:
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    label_names = []
    for xml_file in tqdm(xml_files):
        root = ET.parse(xml_file).getroot()
        first_object = root.find("object")
        # Compare against None explicitly: an Element with no children is
        # falsy, so a plain truthiness test would misreport a present element.
        if first_object is not None:
            # findtext returns None when <name> is missing and "" when it is
            # present but empty; neither case should raise.
            raw_name = first_object.findtext("name")
        else:
            raw_name = None
        # Strip padding so " hyperplastic" and "hyperplastic" are one class.
        polyp_name = raw_name.strip() if raw_name else ""
        label_names.append(polyp_name or "Not Specified")
    return label_names

def get_xml_boundboxes(xml_files):
    """Read the bounding box of the first annotated object in each XML file.

    Args:
        xml_files: Iterable of paths to annotation XML files.

    Returns:
        A list with one entry per file, in input order. Each entry is either a
        ``(xmin, ymin, xmax, ymax)`` tuple of strings (the element texts with
        surrounding whitespace removed) or ``False`` when the file has no
        ``<object>``, no ``<bndbox>``, or a missing/empty coordinate.

    Raises:
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    coord_tags = ("xmin", "ymin", "xmax", "ymax")
    bound_boxes = []
    for xml_file in tqdm(xml_files):
        root = ET.parse(xml_file).getroot()
        first_object = root.find("object")
        # Compare against None explicitly: an Element with no children is
        # falsy, so a plain truthiness test would misreport a present element.
        bndbox = first_object.find("bndbox") if first_object is not None else None

        bound_box_tuple = False
        if bndbox is not None:
            # findtext yields None for a missing tag and "" for an empty one;
            # normalise both to "" and strip padding such as " 282".
            coords = tuple(
                (bndbox.findtext(tag) or "").strip() for tag in coord_tags
            )
            # Only report a box when all four coordinates are actually present.
            if all(coords):
                bound_box_tuple = coords
        bound_boxes.append(bound_box_tuple)
    return bound_boxes

# Normalizes image pixel values to between -0.5 and 0.5 (for inputs in 0..255).
def normalize_images(images):
    """Return a new list holding ``(image / 255.0) - 0.5`` for each input image."""
    return [(image / 255.0) - 0.5 for image in tqdm(images)]

# def resize_image(image, size):
#     resized_image = cv2.resize(image, (size, size))
#     return resized_image
    

def grayscale(img):
    """Convert ``img`` to grayscale with OpenCV's ``COLOR_RGB2GRAY`` conversion."""
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    return gray

def gaussian_blur(img, kernel_size):
    """Blur ``img`` with a square ``kernel_size`` x ``kernel_size`` Gaussian kernel.

    The sigma argument is passed as 0.
    """
    ksize = (kernel_size, kernel_size)
    blurred = cv2.GaussianBlur(img, ksize, 0)
    return blurred

def get_images(img_files, prep="gray"):
    """Load and preprocess every image in ``img_files``.

    Args:
        img_files: Iterable of image paths, each read with ``read_img``.
        prep: Preprocessing mode. ``"gray"`` applies ``grayscale`` only,
            ``"gaussian"`` applies ``gaussian_blur`` (kernel size 9) only, and
            any other value applies grayscale followed by the blur.

    Returns:
        A list of the processed images, in input order.
    """
    # "gaussian" is the only mode that skips grayscale and "gray" is the only
    # mode that skips the blur; every other value gets both, in that order.
    to_gray = prep != "gaussian"
    to_blur = prep != "gray"

    images = []
    for img_file in tqdm(img_files):
        image = read_img(img_file)
        if to_gray:
            image = grayscale(image)
        if to_blur:
            image = gaussian_blur(image, 9)
        images.append(image)
    return images



def change_extension(img_file, file_type):
    """Rename ``img_file`` on disk so that its suffix becomes ``file_type``.

    ``file_type`` is handed to ``Path.with_suffix`` (e.g. ``".jpg"``). Returns
    whatever ``Path.rename`` returns for the new location.
    """
    source = Path(img_file)
    target = source.with_suffix(file_type)
    return source.rename(target)
    



# Loads an image by filename from ``basepath`` using OpenCV.
def get_image(basepath, filepath):
    """Load the image named by ``filepath`` from the directory ``basepath``.

    Only the final component of ``filepath`` is used; any directory part is
    discarded and replaced by ``basepath``.

    Args:
        basepath: Directory that actually holds the image on this machine.
        filepath: Path (or bare name) whose last component is the image
            filename.

    Returns:
        Whatever ``cv2.imread`` returns for the resolved path.
        NOTE(review): per the OpenCV docs an unreadable path yields ``None``
        rather than an exception; callers should check for that.
    """
    # os.path.basename uses the running platform's separator rules, so no
    # sys.platform branching is needed and `filename` can never be left
    # unbound on a platform outside win32/linux/darwin.
    filename = os.path.basename(filepath)
    # Join instead of concatenating so a basepath without a trailing separator
    # still resolves to "<basepath>/<filename>".
    img_path_on_fs = os.path.join(basepath, filename)
    return cv2.imread(img_path_on_fs)

## Load Training Images & Labels

In [None]:
# Containers for the training images and their bounding boxes; both start empty.
train_X_images, train_y_polyp_bndbox = [], []

## Get Polyp Names for Training Y Set

Reference annotation XML from one of the training label (y) files. We parse each file for the polyp name and bounding-box data.

~~~xml
<annotation>
    <folder>16</folder>
    <filename>11.jpg</filename>
    <path>/scratch/mfathan/Thesis/Dataset/Extracted/80_Videos_Frames/pretest/16/11.jpg</path>
    <source>
        <database>Unknown</database>
    </source>
    <size>
        <width>544</width>
        <height>448</height>
        <depth>3</depth>
    </size>
    <segmented>0</segmented>
    <object>
        <name>hyperplastic</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>285</xmin>
            <ymin> 282</ymin>
            <xmax> 389</xmax>
            <ymax> 370</ymax>
        </bndbox>
    </object>
</annotation>
~~~

In [None]:
# Directory that holds the training annotation (label) files.
train_y_basepath = "PolypsSet/train2019/Annotation"
# Suffix handed to get_filepaths to select the annotation files.
file_type = ".xml"
# Paths and names returned by get_filepaths for the annotation directory.
train_y_filepaths, train_y_filenames = get_filepaths(train_y_basepath, file_type)

In [None]:
# One label string per annotation file, as returned by get_xml_label_names.
train_y_polyp_names = get_xml_label_names(train_y_filepaths)

In [None]:
# Sanity check: number of annotation paths, then the first path and first name.
print(len(train_y_filepaths))
print(train_y_filepaths[0])
print(train_y_filenames[0])

## Get Images for Training X Set

In [None]:
# Directory that holds the training images.
train_X_basepath = "PolypsSet/train2019/Image"
# NOTE: rebinds the notebook-level file_type that the annotation cell set to ".xml".
file_type = ".jpg"
# Paths and names returned by get_filepaths for the image directory.
train_X_filepaths, train_X_filenames = get_filepaths(train_X_basepath, file_type)


In [None]:
# Sanity check: number of image paths, then the second path and second name (index 1).
print(len(train_X_filepaths))
print(train_X_filepaths[1])
print(train_X_filenames[1])

In [None]:
# Load every training image with grayscale-only preprocessing
# (prep="gray" makes get_images skip the Gaussian blur).
train_X_images = get_images(train_X_filepaths, prep="gray")

In [None]:
# Number of images loaded by get_images.
print("len train_X_images = ", len(train_X_images))

In [None]:
# Preview the first training image, titled with its file path.
plt.suptitle(train_X_filepaths[0])
# The images were loaded with prep="gray", so each one is a 2-D array. Without
# an explicit colormap imshow maps the values through its default false-colour
# map instead of drawing shades of gray.
plt.imshow(train_X_images[0], cmap="gray")

In [None]:
# Display the first loaded image's raw pixel values (before normalize_images is applied).
train_X_images[0]

In [None]:
# Rescale every image via normalize_images ((pixel / 255.0) - 0.5); the unscaled
# list stays available as train_X_images.
train_X_images_scaled = normalize_images(train_X_images)

In [None]:
# Report the number of loaded images; the matching label-count print below is disabled.
# print("train_y_polyp_names = ", len(train_y_polyp_names))
print("train_X_images = ", len(train_X_images))

In [None]:
# scikit-learn encoder used to turn the string polyp labels into integer class ids.
label_enc = LabelEncoder()

In [None]:
# Fit the encoder on the training labels and encode them in a single step.
train_y_polyp_names_enc = label_enc.fit_transform(train_y_polyp_names)

In [None]:
# Display the encoded training labels.
train_y_polyp_names_enc

In [None]:
# Bar chart of how often each polyp label occurs in the training set.
# Pass the labels as ``x`` explicitly: newer seaborn releases treat the first
# positional argument as ``data`` (and older ones warn about positional use),
# so the keyword form is the one that works across versions.
sns.countplot(x=train_y_polyp_names)

In [None]:
# Count how many training annotations carry each polyp label.
polyp_counter = collections.Counter(train_y_polyp_names)

In [None]:
# Display the per-label counts.
polyp_counter

We have more cancerous polyps (adenomatous) in our training data set than non-cancerous polyps (hyperplastic).

## Resources

- [Three Ways of Storing and Accessing Lots of Images in Python](https://realpython.com/storing-images-in-python/#reading-a-single-image)