In [None]:
#https://www.kaggle.com/arkajyotib/better-image-tiles-removing-white-spaces/edit
import os
import cv2
import PIL
import random
import openslide
import skimage.io
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image, display

def compute_statistics(image):
    """
    Compute the white-pixel ratio and the mean green / blue intensity of an image.

    Args:
        image                  numpy.array   array indexed as [row, column, channel];
                                             channels 1 and 2 are read as green and blue

    Returns:
        ratio_white_pixels     float         fraction of pixels whose channel sum exceeds 620
        green_concentration    float         mean of channel 1 over the whole image
        blue_concentration     float         mean of channel 2 over the whole image
    """
    num_pixels = image.shape[0] * image.shape[1]

    # Note: A 3-channel white pixel has RGB (255, 255, 255), i.e. a channel
    # sum of 765; every pixel whose sum exceeds 620 is counted as white.
    summed_matrix = np.sum(image, axis=-1)
    num_white_pixels = np.count_nonzero(summed_matrix > 620)
    ratio_white_pixels = num_white_pixels / num_pixels

    # Index the channel axis explicitly. `image[1]` / `image[2]` would pick
    # rows 1 and 2 (with all channels mixed together) instead of the colour planes.
    green_concentration = np.mean(image[..., 1])
    blue_concentration = np.mean(image[..., 2])

    return ratio_white_pixels, green_concentration, blue_concentration

def select_k_best_regions(regions, k=20):
    """
    Keep the k least-white regions among those that pass the colour filter.

    Args:
        regions               list           tuples of the form (x, y, ratio_white_pixels,
                                             green_concentration, blue_concentration)

        k                     int            maximum number of regions to return

    Returns:
        list of at most k region tuples, ordered by ascending ratio of white pixels
    """
    def passes_colour_filter(region):
        # Both colour statistics have to exceed 180 for a region to qualify.
        return region[3] > 180 and region[4] > 180

    candidates = list(filter(passes_colour_filter, regions))
    candidates.sort(key=lambda region: region[2])
    return candidates[:k]

def get_k_best_regions(coordinates, image, window_size=512):
    """
    Cut one square window out of `image` for every coordinate tuple.

    Args:
        coordinates    iterable of tuples whose first two entries are the
                       axis-0 and axis-1 offsets of the window's top-left corner
        image          numpy.array indexed as [axis0, axis1, channel]
        window_size    edge length of each window

    Returns:
        dict mapping the position of each tuple in `coordinates` to its window
    """
    return {
        index: image[
            point[0] : point[0] + window_size,
            point[1] : point[1] + window_size,
            :
        ]
        for index, point in enumerate(coordinates)
    }

def generate_patches(image, window_size=200, stride=128, k=20):
    """
    Slide a square window over `image`, score every patch and keep the k best.

    Args:
        image          array-like image, converted with np.array; indexed [axis0, axis1, channel]
        window_size    edge length of the sliding window
        stride         step between consecutive window positions
        k              number of regions forwarded to select_k_best_regions

    Returns:
        image                        the image as a numpy array
        k_best_region_coordinates    tuples (axis0 offset, axis1 offset, ratio_white_pixels,
                                     green_concentration, blue_concentration) of the kept regions
        k_best_regions               dict of the corresponding image windows
    """
    image = np.array(image)
    extent_axis0, extent_axis1 = image.shape[0], image.shape[1]

    scored_regions = []
    step_axis1 = 0
    # The outer loop advances along axis 1 and the inner loop along axis 0;
    # a window is only taken when it fits completely inside the image.
    while stride * step_axis1 + window_size <= extent_axis1:
        offset_axis1 = step_axis1 * stride
        step_axis0 = 0

        while stride * step_axis0 + window_size <= extent_axis0:
            offset_axis0 = step_axis0 * stride
            window = image[
                offset_axis0 : offset_axis0 + window_size,
                offset_axis1 : offset_axis1 + window_size,
                :
            ]

            white_ratio, green_mean, blue_mean = compute_statistics(window)
            scored_regions.append(
                (offset_axis0, offset_axis1, white_ratio, green_mean, blue_mean)
            )
            step_axis0 += 1

        step_axis1 += 1

    best_coordinates = select_k_best_regions(scored_regions, k=k)
    best_windows = get_k_best_regions(best_coordinates, image, window_size)

    return image, best_coordinates, best_windows


def display_images(regions, title):
    """
    Show the given regions on a 5x4 grid of subplots under a common title.

    `regions` maps an integer index to an image; index i is drawn on grid
    row i // 4, column i % 4.
    """
    columns = 4
    figure, axes = plt.subplots(5, columns, figsize=(15, 15))

    for index, region in regions.items():
        grid_row, grid_col = divmod(index, columns)
        axes[grid_row, grid_col].imshow(region)

    figure.suptitle(title)
    
def glue_to_one_picture(image_patches, window_size=200, k=16):
    """
    Assemble patches into one square mosaic.

    Args:
        image_patches    dict mapping a patch index to a (window_size, window_size, 3) patch
        window_size      edge length of one patch
        k                number of grid cells; the mosaic has int(sqrt(k)) cells per side

    Returns:
        int16 array of shape (side*window_size, side*window_size, 3); cells
        without a patch stay zero. Patch i goes to row i // side, column i % side.
    """
    cells_per_side = int(np.sqrt(k))
    edge = cells_per_side * window_size
    canvas = np.zeros((edge, edge, 3), dtype=np.int16)

    for index, patch in image_patches.items():
        grid_row, grid_col = divmod(index, cells_per_side)
        top = grid_row * window_size
        left = grid_col * window_size
        canvas[top : top + window_size, left : left + window_size, :] = patch

    return canvas



In [None]:
import os
import cv2
import skimage.io
from tqdm.notebook import tqdm
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc

from sklearn.metrics import cohen_kappa_score , confusion_matrix
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Feature table read from the "featuresv3" input dataset
# (presumably pre-computed LBP / connected-component features, judging by the file name).
features = pd.read_csv("/kaggle/input/featuresv3/LBP_CCA_features_on_train_images_upsampled.csv")
# Competition metadata; `train` is also the default `data` argument of
# feature_engineering_lbp_cca() defined further down.
train = pd.read_csv("/kaggle/input/prostate-cancer-grade-assessment/train.csv")
test = pd.read_csv("/kaggle/input/prostate-cancer-grade-assessment/test.csv")
# NOTE: not the last expression of the cell, so this preview is not displayed.
train.head()
# Module-level constants read by hsv_level_changes():
#   sz - edge length of the square tiles an image is split into
#   N  - number of tiles kept per image
sz = 128
N=16

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 5]
import numpy
import pandas as pd
import numpy as np
import cv2
from skimage import morphology
import openslide
import time
import statistics
def otsu_filter(channel, gaussian_blur=True):
    """
    Binarise one image channel with Otsu's automatically chosen threshold.

    Parameters
    ----------
    channel: numpy array
        Single channel; reshaped to (rows, cols) before thresholding.
    gaussian_blur: bool
        When true, a 5x5 Gaussian blur is applied first.

    Returns
    -------
    The thresholded image from cv2.threshold (values 0 or 255).
    """
    smoothed = cv2.GaussianBlur(channel, (5, 5), 0) if gaussian_blur else channel
    rows, cols = smoothed.shape[0], smoothed.shape[1]
    flat = smoothed.reshape((rows, cols))

    _, binary = cv2.threshold(
        flat, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

def detect_tissue(wsi, sensitivity = 3000, downsampling_factor=64):
    """
    Find RoIs containing tissue in a whole-slide image (WSI).

    The slide is read at the pyramid level closest to the requested
    downsampling factor, its saturation channel is Otsu-thresholded, small
    holes / objects are removed and the contours of the result are returned.
    Inspired by method used by Wang et al. [1]_.
    .. [1] Dayong Wang, Aditya Khosla, Rishab Gargeya, Humayun Irshad, Andrew
    H. Beck, "Deep Learning for Identifying Metastatic Breast Cancer",
    arXiv:1606.05718

    Parameters
    ----------
    wsi: OpenSlide/AnnotatedOpenSlide class instance
        The whole-slide image (WSI) to detect tissue in.
    sensitivity: int
        Area threshold passed to remove_small_holes / remove_small_objects.
        Lower values keep more small structures, higher values trim more.
    downsampling_factor: int
        The desired factor to downsample the image by. The closest entry of
        wsi.level_downsamples is used.

    Returns
    -------
    - Contours returned by cv2.findContours (RETR_LIST),
    - Second value returned by cv2.findContours,
    - Slide image at the level that was read (first three channels only),
    - Downsampling factor of that level,
    - Dict of elapsed milliseconds since the start for every timed stage.
    """
    # For timing
    timings = {"start": time.time()}

    # Pick the pyramid level whose downsample is closest to the request
    nearest_factor = min(
        wsi.level_downsamples,
        key=lambda candidate: abs(candidate - downsampling_factor))
    level_index = wsi.level_downsamples.index(nearest_factor)

    region = wsi.read_region((0, 0), level_index, wsi.level_dimensions[level_index])
    slide = np.array(region)[:, :, :3]  # keep the first three channels
    timings["1"] = time.time()

    # Colour-space conversion; only the saturation plane is used afterwards
    slide_hsv = cv2.cvtColor(slide, cv2.COLOR_BGR2HSV)
    timings["2"] = time.time()

    # Otsu threshold on the saturation channel
    saturation = np.split(slide_hsv, 3, axis=2)[1]
    binary = otsu_filter(saturation, gaussian_blur=True)
    timings["3"] = time.time()

    # Boolean mask, cleaned of small holes and small objects
    tissue = binary != 0
    tissue = morphology.remove_small_holes(tissue, area_threshold=sensitivity)
    tissue = morphology.remove_small_objects(tissue, min_size=sensitivity)
    timings["4"] = time.time()

    mask_contours, tier = cv2.findContours(
        tissue.astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    timings["5"] = time.time()

    # Convert absolute timestamps to milliseconds since the start
    origin = timings["start"]
    elapsed_ms = {}
    for stage, stamp in timings.items():
        elapsed_ms[stage] = (stamp - origin) * 1000
    return mask_contours, tier, slide, nearest_factor, elapsed_ms

def draw_tissue_polygons(mask, polygons, polygon_type, line_thickness=None):
    """
    Draw detected tissue polygons onto a mask.
    Modeled on the WSIPRE github package.

    Parameters
    ----------
    mask: numpy array
        Canvas to draw on (callers pass an all-zero array).
    polygons: iterable of numpy arrays
        The identified tissue regions.
    polygon_type: str ("line" | "area")
        Draw outlines ("line") or filled polygons ("area").
    line_thickness: int
        Outline thickness; only used when polygon_type == "line".

    Returns
    -------
    The mask with every polygon drawn using the value 1. With no polygons the
    mask is returned untouched.

    Raises
    ------
    ValueError
        If there is at least one polygon and polygon_type is neither "line"
        nor "area".
    """
    tissue_color = 1
    canvas = mask

    for outline in polygons:
        if polygon_type not in ('line', 'area'):
            raise ValueError(
                'Accepted "polygon_type" values are "line" or "area".')

        if polygon_type == 'line':
            canvas = cv2.polylines(
                canvas, [outline], True, tissue_color, line_thickness)
            continue

        # 'area': the thickness argument has no effect, tell the caller
        if line_thickness is not None:
            warnings.warn('"line_thickness" is only used if ' +
                          '"polygon_type" is "line".')
        canvas = cv2.fillPoly(canvas, [outline], tissue_color)

    return canvas

def tissue_cutout_old(tissue_slide, tissue_contours, slide):
    """
    Return a copy of `slide` that is zero everywhere outside the contours.

    `tissue_slide` only supplies the shape and dtype of the temporary mask.
    Approach taken from https://stackoverflow.com/a/28759496
    """
    filled = np.zeros_like(tissue_slide)
    # contour index -1 draws all contours, thickness -1 fills them with 255
    cv2.drawContours(filled, tissue_contours, -1, 255, -1)
    inside = filled == 255

    cutout = np.zeros_like(slide)
    cutout[inside] = slide[inside]
    return cutout

def tissue_cutout(input_slide, tissue_contours):
    """
    Description
    ----------
    Blank out (set to zero) every part of input_slide that lies outside
    the provided tissue contours.
    Credit: https://stackoverflow.com/a/28759496

    Parameters
    ----------
    input_slide: numpy array
            Slide to cut the non-tissue background out of
    tissue_contours: sequence of cv2 contours
            The identified tissue regions

    Returns (1)
    -------
    - Numpy array shaped like input_slide with non-tissue set to zero
    """
    # Float mask covering the first two axes of the slide
    selection = np.zeros(input_slide.shape[:2])

    # Fill every contour with 255 (index -1: all contours, thickness -1: filled)
    cv2.drawContours(selection, tissue_contours, -1, 255, -1)
    inside = selection == 255

    # Copy only the selected pixels into an otherwise empty slide
    result = np.zeros_like(input_slide)
    result[inside] = input_slide[inside]
    return result


def getSubImage(rect, src_img):
    """
    Cut the (possibly rotated) rectangle `rect` out of `src_img` and return it
    as an upright image.

    rect is indexed as rect[1] == (width, height) and is handed to
    cv2.boxPoints; callers obtain it from cv2.minAreaRect.
    """
    out_width, out_height = int(rect[1][0]), int(rect[1][1])

    corners = cv2.boxPoints(rect).astype("float32")
    targets = np.array(
        [
            [0, out_height - 1],
            [0, 0],
            [out_width - 1, 0],
            [out_width - 1, out_height - 1],
        ],
        dtype="float32",
    )

    transform = cv2.getPerspectiveTransform(corners, targets)
    return cv2.warpPerspective(src_img, transform, (out_width, out_height))




def detect_and_crop(image_location:str, sensitivity:int=3000, 
                    downsample_rate:int=16, show_plots:str="simple"):
    """
    Open a slide, detect tissue on a downsampled level and crop to the tissue.

    Parameters
    ----------
    image_location: str
        Path of the slide, opened with openslide.open_slide.
    sensitivity: int
        Forwarded to detect_tissue (small-hole / small-object area threshold).
    downsample_rate: int
        Requested downsampling factor; detect_tissue uses the closest level.
    show_plots: str ("verbose" | "simple" | "none")
        - verbose - plot every intermediate stage
        - simple - plot only the final crop
        - anything else - plot nothing

    Returns (3)
    -------
    - Cropped slide as numpy array (the blanked-out slide when no tissue
      contour was found)
    - Fraction by which the array size shrank (1.0 when no contour was found)
    - Dict of elapsed milliseconds since the start for every timed stage
    """

    def elapsed_ms(stamps):
        # Convert absolute timestamps to milliseconds since the start
        origin = stamps["start"]
        return {key: (value - origin) * 1000 for key, value in stamps.items()}

    #For timing
    time_stamps = {}
    time_stamps["start"] = time.time()

    #Open Slide
    wsi = openslide.open_slide(image_location)
    time_stamps["open"] = time.time()

    #Get returns from detect_tissue(); the slide handle is not needed
    #afterwards because the level has been copied into a numpy array
    try:
        (tissue_contours, _tier,
         downsampled_slide,
         _downsampling_factor,
         _time_stamps_detect) = detect_tissue(wsi,
                                              sensitivity, downsample_rate)
    finally:
        wsi.close()
    time_stamps["tissue_detect"] = time.time()

    #Get Tissue Only Slide
    base_slide_mask = np.zeros(downsampled_slide.shape[:2])
    tissue_slide = draw_tissue_polygons(base_slide_mask, tissue_contours,'line', 5)
    # tissue_cutout() takes (slide, contours); the previous three-argument
    # call raised TypeError before anything could be cropped.
    tissue_only_slide = tissue_cutout(downsampled_slide, tissue_contours)
    time_stamps["tissue_trim"] = time.time()

    #Get minimal bounding rectangle for all tissue contours
    if len(tissue_contours) == 0:
        img_id = image_location.split("/")[-1]
        print(f"No Tissue Contours - ID: {img_id}")
        # Same units (elapsed ms) as the regular return path
        return tissue_only_slide, 1.0, elapsed_ms(time_stamps)

    all_bounding_rect = cv2.minAreaRect(np.concatenate(tissue_contours))
    #Crop with getSubImage()
    smart_bounding_crop = getSubImage(all_bounding_rect,tissue_only_slide)
    time_stamps["crop"] = time.time()

    #Crop empty space
    # NOTE(review): a row/column is dropped when any single channel is
    # uniformly equal to the matching component of [255,0,0]; confirm that
    # this colour and rule are the intended definition of "blank".
    #Remove by row
    row_not_blank =  [row.all() for row in ~np.all(smart_bounding_crop == [255,0,0],
                                                   axis=1)]
    space_cut = smart_bounding_crop[row_not_blank,:]
    #Remove by column
    col_not_blank =  [col.all() for col in ~np.all(smart_bounding_crop == [255,0,0],
                                                   axis=0)]
    space_cut = space_cut[:,col_not_blank]
    time_stamps["cut"] = time.time()

    #Get size change
    start_size = get_disk_size(downsampled_slide)
    final_size = get_disk_size(space_cut)
    pct_change = final_size / start_size

    if show_plots == "simple":
        print(f"Percent Reduced from Base Slide to Final: {(1- pct_change)*100:.2f}")
        plt.imshow(space_cut)
        plt.show() 
    elif show_plots == "verbose":
        #Set-up dictionary for plotting
        verbose_plots = {}
        #Add Base Slide to verbose print
        verbose_plots[f"Base Slide\n{get_disk_size(downsampled_slide):.2f}MB"] = downsampled_slide
        #Add Tissue Only to verbose print
        verbose_plots[f"Tissue Detect\nNo Change"] = tissue_slide
        #Add Bounding Boxes to verbose print
        verbose_plots[f"Bounding Boxes\n{get_disk_size(smart_bounding_crop):.2f}MB"] = smart_bounding_crop
        #Add Space Cut Boxes to verbose print
        verbose_plots[f"Space Cut\n{get_disk_size(space_cut):.2f}MB"] = space_cut
        print(f"Percent Reduced from Base Slide to Final: {(1- pct_change)*100:.2f}")
        # Do not rebind `plt` here: assigning to it anywhere in this function
        # would make it a local name and break the "simple" branch above.
        plot_figures(verbose_plots, 1, len(verbose_plots))
        plt.show()

    time_stamps["all"] = time.time()
    return space_cut, (1-pct_change), elapsed_ms(time_stamps)

def get_disk_size(numpy_image):
    """Return the size of the array's data in MB (element count * item size / 1,000,000)."""
    total_bytes = numpy_image.size * numpy_image.itemsize
    megabytes = total_bytes / 1000000
    return megabytes

def plot_figures(figures, nrows = 1, ncols=1):
    """Plot a dictionary of figures.

    Based on https://stackoverflow.com/a/11172032

    Parameters
    ----------
    figures : <title, figure> dictionary
    ncols : number of columns of subplots wanted in the display
    nrows : number of rows of subplots wanted in the figure

    Returns
    -------
    The matplotlib.pyplot module, so callers can chain e.g. `.show()`.
    """

    # squeeze=False always yields a 2-D array of Axes. Without it a 1x1 grid
    # (the default arguments) returns a bare Axes object that has no .ravel().
    fig, axeslist = plt.subplots(ncols=ncols, nrows=nrows, squeeze=False)
    axes = axeslist.ravel()
    for ind, (title, image) in enumerate(figures.items()):
        axes[ind].imshow(image, aspect='auto')
        axes[ind].set_title(title)
    plt.tight_layout()
    return plt

def remove_pen_marks(img):
    """
    Zero out pen marks and background of an image.

    A grey-level background mask is combined with a mask derived from the
    first (red) channel, the result is cleaned with morphological operations
    and finally multiplied into every channel of `img`.
    Assumes img is an RGB array indexed [row, column, channel] - TODO confirm.
    """
    # Elliptic 5x5 structuring element shared by all morphological steps
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))

    # Grey-level mask: pixels darker than or equal to 45 count as background
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    foreground = np.where(gray > 45, 1, 0).astype(np.uint8)

    # Red-channel mask: keep pixels brighter than (median of the non-zero
    # red values - 50); aims to mask most of the pen marks
    red = img[:, :, 0]
    red_flat = red.ravel()
    red_nonzero = red_flat[red_flat > 0]
    red_mask = (red > np.median(red_nonzero) - 50).astype(np.uint8)

    # The red mask also removes some tissue, so erode it to get rid of that,
    # then dilate to capture the gradient-like edges of the pen marks
    red_mask = cv2.erode(red_mask, ellipse, iterations=3)
    red_mask = cv2.dilate(red_mask, ellipse, iterations=5)

    # Combine the two masks
    combined = foreground * red_mask

    # Attempt to fill remaining gaps/holes in the tissue
    combined = cv2.morphologyEx(combined, cv2.MORPH_CLOSE, ellipse, iterations=1)
    combined = cv2.dilate(combined, ellipse, iterations=1)
    outlines, _ = cv2.findContours(combined, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    # Each contour is filled on its own (drawContours writes into `combined`)
    for outline in outlines:
        cv2.drawContours(combined, [outline], 0, 1, -1)

    # Final clean-up of the mask
    combined = cv2.erode(combined, ellipse, iterations=3)
    combined = cv2.dilate(combined, ellipse, iterations=1)
    combined = cv2.erode(combined, ellipse, iterations=2)

    # Apply the mask to every channel of the original image
    return img * combined[:, :, np.newaxis]

def hsv_level_changes(img, level1,level2,level3):
    """
    Scale the H, S and V planes of an image and summarise the colour spread
    of its selected tiles.

    The image is converted to HSV, hue / saturation / value are multiplied by
    level1 / level2 / level3 and clipped to [0, 255], and the result is
    converted back to RGB. That image is zero-padded to a multiple of the
    module-level tile size `sz`, cut into sz x sz tiles, and the `N` tiles
    with the smallest pixel sum are kept (zero tiles are appended when fewer
    than N exist).

    Returns
    -------
    (product, sum) of the per-channel standard deviations of the kept tiles'
    pixels after division by 255.
    """
    hue, sat, val = cv2.split(cv2.cvtColor(img.copy(), cv2.COLOR_RGB2HSV))
    sat = np.clip(sat * level2, 0, 255)
    hue = np.clip(hue * level1, 0, 255)
    val = np.clip(val * level3, 0, 255)
    rgb = cv2.cvtColor(cv2.merge([hue, sat, val]), cv2.COLOR_HSV2RGB)

    # Pad axis 0 and axis 1 up to the next multiple of sz, split evenly
    # between both ends of each axis
    pad_axis0 = (sz - rgb.shape[0] % sz) % sz
    pad_axis1 = (sz - rgb.shape[1] % sz) % sz
    rgb = np.pad(
        rgb,
        [[pad_axis0 // 2, pad_axis0 - pad_axis0 // 2],
         [pad_axis1 // 2, pad_axis1 - pad_axis1 // 2],
         [0, 0]],
        constant_values=0,
    )

    # Cut into a stack of sz x sz x 3 tiles
    tiles = rgb.reshape(rgb.shape[0] // sz, sz, rgb.shape[1] // sz, sz, 3)
    tiles = tiles.transpose(0, 2, 1, 3, 4).reshape(-1, sz, sz, 3)
    if len(tiles) < N:
        tiles = np.pad(
            tiles, [[0, N - len(tiles)], [0, 0], [0, 0], [0, 0]], constant_values=0)

    # Keep the N tiles with the smallest total intensity
    order = np.argsort(tiles.reshape(tiles.shape[0], -1).sum(-1))[:N]
    pixels = (tiles[order] / 255.0).reshape(-1, 3)

    channel_std = pixels.std(0)
    std_product = channel_std[0] * channel_std[1] * channel_std[2]
    std_sum = channel_std[0] + channel_std[1] + channel_std[2]
    return (std_product), (std_sum)

from skimage import feature
import numpy as np
class LocalBinaryPatterns:
    """Uniform Local Binary Pattern (LBP) histogram descriptor."""

    def __init__(self, numPoints, radius):
        # number of sampling points and radius handed to local_binary_pattern
        self.numPoints, self.radius = numPoints, radius

    def describe(self, image, eps=1e-7):
        """
        Return the normalised histogram of uniform LBP codes of `image`.

        The histogram uses the integer bin edges 0 .. numPoints + 2 and is
        divided by (its sum + eps).
        """
        codes = feature.local_binary_pattern(
            image, self.numPoints, self.radius, method="uniform")
        bin_edges = np.arange(0, self.numPoints + 3)
        counts, _ = np.histogram(
            codes.ravel(), bins=bin_edges, range=(0, self.numPoints + 2))
        # normalise so the bins sum to (almost exactly) one
        counts = counts.astype("float")
        return counts / (counts.sum() + eps)


In [None]:
def detect_tissue_external(input_slide, sensitivity=3000):
    """
    Description
    ----------
    Find RoIs containing tissue in a slide image and return only the
    outermost contours. The saturation channel is Otsu-thresholded, small
    holes / objects are removed and external contours are extracted.
    Inspired by method used by Wang et al. [1]_.
    .. [1] Dayong Wang, Aditya Khosla, Rishab Gargeya, Humayun Irshad, Andrew
    H. Beck, "Deep Learning for Identifying Metastatic Breast Cancer",
    arXiv:1606.05718
    Credit: Github-wsipre

    Parameters
    ----------
    input_slide: numpy array
        Slide to detect tissue on.
    sensitivity: int
        Area threshold passed to remove_small_holes / remove_small_objects.
        Lower values keep more small structures, higher values trim more.

    Returns (3)
    -------
    - Contours returned by cv2.findContours (RETR_EXTERNAL),
    - Second value returned by cv2.findContours,
    - Dict of elapsed milliseconds since the start for every timed stage
    """
    # For timing
    timings = {"start": time.time()}

    # Colour-space conversion; only the saturation plane is used afterwards
    slide_hsv = cv2.cvtColor(input_slide, cv2.COLOR_BGR2HSV)
    timings["re-color"] = time.time()

    # Otsu threshold on the saturation channel
    saturation = np.split(slide_hsv, 3, axis=2)[1]
    binary = otsu_filter(saturation, gaussian_blur=True)
    timings["filter"] = time.time()

    # Boolean mask, cleaned of small holes and small objects
    tissue = binary != 0
    tissue = morphology.remove_small_holes(tissue, area_threshold=sensitivity)
    tissue = morphology.remove_small_objects(tissue, min_size=sensitivity)
    timings["morph"] = time.time()

    mask_contours, tier = cv2.findContours(
        tissue.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    timings["contour"] = time.time()

    # Convert absolute timestamps to milliseconds since the start
    origin = timings["start"]
    elapsed_ms = {}
    for stage, stamp in timings.items():
        elapsed_ms[stage] = (stamp - origin) * 1000
    return mask_contours, tier, elapsed_ms

def color_cut(in_slide, color = [255,255,255]):
    
    """
    Description
    ----------
    Take an input image and remove all rows and columns that consist only of
    pixels equal to the input color [R,G,B]. The default color to cut from
    the image is white. Rows are removed first; columns are then judged on
    the remaining rows.
    
    Parameters
    ----------
    in_slide: numpy array
        Slide indexed [row, column, channel] to cut blank rows/cols from
    color: list
        [R,G,B] value of the pixels to cut from the input slide
    
    Returns (1)
    -------
    - Numpy array of in_slide with the blank rows and columns removed
    """
    target = np.asarray(color)

    # A pixel matches only when ALL of its channels equal the colour. The
    # earlier per-channel test dropped a row as soon as one channel was
    # uniformly equal (e.g. every R value 255), even if G and B carried data.
    #Remove by row
    pixel_is_color = np.all(in_slide == target, axis=-1)
    row_not_blank = ~np.all(pixel_is_color, axis=1)
    output_slide = in_slide[row_not_blank, :]
    
    #Remove by col
    pixel_is_color = np.all(output_slide == target, axis=-1)
    col_not_blank = ~np.all(pixel_is_color, axis=0)
    output_slide = output_slide[:, col_not_blank]
    return output_slide


In [None]:
# import the necessary packages
from skimage import feature
import numpy as np
from scipy.stats import kurtosis
from scipy.stats import skew
from statistics import *
class LocalBinaryPatterns:
    """Summary statistics of the uniform Local Binary Pattern (LBP) codes of an image."""

    def __init__(self, numPoints, radius):
        # number of sampling points and radius handed to local_binary_pattern
        self.numPoints, self.radius = numPoints, radius

    def describe(self, image, eps=1e-7):
        """
        Return (entropy, skewness, kurtosis, mean, standard deviation).

        The entropy is taken from the normalised LBP histogram (integer bin
        edges 0 .. numPoints + 2); the other four values are computed on the
        flattened LBP codes themselves.
        """
        codes = feature.local_binary_pattern(
            image, self.numPoints, self.radius, method="uniform")
        flat_codes = codes.ravel()

        counts, _ = np.histogram(
            flat_codes,
            bins=np.arange(0, self.numPoints + 3),
            range=(0, self.numPoints + 2))
        probabilities = counts.astype("float")
        probabilities /= (probabilities.sum() + eps)
        # empty bins become 1 so that they contribute log(1) = 0 to the entropy
        probabilities[probabilities == 0] = 1
        entropy = -sum(probabilities * np.log(probabilities))

        return (
            entropy,
            skew(flat_codes),
            kurtosis(flat_codes),
            mean(flat_codes),
            np.std(flat_codes),
        )
    
    

In [None]:
def new_detect_and_crop(image_location="",sensitivity: int = 3000, downsample_lvl = -1,
                        show_plots= "simple", out_lvl=-2, shape=(512,512)):
    """
    Description
    ----------
    This method performs the pipeline as described in the notebook:
    https://www.kaggle.com/dannellyz/panda-tissue-detect-scaling-bounding-boxes-fast

    Tissue is detected on a low-resolution level, the bounding rectangle is
    scaled up to a higher-resolution level, cropped, trimmed of white
    rows/columns, resized and finally blanked outside the tissue contours.
    
    Parameters
    ----------
    image_location:str
        Location of the slide image to process
    sensitivity:int
        The desired sensitivity of the model to detect tissue. The baseline is set
        at 3000 and should be adjusted down to capture more potential tissue and
        adjusted up to be more aggressive with trimming the slide.
    downsample_lvl: int
        Index of the image level used for tissue detection.
        [-1] = lowest resolution
        [0] = highest resolution
    show_plots: str (verbose|simple|none)
        The types of plots to display:
            - verbose - show all steps of process
            - simple - show only last step
            - none - show none of the plots
    out_lvl: int
        Index of the image level the final slide is cropped from.
        [-1] = lowest resolution
        [0] = highest resolution
    shape: tuple
        Size of the production (prod) image. It is passed unchanged to
        cv2.resize as `dsize`, which OpenCV interprets as (width, height).
        NOTE(review): earlier documentation called this (height, width);
        the two only agree for square shapes.
        
    Returns (4)
    -------
    - Numpy array of final production (prod) slide (an all-zero array shaped
      like the low-resolution level when no tissue contour was found)
    - Fraction of memory saved relative to the out_lvl slide (1.0 when no
      contour was found)
    - Elapsed milliseconds for the stages of the pipeline
    - Time stamps from the most recent tissue-detection call
    """

    def elapsed_ms(stamps):
        # Convert absolute timestamps to milliseconds since the start
        origin = stamps["start"]
        return {key: (value - origin) * 1000 for key, value in stamps.items()}

    # For timing
    time_stamps = {}
    time_stamps["start"] = time.time()

    # Open Small Slide
    wsi_small = skimage.io.MultiImage(image_location)[downsample_lvl]
    time_stamps["open_small"] = time.time()

    # Get returns from detect_tissue_external() on small image
    (   tissue_contours,
        tier,
        time_stamps_detect,
    ) = detect_tissue_external(wsi_small, sensitivity)
    
    if len(tissue_contours) == 0:
        # Without contours the tissue cut-out is empty, i.e. an all-zero
        # image shaped like the small slide.
        tissue_only_slide = np.zeros_like(wsi_small)
    
        img_id = image_location.split("/")[-1]
        print(f"No Tissue Contours - ID: {img_id}")
        # Same units (elapsed ms) as the regular return path
        return tissue_only_slide, 1.0, elapsed_ms(time_stamps), time_stamps_detect
    
    # Open Big Slide
    wsi_big = skimage.io.MultiImage(image_location)[out_lvl]
    time_stamps["open_big"] = time.time()
    
    # Get minimal bounding rectangle for all tissue contours on the small slide
    bounding_rect_small = cv2.minAreaRect(np.concatenate(tissue_contours))
    
    # Scale rectangle (centre and size, not the angle) to the larger image
    scale = int(wsi_big.shape[0] / wsi_small.shape[0])
    scaled_rect = (
        (bounding_rect_small[0][0] * scale, bounding_rect_small[0][1] * scale),
        (bounding_rect_small[1][0] * scale, bounding_rect_small[1][1] * scale),
        bounding_rect_small[2],
    )

    # Crop bigger image with getSubImage()
    scaled_crop = getSubImage(scaled_rect, wsi_big)
    time_stamps["scale_bounding"] = time.time()
    
    #Cut out white
    white_cut = color_cut(scaled_crop)
    time_stamps["white_cut_big"] = time.time()
    
    #Scale
    scaled_slide = cv2.resize(white_cut, shape)
    time_stamps["resize_big"] = time.time()
    
    # Detect tissue again on the resized crop and blank everything outside it
    (   tissue_contours_big,
        tier_big,
        time_stamps_detect,
    ) = detect_tissue_external(scaled_slide, sensitivity)
    prod_slide = tissue_cutout(scaled_slide, tissue_contours_big)
    time_stamps["remove_tissue"] = time.time()

    # Get size change
    base_size_high = get_disk_size(wsi_big)
    final_size = get_disk_size(prod_slide)
    pct_change = final_size / base_size_high
    
    if show_plots == "simple":
        print(f"Percent Reduced from Base Slide to Final: {(1- pct_change)*100:.2f}")
        # Show the last step of the pipeline (the name used here before was
        # never defined in this function).
        plt.imshow(prod_slide)
        plt.show()
    elif show_plots == "verbose":
        # Set-up dictionary for plotting
        verbose_plots = {}
        # Add Base Slide to verbose print
        verbose_plots[f"Smaller Slide\n{get_disk_size(wsi_small):.2f}MB"] = wsi_small
        # Add Tissue Only to verbose print
        verbose_plots[f"Tissue Detect Low\nNo Change"] = wsi_big
        # Add Larger Plot cut with bounding boxes
        verbose_plots[f"Larger scaled\n{get_disk_size(scaled_crop):.2f}MB"] = scaled_crop
        # Add Bounding Boxes to verbose print
        verbose_plots[
            f"Final Produciton\n{get_disk_size(prod_slide):.2f}MB"
        ] = prod_slide
        print(f"Percent Reduced from Base Slide to Final: {(1- pct_change)*100:.2f}")
        # Do not rebind `plt` here: assigning to it anywhere in this function
        # would make it a local name and break the "simple" branch above.
        plot_figures(verbose_plots, 2, 2)

    return prod_slide, (1 - pct_change), elapsed_ms(time_stamps), time_stamps_detect


In [None]:
# import the necessary packages
import sys
import numpy as np
import skimage.color
import skimage.filters
import skimage.io
import skimage.viewer
import skimage.measure
import skimage.color


def connected_components(img, sigma, threshold):
    """
    Count connected components of the dark regions of an image by area bucket.

    The image is Gaussian-blurred, pixels below `threshold` form the mask and
    the mask is labelled with 8-connectivity (connectivity=2).

    Parameters
    ----------
    img: numpy array
        Image to analyse (presumably single-channel - TODO confirm).
    sigma: float
        Standard deviation of the Gaussian blur.
    threshold: float
        Pixels whose blurred value is below this belong to the mask.

    Returns
    -------
    Tuple of nine values:
    - eight counts of regions with area < 5, < 10, < 50, < 100, < 500,
      < 1000, < 5000 and >= 5000 pixels,
    - the area of the 2nd (resp. 3rd) largest region when exactly one (resp.
      two) regions have area >= 5000 and such a region exists, otherwise 0.
    """
    # blur and grayscale before thresholding
    blur = skimage.filters.gaussian(img, sigma=sigma)

    # perform inverse binary thresholding
    mask = blur < threshold

    # Perform CCA on the mask
    labels, num_labels = skimage.measure.label(mask, connectivity=2, return_num=True)

    # Pixel count for label ids 0 .. num_labels-1, gathered in one pass over
    # the image instead of one full-image comparison per label.
    # NOTE(review): skimage labels the background 0 and components
    # 1 .. num_labels, so this range includes the background and omits the
    # highest label id. Kept unchanged so results match earlier runs.
    areas = np.bincount(labels.ravel(), minlength=num_labels)[:num_labels]

    # Bucket k holds areas with upper_bounds[k-1] <= area < upper_bounds[k];
    # the last bucket collects everything >= 5000.
    upper_bounds = [5, 10, 50, 100, 500, 1000, 5000]
    bucket_of_area = np.searchsorted(upper_bounds, areas, side="right")
    bucket_counts = [
        int(n) for n in np.bincount(bucket_of_area, minlength=len(upper_bounds) + 1)
    ]

    ranked_areas = sorted(areas, reverse=True)
    largest_bucket = bucket_counts[-1]
    runner_up_area = 0
    if largest_bucket == 1 and len(ranked_areas) > 1:
        runner_up_area = ranked_areas[1]
    elif largest_bucket == 2 and len(ranked_areas) > 2:
        runner_up_area = ranked_areas[2]

    return (*bucket_counts, runner_up_area)



In [None]:
from sklearn.svm import LinearSVC
#from imutils import paths
import argparse
import cv2
import os
from skimage import color

def feature_engineering_lbp_cca(data = train , dir_name = "train_images"):
    """Add LBP, white-value and connected-component features for every slide in ``data``.

    For each ``image_id`` the slide ``<slide_dir><dir_name>/<image_id>.tiff`` is
    cropped with ``new_detect_and_crop``, optionally cleaned with
    ``remove_pen_marks``, padded to a multiple of ``sz`` and summarised by:

    * ``number_of_white_dots`` - number of array entries ``> 250`` divided by
      height * width of the padded image;
    * five ``LBP_hist_*`` columns - items 0-4 of ``desc.describe(gray)``;
    * 216 ``number_of_objects_<sigma>_<threshold>_<suffix>`` columns - the nine
      items returned by ``connected_components`` for each of 3 sigmas x 8
      thresholds, computed on a 4x4 gray mosaic of the selected tiles;
    * ``data_prov_ind`` - 1 where ``data_provider == "radboud"``, else 0.

    Relies on names defined elsewhere in the notebook: ``sz``, ``N``, ``tqdm``,
    ``gc``, ``LocalBinaryPatterns``, ``new_detect_and_crop``,
    ``remove_pen_marks`` and ``connected_components``.

    Args:
        data: DataFrame with ``image_id`` and ``data_provider`` columns. It is
            modified in place. The default is whatever ``train`` was bound to
            when this cell was executed.
        dir_name: sub-directory of the competition input folder holding the
            ``.tiff`` slides.

    Returns:
        The same ``data`` object with the feature columns added.
    """
    # (column-name tag, parameter value) pairs. The tags are the spellings used
    # in the generated column names.
    sigma_grid = (("10", 0.1), ("50", 0.5), ("200", 2))
    threshold_grid = (
        ("1", 0.01), ("11", 0.11), ("21", 0.21), ("31", 0.31),
        ("41", 0.41), ("51", 0.51), ("61", 0.61), ("71", 0.71),
    )
    # Column suffix for each item of the tuple returned by connected_components.
    # "biggest_area" receives item 7 and "area_of_FSO" item 8 of that tuple.
    cca_suffixes = (
        "ALT_5", "ALT_10", "ALT_50", "ALT_100", "ALT_500",
        "ALT_1000", "ALT_5000", "biggest_area", "area_of_FSO",
    )
    # Column name for each of the first five items returned by desc.describe.
    lbp_columns = (
        "LBP_hist_entropy", "LBP_hist_skewness", "LBP_hist_kurtosis",
        "LBP_hist_mean", "LBP_hist_sd",
    )

    # credits to Rohit Singh
    pen_marked_images = {
        'fd6fe1a3985b17d067f2cb4d5bc1e6e1',
        'ebb6a080d72e09f6481721ef9f88c472',
        'ebb6d5ca45942536f78beb451ee43cc4',
        'ea9d52d65500acc9b9d89eb6b82cdcdf',
        'e726a8eac36c3d91c3c4f9edba8ba713',
        'e90abe191f61b6fed6d6781c8305fe4b',
        'fd0bb45eba479a7f7d953f41d574bf9f',
        'ff10f937c3d52eff6ad4dd733f2bc3ac',
        'feee2e895355a921f2b75b54debad328',
        'feac91652a1c5accff08217d19116f1c',
        'fb01a0a69517bb47d7f4699b6217f69d',
        'f00ec753b5618cfb30519db0947fe724',
        'e9a4f528b33479412ee019e155e1a197',
        'f062f6c1128e0e9d51a76747d9018849',
        'f39bf22d9a2f313425ee201932bac91a',
    }

    slide_dir = "../input/prostate-cancer-grade-assessment/"
    desc = LocalBinaryPatterns(8, 1)

    # Per-slide accumulators, one entry appended per image_id.
    number_of_white_dots = []
    lbp_stats = []
    cca_results = {
        (sigma_tag, threshold_tag): []
        for threshold_tag, _ in threshold_grid
        for sigma_tag, _ in sigma_grid
    }

    for image_id in tqdm(data['image_id'].values):
        slide_path = f"{slide_dir}{dir_name}/{image_id}.tiff"
        # Only the cropped image is used; the other three return values are ignored.
        img, _pct_change, _time_stamps, _detect_time_med = new_detect_and_crop(
            image_location=slide_path, show_plots="none"
        )

        if image_id in pen_marked_images:
            img = remove_pen_marks(img)

        # Pad both spatial axes up to the next multiple of the tile size.
        # NOTE(review): padding is 0 and tiles are later chosen by *lowest*
        # pixel sum, so all-zero padding sorts first - confirm this matches
        # the background value produced by new_detect_and_crop.
        pad_rows = (sz - img.shape[0] % sz) % sz  # extra rows (axis 0)
        pad_cols = (sz - img.shape[1] % sz) % sz  # extra columns (axis 1)
        img = np.pad(
            img,
            [[pad_rows // 2, pad_rows - pad_rows // 2],
             [pad_cols // 2, pad_cols - pad_cols // 2],
             [0, 0]],
            constant_values=0,
        )

        # Counts individual channel values > 250, normalised by height * width.
        number_of_white_dots.append((img > 250).sum() / (img.shape[0] * img.shape[1]))

        # NOTE(review): BGR2GRAY assumes BGR channel order - confirm what
        # new_detect_and_crop returns.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        lbp_stats.append(desc.describe(gray))

        # Cut into sz x sz tiles and keep the N tiles with the smallest pixel sum.
        img = img.reshape(img.shape[0] // sz, sz, img.shape[1] // sz, sz, 3)
        img = img.transpose(0, 2, 1, 3, 4).reshape(-1, sz, sz, 3)
        if len(img) < N:
            img = np.pad(img, [[0, N - len(img)], [0, 0], [0, 0], [0, 0]], constant_values=0)
        idxs = np.argsort(img.reshape(img.shape[0], -1).sum(-1))[:N]
        img = img[idxs]

        # 4x4 mosaic of the first 16 tiles: each group of four consecutive
        # tiles is stacked vertically, the four stacks are joined side by side.
        # This indexes tiles 0-15, so N must be at least 16.
        mosaic = cv2.hconcat([
            cv2.vconcat([img[tile] for tile in range(first, first + 4)])
            for first in range(0, 16, 4)
        ])
        mosaic = cv2.cvtColor(mosaic, cv2.COLOR_BGR2GRAY)

        for threshold_tag, threshold in threshold_grid:
            for sigma_tag, sigma in sigma_grid:
                cca_results[(sigma_tag, threshold_tag)].append(
                    connected_components(mosaic, sigma=sigma, threshold=threshold)
                )

        # Release the tile array before the next slide is loaded.
        del img
        gc.collect()

    data['number_of_white_dots'] = number_of_white_dots
    for position, column in enumerate(lbp_columns):
        data[column] = [stats[position] for stats in lbp_stats]

    # Column order: suffix (outer), threshold, sigma (inner) - the same order
    # in which the columns were previously written out one by one.
    for position, suffix in enumerate(cca_suffixes):
        for threshold_tag, _ in threshold_grid:
            for sigma_tag, _ in sigma_grid:
                column = f"number_of_objects_{sigma_tag}_{threshold_tag}_{suffix}"
                data[column] = [
                    result[position] for result in cca_results[(sigma_tag, threshold_tag)]
                ]

    data['data_prov_ind'] = np.where(data['data_provider'] == "radboud" , 1 , 0)

    return data

In [None]:
# Drop six slides from `train` by image_id; every other row is kept unchanged.
# NOTE(review): the reason these IDs are excluded is not recorded here.
train_v1 = train[~train.image_id.isin([
    '033e39459301e97e457232780a314ab7',
    '0b6e34bf65ee0810c1a4bf702b667c88',
    '3385a0f7f4f3e7e7b380325582b115c9',
    '3790f55cad63053e956fb73027179707',
    '5204134e82ce75b1109cc1913d81abc6',
    'a08e24cff451d628df797efc4343e13c',
])]

train_v1.shape

In [None]:
# Take the feature table from an object named `features` that must already exist in the
# kernel (presumably a precomputed feature DataFrame loaded in an earlier cell -- confirm).
# NOTE(review): the next cell rebinds `features` to a list of column names, so re-running
# this cell afterwards would make `train_v2` a list and `.head(10)` would fail.
#train_v2 = feature_engineering_lbp_cca(data = train_v1[0:1] , dir_name = "train_images")
train_v2 = features
train_v2.head(10)

In [None]:
# Candidate model inputs: six global texture statistics, then one
# `number_of_objects_<first>_<second>_<stat>` column for every combination generated
# below, and finally the data-provider indicator.
# The nesting fixes the column order: <stat> varies slowest, <second> next, <first> fastest.
features = (
    [
        'number_of_white_dots',
        'LBP_hist_entropy',
        'LBP_hist_skewness',
        'LBP_hist_kurtosis',
        'LBP_hist_mean',
        'LBP_hist_sd',
    ]
    + [
        f'number_of_objects_{first}_{second}_{stat}'
        for stat in ('ALT_5', 'ALT_10', 'ALT_50', 'ALT_100', 'ALT_500', 'ALT_1000',
                     'ALT_5000', 'biggest_area', 'area_of_FSO')
        for second in range(1, 72, 10)   # 1, 11, 21, ..., 71
        for first in (10, 50, 200)
    ]
    + ['data_prov_ind']
)

In [None]:
#features = ['LBP_hist_sd', 'LBP_hist_skewness', 'number_of_white_dots', 'LBP_hist_entropy',  'LBP_hist_kurtosis' , 'LBP_hist_mean','number_of_objects_50_71_area_of_FSO','data_prov_ind']
#features = ['LBP_hist_sd', 'LBP_hist_skewness', 'number_of_white_dots', 'LBP_hist_entropy',  'LBP_hist_kurtosis' , 'LBP_hist_mean','data_prov_ind','number_of_objects_10_51_ALT_5','number_of_objects_10_41_ALT_5','number_of_objects_50_71_area_of_FSO','number_of_objects_200_61_area_of_FSO',]
#features = ['data_prov_ind','number_of_objects_10_51_ALT_5','number_of_objects_10_41_ALT_5','number_of_objects_50_71_area_of_FSO','number_of_objects_200_61_area_of_FSO']
 #'number_of_objects_10_41_ALT_5', 'number_of_objects_50_71_area_of_FSO',  'number_of_objects_200_61_area_of_FSO','number_of_objects_10_51_ALT_5','data_prov_ind'

In [None]:
def quadratic_weighted_kappa(y_hat, y):
    """
    Args:
        y_hat     array-like    first set of labels (the predictions at every call site here)
        y         array-like    second set of labels (the reference grades)

    Returns:
        kappa     value returned by cohen_kappa_score with weights='quadratic'
    """
    kappa = cohen_kappa_score(y_hat, y, weights='quadratic')
    return kappa

def QWK(preds, dtrain):
    """
    Custom evaluation function passed to lgb.train through `feval`.

    Args:
        preds     numpy.array   raw model outputs; rounded to the nearest integer before scoring
                                (no clipping is applied here)
        dtrain    object        dataset exposing get_label(), which supplies the reference grades

    Returns:
        tuple     ("QWK", score, True); in LightGBM's feval convention the last element
                  marks the metric as higher-is-better
    """
    truth = dtrain.get_label()
    score = quadratic_weighted_kappa(np.rint(preds), truth)
    return "QWK", score, True

# Baseline run on every column currently listed in `features`.
# NOTE: `train` is rebound to the feature matrix here; an earlier cell used the same
# name for the raw training table.
y = train_v2["isup_grade"]
train = train_v2[features]
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, random_state=0
)

train_dataset = lgb.Dataset(X_train, y_train)
valid_dataset = lgb.Dataset(X_test, y_test)

# The grade is fitted as a regression target; predictions are rounded afterwards.
params = dict(
    objective='regression',
    metric='rmse',
    seed=0,
    learning_rate=0.05,
    boosting="gbdt",
    num_leaves=31,
    min_data_in_leaf=300,
    max_depth=-1,
)

# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of older
# LightGBM releases; LightGBM 4.x expects callbacks instead -- confirm the installed version.
model = lgb.train(
    params=params,
    train_set=train_dataset,
    valid_sets=[train_dataset, valid_dataset],
    feval=QWK,
    num_boost_round=15000,
    early_stopping_rounds=200,
    verbose_eval=100,
)

# Hold-out predictions: round to the nearest integer, then clip to [0, 5].
preds = np.clip(
    np.rint(model.predict(X_test, num_iteration=model.best_iteration)),
    0,
    5,
)

In [None]:
# Rank the inputs of the most recently trained model by importance and keep the 100
# strongest as the new `features` list for the next training cell.
importance = model.feature_importance()   # LightGBM's default importance type

# Column names are read back from the booster instead of from `features`: `features` is
# overwritten at the end of this cell, so pairing it with `importance` would raise a
# length mismatch the second time the cell is executed.
feature_names = model.feature_name()
print(importance)
print(feature_names)

fold_importance_df = pd.DataFrame({"feature": feature_names, "importance": importance})
print(fold_importance_df.shape)

# reset_index() keeps the pre-sort position in an `index` column, so the original
# column order stays visible in the displayed table.
all_features = fold_importance_df.sort_values(by="importance", ascending=False).reset_index()
features = list(all_features[0:100]['feature'])
all_features.head(100)

In [None]:
# Second run: retrain on whatever `features` currently holds (the top-100 list when the
# notebook is executed top to bottom) with a smaller learning rate.
# `quadratic_weighted_kappa` and `QWK` are reused from the first training cell; the
# identical copies that shadowed them here were removed.
y = train_v2["isup_grade"]
train = train_v2[features]   # NOTE: rebinds `train` to the feature matrix
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, random_state=0
)

train_dataset = lgb.Dataset(X_train, y_train)
valid_dataset = lgb.Dataset(X_test, y_test)

params = dict(
    objective='regression',
    metric='rmse',
    seed=0,
    learning_rate=0.01,
    boosting="gbdt",
    num_leaves=31,
    min_data_in_leaf=200,
    max_depth=-1,
)

# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of older
# LightGBM releases; LightGBM 4.x expects callbacks instead -- confirm the installed version.
model = lgb.train(
    params=params,
    train_set=train_dataset,
    valid_sets=[train_dataset, valid_dataset],
    feval=QWK,
    num_boost_round=15000,
    early_stopping_rounds=100,
    verbose_eval=100,
)

# Hold-out predictions: round to the nearest integer, then clip to [0, 5].
preds = np.clip(
    np.rint(model.predict(X_test, num_iteration=model.best_iteration)),
    0,
    5,
)

In [None]:
# Rank the inputs of the most recently trained model by importance and keep the 50
# strongest as the new `features` list for the final training cell.
importance = model.feature_importance()   # LightGBM's default importance type

# Column names are read back from the booster instead of from `features`: `features` is
# overwritten at the end of this cell, so pairing it with `importance` would raise a
# length mismatch the second time the cell is executed.
feature_names = model.feature_name()
print(importance)
print(feature_names)

fold_importance_df = pd.DataFrame({"feature": feature_names, "importance": importance})
print(fold_importance_df.shape)

# reset_index() keeps the pre-sort position in an `index` column, so the previous
# ranking order stays visible in the displayed table.
all_features = fold_importance_df.sort_values(by="importance", ascending=False).reset_index()
features = list(all_features[0:50]['feature'])
all_features.head(50)

In [None]:
# Final run: retrain on whatever `features` currently holds (the top-50 list when the
# notebook is executed top to bottom), this time with tree depth capped at 6.
# `quadratic_weighted_kappa` and `QWK` are reused from the first training cell; the
# identical copies that shadowed them here were removed.
# The `model`, `preds` and `y_test` left behind by this cell are what the reporting and
# inference cells below operate on.
y = train_v2["isup_grade"]
train = train_v2[features]   # NOTE: rebinds `train` to the feature matrix
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.2, random_state=0
)

train_dataset = lgb.Dataset(X_train, y_train)
valid_dataset = lgb.Dataset(X_test, y_test)

params = dict(
    objective='regression',
    metric='rmse',
    seed=0,
    learning_rate=0.01,
    boosting="gbdt",
    num_leaves=31,
    min_data_in_leaf=300,
    max_depth=6,
)

# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of older
# LightGBM releases; LightGBM 4.x expects callbacks instead -- confirm the installed version.
model = lgb.train(
    params=params,
    train_set=train_dataset,
    valid_sets=[train_dataset, valid_dataset],
    feval=QWK,
    num_boost_round=15000,
    early_stopping_rounds=100,
    verbose_eval=100,
)

# Hold-out predictions: round to the nearest integer, then clip to [0, 5].
preds = np.clip(
    np.rint(model.predict(X_test, num_iteration=model.best_iteration)),
    0,
    5,
)

In [None]:
# Show the boosting round recorded as best for the most recently trained model; the same
# value is passed as `num_iteration` whenever this model predicts.
model.best_iteration

In [None]:
# Quadratic weighted kappa of the rounded, clipped hold-out predictions (`preds`)
# against the hold-out grades (`y_test`) left by the last training cell that ran.
print("our validation score is" , quadratic_weighted_kappa(preds, y_test))

In [None]:
# NOTE(review): assuming this is sklearn.metrics.confusion_matrix(y_true, y_pred), passing
# (preds, y_test) puts the predicted grades on the rows and the true grades on the
# columns -- confirm this orientation is intended.
print(confusion_matrix(preds,y_test))

In [None]:
from __future__ import print_function
import sys

# Debug aid: print the size reported by sys.getsizeof for every name in the namespace
# that does not start with an underscore.
# The items are copied into a list first because the loop itself binds new names
# (`var`, `obj`) and would otherwise change the dictionary while iterating over it.
local_vars = list(locals().items())
for var, obj in local_vars:
    if var.startswith('_'):
        continue
    print(var, sys.getsizeof(obj))

In [None]:
# Drop the train/validation split objects and trigger a garbage-collection pass before
# the inference code below runs.
# NOTE: `del` raises NameError if this cell is executed a second time in the same kernel,
# because the four names no longer exist.
del X_train ,X_test ,y_train ,y_test 
gc.collect()

def inference(da=test, dir_path="test_images"):
    """
    Predict grades for an already feature-engineered table with the global `model`.

    Args:
        da          pandas.DataFrame   must contain `image_id` and every column named in the
                                       global `features` list; defaults to the global `test`
                                       object as it was bound when this function was defined
        dir_path    str                folder name under ../input/prostate-cancer-grade-assessment;
                                       prediction only happens when that folder exists

    Returns:
        pandas.DataFrame               a new two-column frame (`image_id`, integer `isup_grade`)
                                       when the folder exists; otherwise `da` itself, unchanged

    The input frame is never modified: the prediction column is attached to a new frame
    rather than written into `da`, so neither the caller's data nor the shared default
    `test` frame picks up an extra column as a side effect.
    """
    if not os.path.exists(f'../input/prostate-cancer-grade-assessment/{dir_path}'):
        return da

    print('run inference')

    raw_scores = model.predict(da[features], num_iteration=model.best_iteration)
    # Regression output -> nearest integer, clipped to [0, 5], stored as int.
    grades = np.clip(np.rint(raw_scores), 0, 5).astype(int)

    return da[["image_id"]].assign(isup_grade=grades)

# Reload the raw training table (the name `train` was reused for feature matrices above).
train = pd.read_csv("/kaggle/input/prostate-cancer-grade-assessment/train.csv")

# Dry run: push the first ten training rows through feature extraction and inference.
# The file written here is always overwritten by the branch below.
sub = inference(
    da=feature_engineering_lbp_cca(data=train.head(10), dir_name="train_images"),
    dir_path="train_images",
)
sub['isup_grade'] = sub['isup_grade'].astype(int)
sub.to_csv('submission.csv', index=False)
sub.head()

# Final submission: score the test table when the test image folder exists, otherwise
# fall back to the provided sample submission.
if not os.path.exists('../input/prostate-cancer-grade-assessment/test_images'):
    sub = pd.read_csv("/kaggle/input/prostate-cancer-grade-assessment/sample_submission.csv")
else:
    print("still can not access the test file ?")
    sub = inference(
        da=feature_engineering_lbp_cca(data=test, dir_name="test_images"),
        dir_path="test_images",
    )
    sub['isup_grade'] = sub['isup_grade'].astype(int)
sub.to_csv('submission.csv', index=False)