-
Notifications
You must be signed in to change notification settings - Fork 1
/
image_preprocess.py
244 lines (177 loc) · 9.1 KB
/
image_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
'''
Running this code could take a while, especially if done on your laptop.
We would recommend you copy the data from Google Drive to NYU HPC,
change the path declared below and run this.
Inspired from the code given at: https://camelyon17.grand-challenge.org/Data/
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os.path as osp
import glob
from PIL import Image
import openslide
from openslide.deepzoom import DeepZoomGenerator
from pathlib import Path
from scipy.misc import imsave as saveim
from skimage.filters import threshold_otsu
from pandas import HDFStore
import cv2 as cv2
from skimage import io
# If you need more information about how ElementTree package handle XML file, please follow the link:
# https://docs.python.org/3/library/xml.etree.elementtree.html
import xml.etree.ElementTree as et
import pandas as pd
import math
# Locations of the CAMELYON16 data on disk; adjust these to your own setup.
# NOTE: in our case, each folder only contained a couple of images
slide_path = '/avengers/harmanchawla/Downloads/CAMELYON16/training/tumor'
slide_path_normal = '/avengers/harmanchawla/Downloads/CAMELYON16/training/normal'
anno_path = '/avengers/harmanchawla/Downloads/CAMELYON16/training/Lesion_annotations'
BASE_TRUTH_DIR = '/avengers/harmanchawla/Downloads/CAMELYON16/masking'

# Whole-slide images, each group in sorted order.
slide_paths = sorted(glob.glob(osp.join(slide_path, '*.tif')))
slide_paths_normal = sorted(glob.glob(osp.join(slide_path_normal, '*.tif')))

# Tumor slides first, then normal slides.
# (Set this to `slide_paths` alone to work with the tumor slides only.)
slide_paths_total = slide_paths + slide_paths_normal

# Ground-truth mask files (sorted) and annotation XML files (glob order).
BASE_TRUTH_DIRS = sorted(glob.glob(osp.join(BASE_TRUTH_DIR, '*.tif')))
Anno_paths = glob.glob(osp.join(anno_path, '*.xml'))
def convert_xml_df(file):
    '''
    Parse an annotation XML file and convert it into a dataframe.

    Every `Coordinate` element found below an `Annotation` element becomes
    one row holding the annotation's `Name` attribute and the coordinate's
    `Order`, `X` and `Y` attributes. The patch-extraction loop in this file
    uses the X / Y columns to bound the annotated region of a slide.

    @Args - path to an XML file (anything `ElementTree.parse` accepts)
    @Returns - dataframe with columns Name, Order, X, Y; Name and Order are
               kept as the raw attribute strings, X and Y are floats. The
               dataframe is empty (but has the four columns) when the file
               contains no coordinates.
    '''
    dfcols = ['Name', 'Order', 'X', 'Y']
    root = et.parse(file).getroot()

    # Collect plain rows and build the dataframe once at the end: growing a
    # dataframe row by row is quadratic, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    rows = []
    for item in root.iter('Annotation'):
        name = item.attrib.get('Name')
        for coordinate in item.iter('Coordinate'):
            rows.append([
                name,
                coordinate.attrib.get('Order'),
                float(coordinate.attrib.get('X')),
                float(coordinate.attrib.get('Y')),
            ])
    return pd.DataFrame(rows, columns=dfcols)
# desired cropped size of each patch, (width, height) in level-0 pixels
crop_size = [256, 256]

# how many patches to save for every slide
PATCHES_PER_SLIDE = 1000
# cap on random draws per slide, so that a slide which (almost) never yields
# an acceptable patch cannot stall the script forever
MAX_CROP_ATTEMPTS = 100 * PATCHES_PER_SLIDE
# scale between level-0 pixels and the thumbnail used for tissue detection
THUMBNAIL_DOWNSAMPLE = 256
# a tumor patch is kept when more than this fraction of its mask is non-zero
MIN_TUMOR_FRACTION = 0.5
# a normal patch is kept when more than this fraction of it passes the
# HSV tissue threshold
MIN_TISSUE_FRACTION = 0.2


# The crop helpers are defined BEFORE the processing loop below: the loop runs
# at module level, so calling them ahead of their `def` raises NameError.

def _random_origin(bbox, crop_size):
    '''
    Draw a random top-left corner for a patch inside a bounding box.
    @Args - bbox as (xmin, xmax, ymin, ymax), crop_size as (width, height)
    @Returns - [x, y] such that the patch stays inside bbox; when bbox is
               smaller than the patch along an axis, the bbox minimum is
               used for that axis instead of raising ValueError.
    '''
    dx, dy = crop_size
    x_min, x_max, y_min, y_max = (int(v) for v in bbox)
    x = int(np.random.randint(x_min, max(x_min, x_max - dx) + 1))
    y = int(np.random.randint(y_min, max(y_min, y_max - dy) + 1))
    return [x, y]


def _tissue_binary(rgb_image, thresh):
    '''
    Apply the HSV thresholds to a patch.
    @Args - rgb_image as returned by slide.read_region, thresh = [min, max]
    @Returns - cv2.inRange result (non-zero where the pixel is inside thresh)
    '''
    # NOTE(review): the pixels arrive from PIL in RGB(A) order but are
    # converted with COLOR_BGR2HSV. The thumbnail used to derive `thresh` is
    # converted the same way, so both sides agree - confirm before changing
    # either one.
    hsv_rgbimage = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_BGR2HSV)
    return cv2.inRange(hsv_rgbimage, thresh[0], thresh[1])


# cropping tumor images
def random_crop(slide, truth, thresh, crop_size, bbox):
    '''
    Read one random patch from a tumor slide together with its mask.
    @Args - slide: opened slide, truth: opened mask slide,
            thresh: [min_hsv, max_hsv], crop_size: (width, height),
            bbox: (min x, max x, min y, max y) of the annotations
    @Returns - (rgb_image, rgb_binary, rgb_mask, index) where rgb_image is
               the patch read at level 0, rgb_binary the thresholded patch,
               rgb_mask a 0/1 integer array that is 1 where the mask slide
               is non-zero, and index the [x, y] origin of the patch.
    '''
    index = _random_origin(bbox, crop_size)
    location = tuple(index)
    size = tuple(crop_size)
    rgb_image = slide.read_region(location, 0, size)
    mask_image = truth.read_region(location, 0, size)
    rgb_mask = (cv2.cvtColor(np.array(mask_image), cv2.COLOR_RGB2GRAY) > 0).astype(int)
    rgb_binary = _tissue_binary(rgb_image, thresh)
    return (rgb_image, rgb_binary, rgb_mask, index)


# cropping normal images (Pretty much same as above, but we don't need a mask.)
def random_crop_normal(slide, thresh, crop_size, bboxt):
    '''
    Read one random patch from a normal slide.
    @Args - slide: opened slide, thresh: [min_hsv, max_hsv],
            crop_size: (width, height),
            bboxt: (min x, max x, min y, max y) of the detected tissue
    @Returns - (rgb_image, rgb_binary, index), see random_crop
    '''
    index = _random_origin(bboxt, crop_size)
    rgb_image = slide.read_region(tuple(index), 0, tuple(crop_size))
    rgb_binary = _tissue_binary(rgb_image, thresh)
    return (rgb_image, rgb_binary, index)


def _tissue_threshold_and_bbox(slide):
    '''
    Derive the HSV thresholds and the tissue bounding box from a thumbnail.
    @Args - slide: opened slide
    @Returns - (thresh, bboxt): thresh = [min_hsv, max_hsv]; bboxt is
               (xmin, xmax, ymin, ymax) scaled back by THUMBNAIL_DOWNSAMPLE,
               or None when no contour was found in the thumbnail.
    '''
    width, height = slide.dimensions
    thumb_size = (max(1, width // THUMBNAIL_DOWNSAMPLE),
                  max(1, height // THUMBNAIL_DOWNSAMPLE))
    thum = np.array(slide.get_thumbnail(thumb_size))
    hsv_image = cv2.cvtColor(thum, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv_image)
    # Otsu threshold performed on each channel; the min value for v (70)
    # can be changed later
    minhsv = np.array([threshold_otsu(h), threshold_otsu(s), 70], np.uint8)
    maxhsv = np.array([180, 255, threshold_otsu(v)], np.uint8)
    thresh = [minhsv, maxhsv]

    # extracting the contours of the tissue
    rgbbinary = cv2.inRange(hsv_image, thresh[0], thresh[1])
    # findContours returns 3 values in OpenCV 3 and 2 values in OpenCV 2/4;
    # the contours are the second-to-last item in every version.
    contours = cv2.findContours(rgbbinary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        return thresh, None

    # boundingRect gives (x, y, w, h) per contour; take the overall extent.
    rects = np.array([cv2.boundingRect(c) for c in contours])
    x0, y0 = rects[:, 0], rects[:, 1]
    x1, y1 = x0 + rects[:, 2], y0 + rects[:, 3]
    bboxt = tuple(int(value) * THUMBNAIL_DOWNSAMPLE
                  for value in (x0.min(), x1.max(), y0.min(), y1.max()))
    return thresh, bboxt


def _save_tumor_patches(slide, slide_file, thresh):
    '''
    Save up to PATCHES_PER_SLIDE tumor patches and their masks for a slide.
    @Args - slide: opened slide, slide_file: its path, thresh: HSV thresholds
    @Returns - number of patches saved
    '''
    file_name = osp.basename(slide_file)
    slide_name = osp.splitext(file_name)[0]
    # get the paths to the mask and annotations
    truth_slide_path = base_truth_dir / file_name.replace('.tif', '_mask.tif')
    Anno_pathxml = anno_path / file_name.replace('.tif', '.xml')

    annotations = convert_xml_df(str(Anno_pathxml))
    if len(annotations) == 0:
        print('no annotation coordinates for %s, skipping' % slide_name)
        return 0
    # the annotation extent bounds where patches are drawn from
    x_values = annotations['X'].values.astype(float)
    y_values = annotations['Y'].values.astype(float)
    bbox = (math.floor(x_values.min()), math.floor(x_values.max()),
            math.floor(y_values.min()), math.floor(y_values.max()))

    min_tumor_pixels = crop_size[0] * crop_size[1] * MIN_TUMOR_FRACTION
    saved = 0
    with openslide.open_slide(str(truth_slide_path)) as truth:
        for _attempt in range(MAX_CROP_ATTEMPTS):
            if saved >= PATCHES_PER_SLIDE:
                break
            rgb_image, _binary, rgb_mask, index = random_crop(slide, truth, thresh, crop_size, bbox)
            # np.count_nonzero works for the integer mask on every platform
            # (cv2.countNonZero rejects 64-bit integer arrays).
            if np.count_nonzero(rgb_mask) > min_tumor_pixels:
                rgb_image.save('/avengers/harmanchawla/Downloads/test/tumor/%s_%d_%d.png' % (slide_name, index[0], index[1]))
                # store the 0/1 mask as 8-bit so that it can be written as PNG
                io.imsave('/avengers/harmanchawla/Downloads/test/mask/%s_%d_%d_mask.png' % (slide_name, index[0], index[1]), rgb_mask.astype(np.uint8))
                saved += 1
    return saved


def _save_normal_patches(slide, slide_file, thresh, bboxt):
    '''
    Save up to PATCHES_PER_SLIDE normal patches (with all-zero masks).
    @Args - slide: opened slide, slide_file: its path, thresh: HSV
            thresholds, bboxt: tissue bounding box at level 0
    @Returns - number of patches saved
    '''
    slide_name = osp.splitext(osp.basename(slide_file))[0]
    min_tissue_pixels = crop_size[0] * crop_size[1] * MIN_TISSUE_FRACTION
    # normal slides contain no tumor, so every mask is empty (rows = height)
    nmask = np.zeros((crop_size[1], crop_size[0]), dtype=np.uint8)
    saved = 0
    for _attempt in range(MAX_CROP_ATTEMPTS):
        if saved >= PATCHES_PER_SLIDE:
            break
        rgb_image, rgb_binary, index = random_crop_normal(slide, thresh, crop_size, bboxt)
        if np.count_nonzero(rgb_binary) > min_tissue_pixels:
            rgb_image.save('/avengers/harmanchawla/Downloads/test/normal/%s_%d_%d.png' % (slide_name, index[0], index[1]))
            io.imsave('/avengers/harmanchawla/Downloads/test/nmask/%s_%d_%d_mask.png' % (slide_name, index[0], index[1]), nmask)
            saved += 1
    return saved


base_truth_dir = Path(BASE_TRUTH_DIR)
anno_path = Path(anno_path)

# for each slide for which we have been given a path (tumor AND normal)
for slide_file in slide_paths_total:
    # checking if the name contains tumor; it selects the branch below
    slide_contains_tumor = osp.basename(slide_file).startswith('tumor_')
    # the slide is opened once and closed when its patches are done
    with openslide.open_slide(slide_file) as slide:
        thresh, bboxt = _tissue_threshold_and_bbox(slide)
        if slide_contains_tumor:
            saved_count = _save_tumor_patches(slide, slide_file, thresh)
        elif bboxt is None:
            print('no tissue found in %s, skipping' % slide_file)
            saved_count = 0
        else:
            saved_count = _save_normal_patches(slide, slide_file, thresh, bboxt)
    if saved_count < PATCHES_PER_SLIDE:
        print('%s: saved only %d of %d patches' % (slide_file, saved_count, PATCHES_PER_SLIDE))
def testduplicates(list):
for each in list:
count = list.count(each)
if count > 1:
z = 0
else:
z = 1
return z