In [3]:
import numpy as np  
import pandas as pd  
import glob
from PIL import ImageFilter, ImageStat, Image, ImageDraw
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import cv2
from PIL import ImageFilter, ImageStat, Image, ImageDraw
from multiprocessing import Pool, cpu_count
from sklearn.preprocessing import LabelEncoder

#### Glob patterns locating the image files (all relative to ROOT_DIR)
ROOT_DIR = 'data/cervix/'
DIR_TRAIN = '{0}train/**/*.jpg'.format(ROOT_DIR)
DIR_TEST = '{0}test/test/*.jpg'.format(ROOT_DIR)
DIR_ADDITIONAL_TYPE = '{0}additional_train/**/*.jpg'.format(ROOT_DIR)
DIR_ROI = '{0}roi/**/*.jpg'.format(ROOT_DIR)

#### Output file names for the numpy arrays written by np.save
train_npy = 'train_basic_data_roi.npy'          # train image array
train_npy_target = 'train_target_roi.npy'       # encoded train labels
test_npy = 'test_basic_data_roi.npy'            # test image array




def _index_images(pattern):
    """Build a DataFrame listing every file matched by a glob pattern.

    Parameters
    ----------
    pattern : str
        Glob pattern, e.g. ``DIR_TRAIN``.

    Returns
    -------
    pandas.DataFrame
        One row per matched file with the columns ``type`` (name of the
        file's parent directory), ``image`` (the file name) and ``path``
        (the path exactly as returned by ``glob.glob``).
    """
    import os  # local import so this cell does not depend on an edit to the import cell

    rows = []
    for p in glob.glob(pattern):
        # os.path handles the platform's separator; splitting on '/' by hand
        # mis-parses paths wherever glob returns a different separator.
        parent, fname = os.path.split(p)
        rows.append([os.path.basename(parent), fname, p])
    return pd.DataFrame(rows, columns=['type', 'image', 'path'])


print('getting dir ...')
# Index the train and additional-train images as (type, image, path).
train1 = _index_images(DIR_TRAIN)
train2 = _index_images(DIR_ADDITIONAL_TYPE)

# Train data with ROI
roi = _index_images(DIR_ROI)
frames_roi = [train1, train2, roi]
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each source frame's own row labels.
train = pd.concat(frames_roi, ignore_index=True)
print('train with roi ... ', len(train))


# Index the test images with the same three columns.
test = _index_images(DIR_TEST)






def im_multi(path):
    """Read the dimensions of one image file.

    Parameters
    ----------
    path : str
        Location of the image file to open with PIL.

    Returns
    -------
    list
        ``[path, {'size': size}]`` where ``size`` is the ``size`` attribute
        of the opened PIL image, or ``[0, 0]`` when opening the file failed.
    """
    try:
        # The context manager closes the underlying file as soon as the size
        # has been read, so handles do not pile up across many images.
        with Image.open(path) as im_stats_im_:
            size = im_stats_im_.size
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit can still stop the run.
        print('exception .... ', path)
        return [path, {'size': [0, 0]}]
    return [path, {'size': size}]

def im_stats(im_stats_df):
    """Add a ``size`` column describing each image's dimensions.

    ``im_multi`` is mapped over the ``path`` column in a pool of
    ``cpu_count()`` worker processes. Each reported size is stored as a
    space-separated string; images that could not be opened end up as
    ``'0 0'``.

    Parameters
    ----------
    im_stats_df : pandas.DataFrame
        Frame with a ``path`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, now carrying the ``size`` column.
    """
    p = Pool(cpu_count())
    try:
        ret = p.map(im_multi, im_stats_df['path'])
    finally:
        # Always release the worker processes, even if a worker raised.
        p.close()
        p.join()
    # Each result is a [path, stats] pair; index the stats by path.
    im_stats_d = {path: stats for path, stats in ret}
    im_stats_df['size'] = im_stats_df['path'].map(lambda x: ' '.join(str(s) for s in im_stats_d[x]['size']))
    return im_stats_df

def get_im_cv2(path, size=(128, 128)):
    """Load an image with OpenCV and resize it.

    Parameters
    ----------
    path : str
        Location of the image file.
    size : tuple of int, optional
        Target size handed to ``cv2.resize`` as ``dsize``.
        Defaults to ``(128, 128)``.

    Returns
    -------
    list
        ``[path, resized]`` where ``resized`` is the array returned by
        ``cv2.resize``.

    Raises
    ------
    IOError
        If ``cv2.imread`` returns ``None`` for ``path``.
    """
    img = cv2.imread(path)
    if img is None:
        # Fail with the offending path instead of an opaque error from resize.
        raise IOError('cv2.imread could not read image: {0}'.format(path))
    # The third positional parameter of cv2.resize is `dst`, so the
    # interpolation flag has to be passed by keyword.
    resized = cv2.resize(img, size, interpolation=cv2.INTER_LINEAR)
    return [path, resized]

def normalize_image_features(paths):
    """Load, resize and scale a collection of images into one float array.

    ``get_im_cv2`` is mapped over ``paths`` in a pool of ``cpu_count()``
    worker processes. The resulting images are stacked, axes are reordered
    with ``transpose((0, 3, 1, 2))`` and the values are divided by 255.

    Parameters
    ----------
    paths : iterable of str
        Image file locations; the output keeps this order.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one entry per path.
        NOTE(review): presumably (n, channels, height, width), assuming
        cv2 returns height x width x channels arrays -- confirm.
    """
    p = Pool(cpu_count())
    try:
        ret = p.map(get_im_cv2, paths)
    finally:
        # Always release the worker processes, even if a worker raised.
        p.close()
        p.join()
    # Pool.map returns results in input order, so the images can be stacked
    # directly without a path -> image lookup table.
    fdata = np.array([img for _, img in ret], dtype=np.uint8)
    del ret  # drop the per-image arrays now that they are stacked
    fdata = fdata.transpose((0, 3, 1, 2))
    fdata = fdata.astype('float32')
    fdata /= 255  # in place: avoids allocating a second float32 array
    return fdata


##### writing train data
print('getting train data ...')
train = im_stats(train)
# A size of '0 0' marks an image that could not be opened.
train = train[train['size'] != '0 0'].reset_index(drop=True) #remove bad images
print('normalizing train data ...')
train_data = normalize_image_features(train['path'])
print('saving train data as ...', train_npy)
np.save(train_npy, train_data, allow_pickle=True, fix_imports=True)


##### writing train labels
print('encoding train labels ...')
le = LabelEncoder()
# Labels come from the 'type' column of the filtered frame, so they stay
# aligned row-for-row with train_data.
train_target = le.fit_transform(train['type'].values)
print('label encoding classes for train data ... ', le.classes_) #in case not 1 to 3 order
np.save(train_npy_target, train_target, allow_pickle=True, fix_imports=True)

##### writing test data
print('getting test data ...')
test = im_stats(test)
test = test[test['size'] != '0 0'].reset_index(drop=True) #remove bad images
print('normalizing test data ...')
test_data = normalize_image_features(test['path'])
print('saving test data as ...', test_npy)
np.save(test_npy, test_data, allow_pickle=True, fix_imports=True)






getting dir ...
getting test data ...
normalizing test data ...
('saving train data as ...', 'test_basic_data_roi.npy')


In [5]:
# Show the last ten rows of the test image index as a quick sanity check.
test.tail(n=10)

Unnamed: 0,type,image,path,size
502,test,423.jpg,data/cervix/test/test/423.jpg,3264 2448
503,test,14.jpg,data/cervix/test/test/14.jpg,2448 3264
504,test,135.jpg,data/cervix/test/test/135.jpg,3096 4128
505,test,74.jpg,data/cervix/test/test/74.jpg,3264 2448
506,test,190.jpg,data/cervix/test/test/190.jpg,3096 4128
507,test,404.jpg,data/cervix/test/test/404.jpg,3096 4128
508,test,438.jpg,data/cervix/test/test/438.jpg,3096 4128
509,test,111.jpg,data/cervix/test/test/111.jpg,2448 3264
510,test,367.jpg,data/cervix/test/test/367.jpg,3096 4128
511,test,361.jpg,data/cervix/test/test/361.jpg,2448 3264
