In [1]:
from PIL import Image
import numpy as np
import os
import re
from utils import process_one_image

Using Theano backend.


In [2]:
def load_data(folder, n_block, image_width, image_height, color_channel = 3):
    '''
    Load every file in a folder into one uint8 image array.

    Each directory entry is passed to ``process_one_image`` and the result
    is stored channel-first in a pre-allocated array. Every 500th image is
    printed and displayed as a progress / sanity check.

    Parameters
    ----------
    :type folder: str
    :param folder: image folder address

    :type n_block: int
    :param n_block: the blocks to keep around base pair of interest

    :type image_width: int
    :param image_width: new image width

    :type image_height: int
    :param image_height: new image height

    :type color_channel: int
    :param color_channel: number of channels allocated per image (default 3)

    returns
    ----------
    :type dataset: np.ndarray of np.uint8
        images after processing, allocated with dimension
        (# of images, channel, width, height). Rows follow the order in
        which os.listdir(folder) returned the files.
    '''
    image_files = os.listdir(folder)
    # Uninitialised buffer; every row is overwritten by the loop below.
    dataset = np.empty(shape=(len(image_files), color_channel, image_width, image_height),
                       dtype=np.uint8)
    for image_index, image in enumerate(image_files):
        image_file = os.path.join(folder, image)
        img = process_one_image(image_file, n_block, image_width, image_height)
        if image_index % 500 == 0:
            # Single formatted argument: identical output on Python 2 and 3.
            print('%d %s' % (image_index, image))
            img.show()
        # Move the last (channel) axis to the front.
        # NOTE(review): if process_one_image returns a PIL image, np.array(img)
        # is (height, width, channel), so this only lines up with the buffer's
        # (width, height) axes when image_width == image_height - confirm.
        dataset[image_index, :, :, :] = np.array(img).transpose((2, 0, 1))
    return dataset


def maybe_npy(data_folders, n_block, image_width, image_height, force=False):
    '''
    Convert each image folder to a ``<folder>.npy`` file unless one exists.

    For every folder the images are loaded with ``load_data`` and written
    with ``np.save``. A folder whose .npy file already exists is skipped
    unless ``force`` is set. A failure while saving is reported and the
    loop continues with the next folder.

    Parameters
    ----------
    :type data_folders: list
    :param data_folders: list of folders of images

    :type n_block: int
    :param n_block: the blocks to keep around base pair of interest

    :type image_width: int
    :param image_width: new image width

    :type image_height: int
    :param image_height: new image height

    :type force: boolean
    :param force: whether overwrite


    returns
    ----------
    :type dataset_names: list
        .npy file name for every folder, in input order. A name is listed
        even when saving it failed, so the file may be missing on disk.
    '''

    dataset_names = []
    for folder in data_folders:
        # Drop trailing path separators so 'data/fail/' maps to
        # 'data/fail.npy' instead of a hidden 'data/fail/.npy' that would
        # sit inside the image folder and be picked up by later listings.
        set_filename = (folder.rstrip('/\\') or folder) + '.npy'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping npying.' % set_filename)
        else:
            print('npying %s.' % set_filename)
            dataset = load_data(folder, n_block, image_width, image_height)
            try:
                np.save(set_filename, dataset)
            except Exception as e:
                # One formatted string: a multi-argument print() call would
                # show a tuple repr under the Python 2 print statement.
                print('Unable to save data to %s : %s' % (set_filename, e))
    return dataset_names

In [22]:
# Preprocessing settings shared by the cells below.
n_block = 2          # blocks to keep around the base pair of interest
image_width = 128
image_height = 128
color_channel = 3

# One folder of images per class.
data_folders = [
    'data/fail',
    'data/snp',
    'data/insertion',
    'data/deletion',
    'data/complex',
    'data/uncertain',
]

# force=True rebuilds every .npy file even when one already exists.
# `dataset` receives the list of .npy file names, not the image arrays.
dataset = maybe_npy(data_folders, n_block, image_width, image_height, force=True)

npying data/fail.npy.
0 1-00738.bam,1-00738-01.bam,1-00738-02.bam,1.112914859.112914899.png
500 1-02452.bam,1-02452-01.bam,1-02452-02.bam,1.3129216.3129256.png
1000 1-04050.bam,1-04050-01.bam,1-04050-02.bam,21.9972034.9972074.png
1500 1-05517.bam,1-05517-01.bam,1-05517-02.bam,9.140843566.140843606.png
2000 1-06425.bam,1-06425-01.bam,1-06425-02.bam,8.43761484.43761524.png
2500 1-06635.bam,1-06635-01.bam,1-06635-02.bam,17.36308168.36308208.png
3000 1-07417.bam,1-07417-01.bam,1-07417-02.bam,20.24642017.24642057.png
npying data/snp.npy.
0 1-00738.bam,1-00738-01.bam,1-00738-02.bam,1.18591822.18591862.png
500 1-03805.bam,1-03805-01.bam,1-03805-02.bam,16.21053637.21053677.png
1000 1-05538.bam,1-05538-01.bam,1-05538-02.bam,7.120265517.120265557.png
1500 1-06577.bam,1-06577-01.bam,1-06577-02.bam,11.27743113.27743153.png
2000 CDH889_GAAACC_BC41TKACXX_L006_001.R1.bwamem.mkdup.bam,CDH890_AAAGCA_BC41TKACXX_L006_001.R1.bwamem.mkdup.bam,CDH891_ATCACG_BC41TKACXX_L006_001.R1.bwamem.mkdup.bam,7.99023086

In [23]:
# Gather the image file names of every class folder, in folder order.
# The names only line up with the rows of the .npy files while os.listdir
# returns the same entries in the same order as when load_data ran.
all_image_names = []
for folder in data_folders:
    image_files = os.listdir(folder)
    all_image_names.extend(image_files)
all_image_names = np.array(all_image_names)

# Single-argument print() calls give the same output on Python 2 and 3.
print(len(all_image_names))
print(dataset)

8523
['data/fail.npy', 'data/snp.npy', 'data/insertion.npy', 'data/deletion.npy', 'data/complex.npy', 'data/uncertain.npy']


In [24]:
# Class id for each per-class .npy file, and the reverse mapping for display.
name_to_label = {'fail.npy': 0, 'snp.npy':1, 'insertion.npy':2,'deletion.npy':3,
                 'complex.npy':4,'uncertain.npy':5, }

label_to_name = {0:'fail', 1:'snp', 2:'insertion', 3:'deletion', 4:'complex', 5:'uncertain'}

# Uninitialised buffers sized from the file listing; the loop below must
# fill every row (checked after the loop).
total_n = len(all_image_names)
train = np.empty(shape=(total_n, color_channel, image_width, image_height),
                 dtype=np.uint8)
label = np.empty((total_n,), dtype="uint8")

# combine npy
start = 0
for data_name in dataset:
    print(data_name)
    data = np.asarray(np.load(data_name), dtype=np.uint8)
    end = data.shape[0]  # number of rows in this file
    train[start:start+end,:,:,:] = data
    # The class id is looked up from the file's base name, e.g. 'fail.npy'.
    label[start:start+end] = name_to_label[os.path.basename(data_name)]
    start += end

# Rows never written would hold arbitrary memory and be shuffled into the
# saved training set, so stop here if the .npy files and the file listing
# disagree (for example stale .npy files).
if start != total_n:
    raise ValueError('Combined %d rows from the .npy files but collected %d '
                     'image names; rebuild the .npy files before combining.'
                     % (start, total_n))

# shuffle
# One permutation applied to images, labels and names keeps them aligned.
# No seed is set, so the order differs from run to run.
index = list(range(total_n))
np.random.shuffle(index)
train = train[index]
label = label[index]
all_image_names = all_image_names[index]
print('%d samples' % train.shape[0])
print(train.shape)
print(label.shape)
np.save('train.npy', train)
np.save('label.npy', label)
np.save('name.npy', all_image_names)

data/fail.npy
data/snp.npy
data/insertion.npy
data/deletion.npy
data/complex.npy
data/uncertain.npy
(8523, ' samples')
(8523, 3, 128, 128)
(8523,)


In [27]:
# Spot-check one shuffled sample: display the image, then print its class
# name and source file name.
X = train
# Move the channel axis last so each sample can be passed to
# Image.fromarray as an 'RGB' image.
X = X.transpose((0, 2, 3, 1))
# NOTE: this rebinds `index`, discarding the shuffle order from the
# combine cell.
index = 7
img = Image.fromarray(X[index],'RGB')
img.show()
# Single-argument print() calls give the same output on Python 2 and 3.
print(label_to_name[label[index]])
print(all_image_names[index])

deletion
SRR3879643.bam,SRR3879714.bam,SRR3879700.bam.11.8711648.png


In [21]:
# Display the label array.
# NOTE(review): this cell's execution count (21) is lower than that of the
# cell that fills `label` (24), so the output recorded below may be stale -
# re-run after the combine cell and confirm every value is a known class id.
label

array([  0,   0, 156, ...,   0,   0,   0], dtype=uint8)