# Script created to sort out a few technical objectives

\#|Objective|Status
-|-----------------------------------------|----------------------------------------
1|Verify that I can write _submission.csv_ in correct format|Created dummy _submission.csv_, which is accepted with a zero score, so I know format is OK
2|Figure out organization of training data|Verified that every _id_ in _train.csv_ matches one image in _train_ folder
3|Verify that I can read training images|Read and display. Seismic colour map looks best
4|Study cell types to determine whether we should segment each type separately. This would entail a classification step where we learn the cell type|On hold: I expect that I'd need to segment first.
5|Parse and plot annotations|Working
6|Downsample images|Plot now shows rescale and downsize
7|Fourier Transform images|TBD


## Set up libraries

In [None]:
from glob              import escape,glob
from matplotlib.pyplot import figure, imread, get_cmap
from numpy             import float32, zeros
from os                import walk
from os.path           import basename, join
from pandas            import read_csv, DataFrame
from random            import sample, seed
from skimage.transform import rescale, resize, downscale_local_mean

## Set up environment variables

In [None]:
# Root folder of the dataset; every other path below is derived from it
DATA_PATH             = '../input/sartorius-cell-instance-segmentation'
TRAIN_PATH            = join(DATA_PATH,'train')                 # Folder of training images, one <id>.png per id
TRAIN_CSV_PATH        = join(DATA_PATH,'train.csv')             # Annotations and metadata for the training images
TRAIN_SEMI_SUPERVISED = join(DATA_PATH,'train_semi_supervised') # Folder of additional sample images (*.png)
TEST_PATH             = join(DATA_PATH,'test')                  # Folder of images listed in the submission file

## Set up Hyperparameters

In [None]:
VERIFY_CONSISTENCY  = True      # Run the data consistency checks
                                # Disable this when we are in production
COLOUR_MAP          = 'seismic' # Established by trial and error
                                # This colour map looks best to me - YMMV
RANDOM_SEED         = None      # Used to initialize random number generator
                                # set to None to use current system time
PLOTS_PER_CELL_TYPE = 6         # Number of slides to plot for each cell type


## Initialize random number generator

In [None]:
# seed(None) already seeds from the current system time (or an OS-provided
# randomness source), so the None case needs no separate branch and no
# equality comparison against None.
seed(RANDOM_SEED)

## Format of *train.csv*

I have used an asterisk (*) to mark fields that are relevant to training.

Field||Description
-----------------|-|----------------------------------------------------------------------
id|*|Identifies image in training dataset. This is not a primary key, as the annotation may  span multiple records. However, the mapping from _id_ to any field other than annotation is 1 to 1.
annotation|*|Run length encoded. We need to append annotations for all records belonging to one id. E.g. 118145 6 118849 7 119553 8 120257 8 120961 9 121665 10...
width||Width of image in pixels
height||Height of image in pixels
cell_type||_shsy5y_ or _astro_ or _cort_
plate_time||Used with *sample_id* and *sample_date* to link into *train_semi_supervised*
sample_date||Used with *sample_id* and *plate_time* to link into *train_semi_supervised*
sample_id||Links into *train_semi_supervised*, along with *sample_date* and *plate_time*
elapsed_timedelta||




## Read training data

In [None]:
# Annotation records from train.csv; several records may share one id
target_data    = read_csv(TRAIN_CSV_PATH)
# Distinct image ids referenced by the CSV, in ascending order
target_ids     = sorted(target_data.id.unique())
# Ids of the image files found under TRAIN_PATH (file name up to the first '.').
# Strip the extension first and sort the ids themselves, across all folders:
# sorting the full file names can give a different order (e.g. 'ab-c.png'
# sorts before 'ab.png', but 'ab' sorts before 'ab-c'), and the lock-step
# comparison against target_ids needs both lists in the same order.
training_ids   = sorted(name.split('.')[0] for _, _, names in walk(TRAIN_PATH) for name in names)

## Construct mapping from id to annotations

In [None]:
def accumulate(id,annotation):
    """Merge all run-length encoded annotations belonging to one image id.

    Parameters:
        id          Identifier that is passed through unchanged
        annotation  Iterable of strings, each a whitespace separated list
                    of integers read as consecutive (position, count) pairs

    Returns:
        Tuple (id, runs) where runs is the sorted list of every
        (position, count) pair found in any of the strings.

    Raises:
        ValueError  if a token is not an integer
        IndexError  if a string holds an odd number of integers
    """
    runs = []
    for encoded in annotation:
        numbers = list(map(int, encoded.split()))
        # Walk the numbers two at a time: position followed by its count
        for k in range(0, len(numbers), 2):
            runs.append((numbers[k], numbers[k+1]))
    runs.sort()
    return (id, runs)

# One (id, sorted runs) tuple per image id.
# Group by the column name, not by a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples such as ('abc',) as group keys, so the ids
# stored here would never compare equal to a plain id string.
accumulated_data = [accumulate(id,row.annotation) for id,row in target_data.groupby('id')]
# The list of 2-tuples maps directly onto the two columns
df_targets       = DataFrame(accumulated_data, columns=['id','annotations'])


## Test Hypothesis

Every id in train.csv matches one image in train folder. This code block verifies that this is correct.

In [None]:
if VERIFY_CONSISTENCY:
    # Walk both lists in step (both are expected to be in ascending order),
    # reporting every id that is present in one list but not in the other.
    i = 0
    j = 0
    mismatches = 0
    assert (len(target_ids) ==len(training_ids))
    while i<len(target_ids) and j <len(training_ids):
        if target_ids[i]==training_ids[j]:
            i += 1
            j += 1
            continue
        print (f'Mismatch {i} {target_ids[i]} {j} {training_ids[j]}')
        mismatches += 1
        # Advance whichever side is behind so the lists can re-synchronize
        if target_ids[i]<training_ids[j]:
            i += 1
        else: # target_ids[i]>training_ids[j]
            j += 1
    # i==j alone is not sufficient: one skip on each side leaves the indices
    # equal although two ids failed to match, so count mismatches explicitly.
    assert i==j and mismatches==0, f'{mismatches} ids failed to match'
    print (f'All {i} target ids from CSV file match the training images')

## Test Hypothesis

For each id in _train.csv_, all columns share the same value apart from _annotation_.

In [None]:
if VERIFY_CONSISTENCY:
    def verify_consistency(id,target_data):
        """Check that the records for one id agree on every metadata column.

        Parameters:
            id           Image id whose records are checked
            target_data  DataFrame holding the records, with an 'id' column

        Returns:
            True if every column other than 'id' and 'annotation' holds a
            single distinct value across the records of this id. Each
            offending column is reported with print.
        """
        rows_for_id   = target_data[target_data.id==id]
        is_consistent = True
        for column in rows_for_id.columns:
            # 'id' is the selector and 'annotation' is expected to vary
            if column not in ('id','annotation') and len(rows_for_id[column].unique())>1:
                print (f'Non unique values in {column} for id={id}')
                is_consistent = False
        return is_consistent

    # Every id must pass, otherwise the hypothesis is rejected
    consistent = [id for id in target_ids if verify_consistency(id,target_data)]
    assert len(consistent)==len(target_ids)
    print ('Columns are consistent')

## Prepare lists of ids grouped by Cell types

In [None]:
# Map each cell type to the list of image ids of that type.
# Group by the column name, not by a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples such as ('abc',) as group keys, and these ids
# are later used to build image file names and to look up annotations.
cell_types   = {}
for id,row in target_data.groupby('id'):
    # All records of an id are expected to share one cell type; use the first
    cell_type = row.cell_type.unique()[0]
    cell_types.setdefault(cell_type,[]).append(id)

# Number of images per cell type, shown as a bar chart
stats = {cell_type:len(ids) for cell_type,ids in cell_types.items()}
fig = figure(figsize=(5,5))
ax  = fig.subplots()
ax.bar(list(stats.keys()),list(stats.values()))
ax.set_title('Cell Types')


## Sample Ids

I'm trying to work out the way that the files in *train_semi_supervised* are associated with training data. The code below shows that there are many more sample files than training files. I don't think that *train_semi_supervised* will be of any use.

In [None]:
# Per-image lookup tables, keyed by image id, for the three fields that link
# a training image to the files in train_semi_supervised.
sample_ids   = {}
sample_dates = {}
plate_times  = {}

# Group by the column name, not by a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples such as ('abc',) as group keys, so lookups
# by a plain id would fail.
for id,row in target_data.groupby('id'):
    # All records of an id are expected to share these values; use the first
    sample_ids[id]   = row.sample_id.unique()[0]
    sample_dates[id] = row.sample_date.unique()[0]
    plate_times[id]  = row.plate_time.unique()[0]

# Names of the sample files without the '.png' extension. The separator must
# be '.', splitting on the full path f never matches inside the base name and
# therefore left the extension in place.
sample_id_files = [basename(f).rsplit('.',1)[0] for f in glob(join(TRAIN_SEMI_SUPERVISED,'*.png'))]

print (f'For {len(sample_ids)} training files we have {len(sample_id_files)} sample files')

## Sample Ids (continued)

Find out whether the file specified in sample_id ever matches one in train_semi_supervised. Based on the next code block, it looks as if they
never match.

In [None]:
# get_phase
#
# Extract the phase number from the name of a sample file: the text after
# the last underscore of the base name, up to the first dot, as an integer.
#
# For example, the file
#     shsy5y[diff]_E1-4_Vessel-714_2019-06-14_11h30m00s_Ph_3.png
# has a phase of 3
#
# Raises ValueError if that text is not an integer.

def get_phase(file_name):
    last_component = basename(file_name).rpartition('_')[2]
    phase_text     = last_component.partition('.')[0]
    return int(phase_text)
 
# get_associated_samples
#
# Given an id, find all associated sample files in train_semi_supervised
#
# The sample_id recorded for the id is split at '_Ph_': the part before it,
# together with the sample date and plate time of the id, forms the file
# name pattern; the part after it is the phase.
#
# Returns:
#    samples   List of associated sample files for id
#    phase     The phase specified for the id
#    phases    The actual phase of each sample file

def get_associated_samples(id):
    parts   = sample_ids[id].split('_Ph_')
    phase   = int(parts[1])
    # escape() keeps glob from treating characters such as '[' in the
    # sample name as wildcards
    prefix  = escape(parts[0])
    pattern = join(TRAIN_SEMI_SUPERVISED, f'{prefix}_{sample_dates[id]}_{plate_times[id]}*.png')
    samples = glob(pattern)
    phases  = list(map(get_phase, samples))
    return samples,phase,phases

def get_matching_phases(id):
    """Return True if the phase specified for id occurs among the phases
    of the sample files associated with id."""
    associated = get_associated_samples(id)
    expected   = associated[1]
    found      = associated[2]
    return expected in found

# Count the ids, over all cell types, whose specified phase is present among
# their associated sample files (each True counts as 1)
matches = sum(get_matching_phases(id) for ids in cell_types.values() for id in ids)
print (f'There are {matches} instances where phase was found in phases')


## Visualize a few images

In [None]:
def get_cell_type(id, target_data):
    """Return the cell type recorded for an image id.

    Parameters:
        id           Image id to look up
        target_data  DataFrame with 'id' and 'cell_type' columns

    Returns:
        The first distinct cell_type value among the records of id.

    Raises:
        IndexError if there is no record for id
    """
    rows_for_id    = target_data.loc[target_data.id == id]
    distinct_types = rows_for_id['cell_type'].unique()
    return distinct_types[0]

# get_mask
# Inspired by https://www.kaggle.com/xinruizhan/save-seg-image
def get_mask(template,annotation):
    """Build a mask from run-length encoded annotations.

    Parameters:
        template    Array whose first two dimensions give the mask shape
        annotation  Object whose item() method returns an iterable of
                    (position, count) pairs; positions are 1-based offsets
                    into the flattened mask

    Returns:
        float32 array of shape (template.shape[0], template.shape[1])
        holding 1 inside every run and 0 elsewhere.

    Raises:
        IndexError if a run starts before the first pixel or extends past
        the last one
    """
    rows, columns = template.shape[0], template.shape[1]
    flat = zeros(rows * columns, dtype=float32)
    for pos,count in annotation.item():
        if count <= 0:
            continue              # An empty run marks nothing
        start = pos - 1           # Convert the 1-based position to an index
        stop  = start + count
        # Reject runs outside the image explicitly: a slice would clip them
        # silently, and a negative index would wrap around to the end.
        if start < 0 or stop > flat.size:
            raise IndexError(f'run ({pos}, {count}) is outside a mask of {flat.size} pixels')
        # One slice assignment per run instead of one assignment per pixel
        flat[start:stop] = 1
    return flat.reshape((rows, columns))

def read_and_display(id):
    """Read one training image and show it in a 2x2 figure.

    The panels are: the image as read, the mask built from its annotations,
    the image resized to a quarter of its rows and a fifth of its columns,
    and the image downscaled by local means over 4x5 blocks. Each panel
    uses COLOUR_MAP, scaled to the value range of its own data.

    Parameters:
        id   Id of the training image; the file read is <id>.png in TRAIN_PATH
    """
    img              = imread(join(TRAIN_PATH,f'{id}.png'))
    annotations      = df_targets.loc[df_targets.id==id]['annotations']
    mask             = get_mask(img,annotations)
    image_resized    = resize(img, (img.shape[0] // 4, img.shape[1] // 5),  anti_aliasing=True)
    image_downscaled = downscale_local_mean(img, (4, 5))
    axes             = figure(figsize=(20,20)).subplots(nrows=2,ncols=2)

    # (axis, data to show, title); the mask panel has no title
    panels = (
        (axes[0][0], img,              f'{id}, cell type {get_cell_type(id, target_data)}'),
        (axes[0][1], mask,             None),
        (axes[1][0], image_resized,    f'{id} resized'),
        (axes[1][1], image_downscaled, f'{id} downscaled'),
    )
    for axis,picture,title in panels:
        axis.imshow(picture,
                    cmap   = get_cmap(COLOUR_MAP),
                    origin = 'upper',
                    vmax   = picture.max(),
                    vmin   = picture.min())
        if title is not None:
            axis.set_title(title)
   

# Display a random selection of images for every cell type
for ids in cell_types.values():
    # Never request more images than the cell type has: sample() raises
    # ValueError when asked for more items than the population holds.
    for image_id in sample(ids, min(PLOTS_PER_CELL_TYPE, len(ids))):
        read_and_display(image_id)


## Segment data -- currently this is a stub for creating submission file

In [None]:
def segment(keys):
    """Stub segmentation: pair every image id with one fixed prediction.

    Parameters:
        keys   Iterable of image ids

    Returns:
        List of [image_id, prediction] lists, one per id and in the order
        given, where the prediction is always the same run-length encoded
        placeholder string.
    """
    placeholder = '118145 6 118849 7'
    rows = []
    for image_id in keys:
        rows.append([image_id, placeholder])
    return rows

## Create Submission file

In [None]:
# Map the id of every test image (file name up to the first '.') to its
# full path, visiting the files of each folder in sorted order
testing_agenda = {
    filename.split('.')[0]: join(dirname, filename)
    for dirname, _, filenames in walk(TEST_PATH)
    for filename in sorted(filenames)
}

# One [id, predicted] row per test image
data        = segment(testing_agenda.keys())

# Write the rows without the index column, then preview the first few
xsubmission = DataFrame(data, columns=['id','predicted'])
xsubmission.to_csv('submission.csv', index=False)
xsubmission.head()