In [None]:
%matplotlib inline
from ccount import load_blobs_db, show_rand_crops, blobs_stat, pca_tsne, parse_blobs, area_calculation, sub_sample
from ccount import save_blobs_db
import glob
import matplotlib
import matplotlib.pyplot as plt
from os import environ
import numpy as np
from math import sqrt
import pandas as pd
from pathlib import Path
import random
# Create the output directories; existing ones are left untouched.
for out_dir in ("blobs", "split_blobs", "report"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

In [None]:
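# Command to run this notebook non-interactively (the `dirname` environment variable selects the experiment directory under ../data2/):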
# dirname=IL17a_Fusion_CFUe_28FEB20 runipy merge.downsample.area_calcu.view.save.ipynb IL17a_Fusion_CFUe_28FEB20.ipynb

In [None]:
# Params
# Target number of usable (good) blobs to keep for each experiment.
n_total_good_blobs_per_exp = 13000
# Load twice as many, assuming ~50% good blobs (not too large/small).
n_total_blobs_per_exp = int(n_total_good_blobs_per_exp * 2)

# Area thresholds, in pixels, used to classify blobs by size.
min_area = 500  # default 500
max_area = 5000  # default 5000
small_blobs_percent_kept = 0  # 0 for best performance, 0.05 for robustness

# Experiment name: read from the `dirname` environment variable when it is
# set (runipy runs), otherwise fall back to the notebook default.
name = environ.get('dirname', "E2F4_CFUe_14JUN19")

In [None]:
# get num of blobs to subsample from each experiment
# Count the blob files of this experiment; every file gets an equal share of
# the per-experiment blob budget.
n_exp = len(glob.glob('../data2/' + name + '/*.npy.gz'))
if n_exp == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError
    # when the experiment directory is missing, empty or misspelled.
    raise FileNotFoundError(
        "no *.npy.gz blob files found in ../data2/" + name)
n_each_image = n_total_blobs_per_exp // n_exp  # floor division, stays an int
print(n_exp, n_each_image)

In [None]:
# read all blobs from dir
# glob returns matches in arbitrary (filesystem-dependent) order, so sort the
# paths to make the row order of the merged array the same on every machine.
list_crops = []
blob_paths = sorted(glob.glob('../data2/' + name + '/*.npy.gz'))
for file_no, fname in enumerate(blob_paths, start=1):
    print(file_no, fname)
    # n_each_image is passed as the second argument of load_blobs_db
    # (presumably the number of blobs to sample per file -- verify in ccount).
    crops_ = load_blobs_db(fname, n_each_image)
    list_crops.append(crops_)
# Stack the per-file arrays along the first axis into one array.
crops = np.concatenate(list_crops, axis=0)
print('merged crops:', crops.shape)

In [None]:
# hist of blob diameter
# NOTE(review): the plotted values are column 2 of `crops` and are named r_,
# which suggests a radius rather than a diameter -- confirm.
w = int(sqrt(crops.shape[1] - 6))  # padding width & cropped img width/2
r_ = crops[:, 2]
fig, ax = plt.subplots()
ax.hist(r_, bins=40)
plt.show()

In [None]:
# Preview 5 random crops selected with label_filter=1.
show_rand_crops(
    crops=crops,
    label_filter=1,
    num_shown=5,
)

In [None]:
# Preview 5 random crops selected with label_filter=0.
show_rand_crops(
    crops=crops,
    label_filter=0,
    num_shown=5,
)

In [None]:
# Preview 5 random crops selected with label_filter=-2.
show_rand_crops(
    crops=crops,
    label_filter=-2,
    num_shown=5,
)

In [None]:
# Preview 10 random crops selected with label_filter="na"; the return value
# is bound to x so it is not echoed as the cell output.
x = show_rand_crops(
    crops=crops,
    label_filter="na",
    num_shown=10,
)

In [None]:
# Run the ccount summary helper on the full merged blob array, before any
# area-based filtering.
blobs_stat(crops)

# Area calculation

In [None]:
# calculation
# Split the merged array into its three parts, then compute one area value
# per image, passing the matching entry of Rs as `r` and keeping plotting off.
Images, Labels, Rs = parse_blobs(crops)
areas = [
    area_calculation(blob_image, r=Rs[blob_no], plotting=False)
    for blob_no, blob_image in enumerate(Images)
]

In [None]:
# hist
# Distribution of the computed blob areas, drawn with 40 bins.
fig, ax = plt.subplots()
ax.hist(areas, bins=40)
ax.set_title("Blob area in pixcels")
plt.show()

In [None]:
# small blobs
# Mask of blobs whose area is positive but strictly below min_area.
idx = [0 < blob_area < min_area for blob_area in areas]
small_blobs = crops[idx, :]
print("num small blobs", len(small_blobs))
_ = show_rand_crops(small_blobs, plot_area=False)

In [None]:
# good blobs kept for labeling and ccount
# Both bounds are inclusive. The small-blob mask uses `area < min_area` and
# the huge-blob mask uses `area > max_area`, so with strict comparisons here a
# blob whose area is exactly min_area or max_area would belong to none of the
# three groups and be silently discarded.
idx2 = [min_area <= blob_area <= max_area for blob_area in areas]
good_blobs = crops[idx2, :]
print("num good blobs", len(good_blobs))
_ = show_rand_crops(good_blobs, plot_area=False)

In [None]:
# huge blobs
# Mask of blobs whose area strictly exceeds max_area.
idx3 = [max_area < blob_area for blob_area in areas]
huge_blobs = crops[idx3, :]
print("num huge blobs", len(huge_blobs))
_ = show_rand_crops(huge_blobs, plot_area=False)

In [None]:
# Run blobs_stat on each size class in turn, each under its own heading.
for heading, subset in (
    ("too small", small_blobs),
    ("good:", good_blobs),
    ("too large:", huge_blobs),
):
    print(heading)
    blobs_stat(subset)
# The call on the full set stays the trailing expression of the cell.
print("total:")
blobs_stat(crops)

# PCA overview

In [None]:
# Encode the size class of each blob as one integer:
# too big as 1, too small as -1, everything else 0.
size_class = [
    int(is_huge) - int(is_small)
    for is_small, is_huge in zip(idx, idx3)
]
cluster_info = pd.DataFrame(size_class, columns=['BlobsKept'])
tsne_df = pca_tsne(
    pd.DataFrame(crops),
    cluster_info=cluster_info,
    title = "too big as 1, too small as -1",
)

In [None]:
# Save results as a whole (skipped for RAM issues)
n_good = good_blobs.shape[0]
# Number of too-small blobs mixed back in, as a fraction of the good blobs.
# Driven by small_blobs_percent_kept from the Params cell (0 = none, for best
# ML performance) instead of a hard-coded factor of 0.
n_small_kept = int(n_good * small_blobs_percent_kept)
small_blobs_sample = sub_sample(small_blobs, n_small_kept)
out_blobs = np.vstack((good_blobs, small_blobs_sample))

n_out = out_blobs.shape[0]
np.random.seed(1)  # fixed seed so the shuffle below is repeatable
out_blobs = out_blobs[np.random.choice(n_out, n_out, replace = False), :] # randomized
np.random.seed()  # re-seed so later random calls are not tied to seed 1

# save_blobs_db(out_blobs[0:13000, ], "blobs/" + name + ".npy")
#print("saved ", out_blobs.shape[0] , "blobs into: ", "blobs/" + name + ".npy")
print("Got ", out_blobs.shape[0] , "blobs")
print(n_good, "good blobs, ", n_small_kept, "too small blobs", 0, "too large blobs")

In [None]:
# Save all in 500 split
import string

bsize = 500  # number of blobs written to each output file
letters = string.ascii_lowercase  # one-letter file suffixes, a..z
# Only complete chunks are written, and never more files than letters
# (max to z). The count is derived from bsize rather than a repeated literal
# so that changing bsize keeps the chunking consistent.
m = min(out_blobs.shape[0] // bsize, len(letters))
print(m, "files")

for i in range(m):
    start = i * bsize
    stop = start + bsize
    oname = "split_blobs/" + name + "." + letters[i] + ".npy"
    print(oname, start, stop - 1)  # first and last row index of this chunk
    save_blobs_db(out_blobs[start:stop, :], oname)

In [None]:
# Show system memory statistics at the end of the run; the result of
# psutil.virtual_memory() is displayed as the cell output.
import psutil
psutil.virtual_memory()