In [26]:
#Dependencies
import numpy as np
import cPickle as pickle
import glob
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider, Button, RadioButtons
#Useful functions
def imshow3d(Im, axis=0, **kwargs):
    """
    Display a 3d ndarray with a slider to move along the 0th dimension.
    Extra keyword arguments are passed to imshow
    """
    im = np.array(Im)
    # generate figure
    f, ax = plt.subplots()
    f.subplots_adjust(left=0.25, bottom=0.25)
    # select first image
    s = [slice(0, 1) if i == axis else slice(None) for i in xrange(3)]
    im_ = im[s].squeeze()
    # display image
    l = ax.imshow(im_, **kwargs)
    l.set_clim(vmin=np.min(im),vmax=np.max(im))
    # define slider
    axcolor = 'lightgoldenrodyellow'
    ax = f.add_axes([0.25, 0.1, 0.65, 0.03], axisbg=axcolor)
    slider = Slider(ax, 'Axis %i index' % axis, 0, im.shape[axis] - 1,
                    valinit=0, valfmt='%i')
    def update(val):
        ind = int(slider.val)
        s = [slice(ind, ind + 1) if i == axis else slice(None)
                 for i in xrange(3)]
        im_ = im[s].squeeze()
        l.set_data(im_)
        f.canvas.draw()
    slider.on_changed(update)
    plt.show()

def chromosomes(nuc_dia=10000,pixel_sz=100,nchr=46,return_im=False,plt_val=False,cutoff=0.):
    """
    nuc_dia is the nuclear diameter in nm
    pixel_sz is in nm
    cutoff is the extent of inter-chromosome invasion in nm
    This assumes 23 or 46 chromosomes (nchr), including homologues
    
    Return list of chromosomes and pixels in their territory (x,y,z locations in unit radius)
    """
    #coordinates for 46 sphere centers
    #(equal size spheres in a unit sphere)
    #see:https://oeis.org/A084827/a084827.txt
    if nchr==23:
        centers=[[-0.090186232268855, -0.12719258042466, -0.707348165072374],
                 [0.38870328320077, 0.123439845780586, -0.598602787948567],
                 [-0.309170583212229, 0.37423843622416, -0.537598436535947],
                 [0.28371423154274, -0.462489029599172, -0.479857338821224],
                 [-0.569458339923172, -0.101071012738899, -0.436067408909527],
                 [-0.253827307361331, -0.551289599956929, -0.39535352616894],
                 [0.176894201455841, 0.586308932531292, -0.386785745293835],
                 [0.669516873175885, -0.164919942120003, -0.22181412232117],
                 [0.60918852030665, 0.373364422255918, -0.118917067887175],
                 [-0.269000949817199, 0.668540524640236, -0.073109449104701],
                 [-0.664012065947263, 0.28441188784869, -0.053392789438103],
                 [0.0, 0.0, 0.0],
                 [-0.593650302641085, -0.414670407675602, 0.016754307192129],
                 [0.434917326049123, -0.576009620033541, 0.060932526357327],
                 [-0.09787458412142, -0.709313014594744, 0.109309417466038],
                 [0.232664986830043, 0.668063108437104, 0.155600844104138],
                 [0.631764146516482, -0.135323377713092, 0.327436378946828],
                 [-0.554830942730164, 0.100304097116244, 0.454702813635502],
                 [-0.191032319228568, 0.514177446849921, 0.473055076440126],
                 [-0.332723553965536, -0.40207224871969, 0.502280677592942],
                 [0.216488806207223, -0.4259266153066, 0.544400634457405],
                 [0.373183313581025, 0.301461270612607, 0.542686145412344],
                 [-0.075730170612561, 0.034661741876543, 0.719525149349393]]
    elif nchr==46:
        centers=[[-0.127724638717686,0.029283782782012,-0.763670872459570], 
                [0.302116854275886,0.146601789724809,-0.698281876003332], 
                [0.050116071438789,-0.375084565347080,-0.676139240788969], 
                [0.387404648096449,-0.300279722464142,-0.600095035492607],
                [-0.221565702064757,0.438003368581342,-0.599521487098418],
                [-0.536838502467010,0.121012629438513,-0.545458207564384],
                [0.470578557122151,-0.324839673302964,-0.522876020618661],
                [0.206821475639773,0.544767478949537,-0.510703040725137],
                [0.647737208453552,0.075787428022586,-0.418398254359731],
                [0.209291510617636,-0.653452063989750,-0.359946924370349],
                [-0.240428762326608,-0.655246890184877,-0.336466711372591],
                [0.027563278735129,0.169874066797150,-0.337139524778479],
                [-0.531122333361574,0.491550397468556,-0.276860250786947],
                [-0.125040038594464,0.718782537235944,-0.260923317520113],
                [-0.028222635427186,-0.267579430698296,-0.245896798982907],
                [0.559897837805783,0.479367416697336,-0.238925962888257],
                [-0.609344934400770,-0.421155893776354,-0.227356083644822],
                [-0.755792906627536,0.000918343779410,-0.170705973387576],
                [0.709453517788630,-0.276107684781292,-0.144237918782831],
                [0.338406350902039,-0.029318746498438,-0.079260341210368],
                [0.256184770042010,0.730938689442354,-0.021501641508632],
                [-0.268046158037773,0.223179830668424,-0.001615424109930],
                [0.463839024087979,-0.620577043697123,0.010090454994701],
                [0.761425580114896,0.142996856131315,0.012137124700828],
                [0.041055031342583,-0.772687639260906,0.040405708106847],
                [-0.343201070932800,-0.214763803705687,0.071596445689072],
                [-0.392969757022585,-0.662069840802751,0.087193008193199],
                [-0.377886422912343,0.667723934061050,0.108217022567140],
                [-0.686352373667351,0.339757482368351,0.117684310970756],
                [0.150619047600183,0.321066162828993,0.132327016008240],
                [0.137964450619487,-0.350718167453077,0.164313718413543],
                [0.559387984377712,0.492787670746059,0.211210130456054],
                [-0.717576062734593,-0.078536494382680,0.281568709115817],
                [0.643403410008865,-0.310581345960640,0.299892559603968],
                [0.002276767510746,0.692083481917933,0.348395549284496],
                [-0.069193117297735,-0.000826838519097,0.357871631431749],
                [-0.074584688024342,-0.626168415760149,0.450238341469810],
                [0.622296753862575,0.114447785021264,0.447227819362128],
                [-0.471682318226388,-0.413806749821993,0.454581223971127],
                [-0.434951569989064,0.423001400164857,0.481924550044267],
                [0.305007962363991,-0.417373667885278,0.577177346197253],
                [0.295340191120549,0.432541638100190,0.571004577504622],
                [-0.446844519125231,-0.001070128504388,0.633003282191707],
                [-0.094303907267779,-0.267030401770297,0.721225250748454],
                [0.281485705865138,0.008444506916010,0.721844036069634],
                [-0.091709170433872,0.260484226789782,0.723948700091940]]
    else:
        assert False

    centers = np.array(centers)
    centers = centers[np.random.permutation(len(centers))]
    arr_size = nuc_dia/pixel_sz #division casts as int
    x_ = np.linspace(-1,1,arr_size) #pixel locations in unit radius
    chrters = [[] for i in range(nchr)]
    cutoff_unit_radius = float(cutoff)/(nuc_dia/2.) #convert invasion length to unit radius
    for x in x_:
        for y in x_:
            for z in x_:
                #test if in sphere
                if x*x+y*y+z*z<=1:
                    dists = np.sqrt(np.sum((centers-[[x,y,z]])**2,axis=1)) #distances to all sphere centers
                    dif_2dist = dists-np.min(dists) #how much farther than the closest one?
                    for ichr,dif_dist in enumerate(dif_2dist):
                        if dif_dist<=cutoff_unit_radius:
                            chrters[ichr].append([x,y,z]) #assign pixel to closest chr. and to others within cutoff dist.
    im = np.zeros([arr_size]*3)
    for i,chr_ in enumerate(chrters):
        for x,y,z in (np.array(chr_)+1)*(arr_size-1)/2: #convert to pixel number (1...arr_size)
            if im[int(np.round(x)),int(np.round(y)),int(np.round(z))]>0:
                im[int(np.round(x)),int(np.round(y)),int(np.round(z))]=i+1+len(chrters)
            else:
                im[int(np.round(x)),int(np.round(y)),int(np.round(z))]=i+1
    if plt_val:        
        imshow3d(im,interpolation='nearest')
    if return_im:
        return chrters,im
    return chrters

def TAD_blur(xyzPos,pix_sz=100,nuc_dia=10000): ###add random 3D Gaussian to pixelized TAD location
    perturb=np.random.normal(0,pix_sz/2./(nuc_dia/2.),3)
    return perturb+xyzPos

def TAD_generator(xyzChr,noTADs=100,udist=-0.44276236166846844,sigmadist=0.57416477624326434,nuc_dia=10000,pix_sz=100):
    """
    xyzChr is a list of positions belonging to a chromosome territory
    noTADs is the number of TADs in the chromosome
    udist, sigmadist are the lognormal mean and variance of the distance distribution from consecutive TADs, calculated from
    actual data on chr 21 and 22 (Steven's published data), in units of log(um)
    nuc_dia, pix_sz are the nuclear diamater and pixel size in nm (see above)
    Returns an array of dimensions noTADSx3, representing the 3D location of all TADs in a chromosome
    """
    xyzChr_=np.array(xyzChr)
    tads=[]
    first=xyzChr_[np.random.randint(len(xyzChr))] #randomly choose location of first TAD
    first=TAD_blur(first) #blur so effective resolution is better than pixel
    tads.append(first)
    for i_tad in range(noTADs-1): #sequentially add TADs at distance defined by lognormal distribution
        difs=xyzChr_-[tads[i_tad]]#unit radius
        dists=np.sqrt(np.sum(difs**2,axis=-1))
        dists=np.log(dists*nuc_dia/2000.)#unit log um
        weights = np.exp(-(dists-udist)**2/(2*sigmadist**2))
        weights = np.cumsum(weights)
        weights = weights/float(np.max(weights))
        index_pj = np.sum(np.random.rand()-weights>0)
        pj=xyzChr_[index_pj]#unit radius
        pj=TAD_blur(pj)
        tads.append(pj)
    return np.array(tads)

In [29]:
#Simulate multiple single cells and save to file
import cPickle as pickle
cutoffs=np.linspace(0,500,6)
reals=20 #number of realizations
tads=100 #number of TADs per chromosome

for num_chr_ in (23,46):
    print '#Chr:'+str(num_chr_)
    for cutoff_ in cutoffs:
        chrters=chromosomes(nuc_dia=10000,pixel_sz=100,nchr=num_chr_,plt_val=False,return_im=False,cutoff=cutoff_)
        real_matrix=[]
        print 'Cutoff:'+str(cutoff_)
        for i_rel in range(reals):
            single_cell=[]
            print 'Cell:'+str(i_rel)
            for chrter in chrters:
                tads_=TAD_generator(chrter,tads)
                single_cell.append(tads_)
            real_matrix.append(single_cell)
        real_matrix=np.array(real_matrix)
        pickle.dump(real_matrix,open('simulatedTads_'+str(num_chr_)+'_cut'+str(int(cutoff_))+'.pkl','wb'))

#Chr:23
Cutoff:0.0
Cell:0
Cell:1
Cell:2
Cell:3
Cell:4
Cell:5
Cell:6
Cell:7
Cell:8
Cell:9
Cell:10
Cell:11
Cell:12
Cell:13
Cell:14
Cell:15
Cell:16
Cell:17
Cell:18
Cell:19
Cutoff:100.0
Cell:0
Cell:1
Cell:2
Cell:3
Cell:4
Cell:5
Cell:6
Cell:7
Cell:8
Cell:9
Cell:10
Cell:11
Cell:12
Cell:13
Cell:14
Cell:15
Cell:16
Cell:17
Cell:18
Cell:19
Cutoff:200.0
Cell:0
Cell:1
Cell:2
Cell:3
Cell:4
Cell:5
Cell:6
Cell:7
Cell:8
Cell:9
Cell:10
Cell:11
Cell:12
Cell:13
Cell:14
Cell:15
Cell:16
Cell:17
Cell:18
Cell:19
Cutoff:300.0
Cell:0
Cell:1
Cell:2
Cell:3
Cell:4
Cell:5
Cell:6
Cell:7
Cell:8
Cell:9
Cell:10
Cell:11
Cell:12
Cell:13
Cell:14
Cell:15
Cell:16
Cell:17
Cell:18
Cell:19
Cutoff:400.0
Cell:0
Cell:1
Cell:2
Cell:3
Cell:4
Cell:5
Cell:6
Cell:7
Cell:8
Cell:9
Cell:10
Cell:11
Cell:12
Cell:13
Cell:14
Cell:15
Cell:16
Cell:17
Cell:18
Cell:19
Cutoff:500.0
Cell:0
Cell:1
Cell:2
Cell:3
Cell:4
Cell:5
Cell:6
Cell:7
Cell:8
Cell:9
Cell:10
Cell:11
Cell:12
Cell:13
Cell:14
Cell:15
Cell:16
Cell:17
Cell:18
Cell:19
#Chr:46
Cutoff:0

In [126]:
#Load multiple single cells from file

import cPickle as pickle
real_matrix = pickle.load(open('simulatedTads23.pkl','r'))
real_matrix = real_matrix*5000 #Transform to nm

In [233]:
#plot the Tads in a few chromosomes

import matplotlib.pylab as plt
for i,chr_ in enumerate(real_matrix[0]):
    x,y,z = chr_.T
    #plt.text()
    plt.plot(x,y,'wo')
for i,chr_ in enumerate(real_matrix[0]):
    x,y,z = chr_.T
    if i>5:
        break
    #plt.text()
    plt.plot(x,y,'o')
plt.axis('equal')
plt.show()

In [106]:
#Encoder - construct a matrix hybes of length number of hybes x number of chromosomes 
#each containing the id of the tad in the hybe (0 means the TAD is missing from that hybe)
import itertools
import numpy as np
def combs_to_code(combs_eq_sp,nchr=None):
    """Construct from combs list to code binary array
    For example changes:
    combs_eq_sp = [(0,1,2,3),(0,1,2,4)] to codes = [[1,1,1,1,0],[1,1,1,0,1]]
    """
    if nchr is None:
        nchr = np.max(combs_eq_sp)+1
    codes = np.zeros([len(combs_eq_sp),nchr],dtype=int)
    for i,comb in enumerate(combs_eq_sp):
        codes[i][list(comb)] = 1
    return codes
def test_code(codes):
    """If chromosme i apears in a subset of hybes. Check to see that no other chromosomes appears in the same set."""
    nchr = codes.shape[-1]
    print "No. of tads:"
    print np.unique(np.sum(codes,axis=0))
    print "No. of chrms labeled/hybe:"
    print np.unique(np.sum(codes,axis=1)),np.mean(np.sum(codes,axis=1)),np.std(np.sum(codes,axis=1))
    unique_encoding = np.prod([np.sum(np.prod(codes[codes[:,ichr]==1,:],axis=0))==1 for ichr in range(nchr)])==1
    return unique_encoding
def patch_code(codes,target):
    nchr = codes.shape[-1]
    for ichr in range(nchr):
        code = codes[:,ichr]
        n1s = np.sum(code)
        ndel1s = n1s-target
        if ndel1s>0:
            pos1s = np.where(code)[0]
            del_pos = np.random.choice(pos1s,size=ndel1s,replace=False)
            code[del_pos]=0
        elif ndel1s<0:
            pos0s = np.where(code==0)[0]
            del_pos = np.random.choice(pos0s,size=np.abs(ndel1s),replace=False)
            code[del_pos]=1
    return codes
def code_encoder(nchr=23,ntads=100,nlabel_=2,no_hom=1):
    """Master function for the encoder
    nchr is the number of *unique* - i.e. non-homologous - chromosomes
    no_hom is the number of homologous chromosomes
    nlabel is the numbr of TADs labeled in each hybe
    #Interpretation of codes: codes is number of hybe x number of chromosomes and indicates which chr is present in each hybe

    #Interpretation of hybes: hybes is number of hybe x number of chromosomes and indicates which TAD is present in each hybe
    #                         0 means chromose not appearing and if not 0 then it encodes which TAD from the chr appears
    Return hybes
    
    ###Example use:
    hybes = code_encoder(nchr=23,ntads=100,nlabel_=10)
    """
    combs = list(itertools.combinations(range(nchr),nlabel_))
    nhybes = int(float(nchr)*ntads/nlabel_)+1
    inds = np.array(np.round(np.linspace(0,len(combs)-1,nhybes)),dtype=int)
    combs_eq_sp = [combs[ind] for ind in inds]
    codes = combs_to_code(combs_eq_sp)
    codes = patch_code(codes,target=ntads)
    assert test_code(codes)
    hybes=np.cumsum(codes,axis=0)*codes
    hybes=np.concatenate([hybes.T]*no_hom).T
    return hybes

In [105]:
def simulated_imdata(hybes,cell,err_rate=0.032504222398951552):
    """
    Inputs:
    hybes is the encoding(see above)
    cell is ground truth for single cell (no_of_chr x no_of_TADS x 3)
    err_rate is the rate at which a TAD is missed (averaged over 4 chromosomes from Steven's published data)
    Returns:
    hybes_points
    a list of dim no_of_hybes with lists of x,y,z points - the simulated imaging data
    tot_ground_truth is the chromosome identity of all imaged points per hybe
    """
    hybes_points,tot_ground_truth=[],[]
    for hybe in hybes:
        chrs_in_hybe = np.where(hybe>0)[0]
        tad_ids_in_hybe = hybe[hybe>0]-1
        hybe_points,ground_truth=[],[]
        for chr_in_hybe,tad_in_hybe in zip(chrs_in_hybe,tad_ids_in_hybe):
            if np.random.rand()>err_rate: #probability of missing a TAD in imaging
                hybe_points.append(cell[chr_in_hybe][tad_in_hybe])
                ground_truth.append(chr_in_hybe)
        hybes_points.append(hybe_points)
        tot_ground_truth.append(ground_truth)
    hybes_points = map(np.array,hybes_points)
    return hybes_points,tot_ground_truth

In [104]:
#Decoder - Given hybes_points and hybes predict chr id
def decoder(hybes_points,hybes,tot_ground_truth,no_hom=1,n_chr=23):
    #What chromosomes appear in which hybe
    possible_chrs_hybes=[]
    for hybe in hybes:
        possible_chrs_hybes.append(np.where(hybe>0)[0])
    ##
    goods,bads=0,0
    chromosome_ids_all = []
    #Iterate through all the points in the hybes. The current hybe I call it ref hybe
    for id_ref in range(len(hybes_points)):
        ###Given id_ref hybe compute the projection space
        hybes_points_ref = hybes_points[id_ref]
        
        possible_chrs = possible_chrs_hybes[id_ref]#np.where(hybes[id_ref]>0)[0]
        
        #compute possible projections: possibble chromosome x numbe of hybes - binary
        possible_projections = np.zeros([len(possible_chrs),len(possible_chrs_hybes)],dtype=int)
        for i,chr_T in enumerate(possible_chrs):
            for j,possible_chrs_hybe in enumerate(possible_chrs_hybes):
                possible_projections[i,j]=chr_T in possible_chrs_hybe
        sum_proj = np.array([np.sum(possible_projections,axis=1)])
        #sum_proj[sum_proj==0]=1
        possible_projections_ = possible_projections*1./sum_proj.T #the normalized projection space
        
        ###Compute 
        projections_point = []
        for point in hybes_points_ref:
            min_L1_dists=[]#distances to nearest neighbors across hybes for point
            for hybe_point in hybes_points:
                difs = point - hybe_point
                #min_L1_dist = np.min(np.sum(np.abs(difs),axis=-1))
                min_L1_dist = np.min(np.sqrt(np.sum(difs**2,axis=-1)))
                min_L1_dists.append(min_L1_dist)
            min_L1_dists = np.array(min_L1_dists)#nearest neighbour distance across hybes for point in reference hybe

            projection = np.dot(possible_projections_,min_L1_dists)
            projections_point.append(projection)
        ##After computing a no of candidate chromosomes x no of points weight matrix projections_point
        ## Decide on best assigment.
        def conf_proj(projection):
            #given a projection compute the "confidence" for it as the difference between the two smalles distance weights.
            unk = np.unique(projection)#this also sorts
            if len(unk)<2:
                return unk[0]
            else:
                return unk[1]-unk[0]
        
        projections_point_=np.array(projections_point)
        point_ids = np.arange(projections_point_.shape[0])
        chr_ids = np.arange(projections_point_.shape[1])

        chr_picks=[]
        while projections_point_.shape[0]>0:
            confs = map(conf_proj,projections_point_)# list of confidence for the remaining points across the remaining chrs.
            point_ind = np.argmax(confs)# the id of the point with the highest confidence
            chr_ind = np.argmin(projections_point_[point_ind])  # the id ot the chromosome assiged to the most confident point
            chr_picks.append((point_ids[point_ind],chr_ids[chr_ind])) #keep above pair

            point_ind_keep = np.setdiff1d(np.arange(projections_point_.shape[0]),[point_ind])
            chr_ind_keep = np.setdiff1d(np.arange(projections_point_.shape[1]),[chr_ind])
            point_ids = point_ids[point_ind_keep]
            chr_ids = chr_ids[chr_ind_keep]
            projections_point_ = projections_point_[point_ind_keep,:]
            projections_point_ = projections_point_[:,chr_ind_keep] #killing rows and columns
        
        points_identities,chr_identities = zip(*chr_picks)
        #chr_identities goes from 0 to number of chromosomes is ref hybe in maximum confidence order
        chromosome_ids0 = np.arange(len(points_identities))
        chromosome_ids0[np.array(points_identities)]=np.array(chr_identities)
        chromosome_ids = possible_chrs[chromosome_ids0]%n_chr
        chromosome_ids_all.append(chromosome_ids)
        #chromosome_ids is chromosome prediction (0-22) in order of the points in ref hybe.
        #Compare to ground truth calculated during simulation of imaging data.
        non_deg_poss=np.array(tot_ground_truth[id_ref])%n_chr
        good = np.sum(non_deg_poss==chromosome_ids) #up to degeneracy due to homologous chromosomes
        bad = np.sum(non_deg_poss!=chromosome_ids)
        goods+=good
        bads+=bad
                      
    return goods,bads,chromosome_ids_all

In [None]:
import numpy as np

"""" 
Tries to tell appart homologous chromosomes based on physical distance
""""
def separate_homologues(chromosome_ids_all,hybes_points,n_chr=23,no_hom=2):
    for chr_ in range(n_chr):
        #unassigned_hybes=np.arange(len(hybes_points))
        baskets=[] #baskets for each homologous chromosome
        for hybe_ in range(len(hybes_points))
            indx_=np.where(chromosome_ids_all[hybe_]==chr_)
            points_=hybes_points[hybe_][indx_]
            num_=len(indx_)
            for i,pt in enumerate(indx_):
                dists_=
            #unassigned_hybes=np.setdiff1d(unassigned_hybes,hybe_)
    return chromosome_ids_all

In [183]:
reload(ts)
num_chr =23
num_hom=2
split_dic ={}
for chr_id in range(num_chr):
    split_dic[chr_id]=map(list,[[]]*num_hom)
im_data_enhanced = map(list,im_data)
for ihybe in range(len(im_data_enhanced)):
    im_data_enhanced_hybe = im_data_enhanced[ihybe]
    for ipoint in range(len(im_data_enhanced_hybe)):
        point = im_data_enhanced_hybe[ipoint]
        point = list(point)+[ihybe,ipoint]
        im_data_enhanced_hybe[ipoint] = point
    im_data_enhanced[ihybe] = im_data_enhanced_hybe
#iterate through chromosomes.
pts_chrs,chr_hybes = [],[]
for pts_hybe,chr_hybe in zip(im_data_enhanced,interp):
    pts_chr_ = ts.partition_map(pts_hybe,chr_hybe)
    chr_hybe_ = np.unique(chr_hybe)
    pts_chrs.extend(pts_chr_)
    chr_hybes.extend(chr_hybe_)
pts_partitioned = ts.partition_map(pts_chrs,chr_hybes)
chrs_ids = np.unique(chr_hybes)

#iterate through chromosome ids
chrs_id = chrs_ids[0]
pts_partitioned_ = pts_partitioned[0]

id_hybe_start = np.argsort(map(len,pts_partitioned_))[-1]
pts_start = pts_partitioned_[id_hybe_start]
assert len(pts_start)==num_hom
split_chr = split_dic[chrs_id]
for ipt,pt in enumerate(pts_start):
    split_chr[ipt].append(pt)

pts_tobeasigned = list(pts_partitioned_)
pts_tobeasigned.pop(id_hybe_start)

chr_estim = [[list(val)+[ival]for ival,val in enumerate(pts_partitioned_[id_hybe_start])]]

for pts_hybe in pts_tobeasigned:
    mean_dists_hybe = []
    for pt in pts_hybe:
        mean_dists = [np.mean([np.sqrt(np.sum((pt[:3]-pt_t[:3])**2)) for pt_t in split_]) for split_ in split_chr]
        #mean_dists has num of homolog dists
        mean_dists_hybe.append(mean_dists)
    picks = ts.unique_classif(mean_dists_hybe)
    chr_estim.append([list(pts_hybe[pick[0]][:])+[pick[1]] for pick in picks])

#Return the sames as interp(chromosme_id_all) but dealt with degeneracy


In [160]:
pts_chr0 = np.array(ts.flatten(pts_partitioned_))[:,:3]
x,y,z = pts_chr0.T
plt.plot(x,z,'o')
plt.show()

In [187]:
import matplotlib.pyplot as plt
flat_ = np.array(ts.flatten(chr_estim))
for i in [0,1]:
    x,y,z = flat_[flat_[:,-1]==i,:3].T
    plt.plot(x,z,'o')
plt.show()

In [181]:
#separate points to hom
reload(ts)
chr_estim_ = list(chr_estim)

for i_loop in range(2):
    flat_ = np.array(ts.flatten(chr_estim_))
    for i in [0,1]:
        x,y,z = flat_[flat_[:,-1]==i,:3].T
        plt.plot(x,z,'o')
    plt.show()
    hom_buckets = map(list,[[]]*num_hom)
    for pt in ts.flatten(chr_estim_):
        hom_buckets[pt[-1]].append(pt[:3])
    hom_buckets = map(np.array,hom_buckets)

    counter_flip = 0
    for ihyb,pick in enumerate(chr_estim_):
        
        num_pts = len(pick)
        possibilities = np.array(list(itertools.combinations(range(num_hom),num_pts)))
        current_possib = np.array([elem[-1] for elem in pick])
        weight_posib = []
        for posib in possibilities:
            means_of_means = []
            for ipick,ipos in enumerate(posib):
                mean_dist = np.mean(np.sqrt(np.sum(([pick[ipick][:3]]-hom_buckets[ipos])**2,axis=-1)))
                means_of_means.append(mean_dist)
            means_of_means = np.mean(means_of_means)
            weight_posib.append(means_of_means)
        best_posib = possibilities[np.argmin(weight_posib)]
        #compare with current possib
        counter_flip+=np.prod(current_possib==best_posib)

        #assign best posib
        for ipick,ipos in enumerate(best_posib):
            pick[ipick][-1]=ipos
        chr_estim_[ihyb]=pick

    print counter_flip 
    
    

63
100


In [121]:
import itertools


array([[0, 1],
       [1, 0]])

In [89]:
arr_ = [1,2,3,4]
arr_[:3]

[1, 2, 3]

In [76]:
def conf(pair,list_split):
    for 

[array([0.015508228453001233, -0.48560677606662095, -0.85686744358991784,
        99, 12], dtype=object),
 array([0.012061786974621725, -0.48278758745713457, 0.64646944012208563, 99,
        23], dtype=object)]

array([62, 97, 76, 46, 80,  9, 36, 70, 69, 68, 67, 66, 65, 64,  0, 71, 61,
       60, 59, 58, 57, 56, 55, 54, 53, 63, 72, 74, 52, 96, 95, 94, 93, 92,
       91, 90, 89, 88, 73, 87, 85, 84, 83, 82, 81, 79, 78, 77, 75, 86, 51,
       49, 98, 21, 20, 19, 18, 17, 16, 15, 14, 13, 22, 12, 10,  8,  7,  6,
        5,  4,  3,  2,  1, 11, 23, 24, 25, 48, 47, 45, 44, 43, 42, 41, 40,
       39, 38, 37, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 50, 99], dtype=int64)

In [None]:
#iterate through hybes
pts_hybe = im_data[0]
chr_hybe = interp[0]
#ts.partition_map()
pts_chr_ = ts.partition_map(pts_hybe,chr_hybe)
chr_hybe_ = np.unique(chr_hybe)
#iterate through pairs of points
pts_pair = pts_chr_[0]
chr_id = chr_hybe_[0]

buckets = split_dic[chr_id]
pts_pair,chr_id,buckets
if len(buckets)

In [26]:

#unassigned_hybes=np.arange(len(hybes_points))
baskets=[] #baskets for each homologous chromosome
for hybe_ in range(len(hybes_points))
    indx_=np.where(chromosome_ids_all[hybe_]==chr_)
    points_=hybes_points[hybe_][indx_]
    num_=len(indx_)
    for i,pt in enumerate(indx_):
        dists_=
    #unassigned_hybes=np.setdiff1d(unassigned_hybes,hybe_)

AttributeError: 'tuple' object has no attribute 'size'

In [29]:
import TAD_SIM as ts

In [176]:
reload(ts)
hybes=ts.code_encoder(nchr=23,ntads=100,nlabel_=16,no_hom=2)
chrters=ts.chromosomes(nuc_dia=10000,pixel_sz=100,nchr=46,plt_val=False,return_im=False,cutoff=0.)
cell=[]
for chrter in chrters:
    tads_=ts.TAD_generator(chrter,noTADs=100)
    cell.append(tads_)
cell=np.array(cell)
im_data,truth=ts.simulated_imdata(hybes,cell)
corr,incorr,interp=ts.decoder(im_data,hybes,truth,no_hom=2,n_chr=23)

No. of tads:
[100]
No. of chrms labeled/hybe:
[14 15 16 17 18] 15.9722222222 0.600282855138


In [37]:
np.unique(interp[0],return_counts=True)

(array([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15], dtype=int64),
 array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2], dtype=int64))

In [108]:
import cPickle as pickle
pickle.dump([corr,incorr,interp,hybes,cell,im_data,truth],open('trial_for_separator.pkl','wb'))

In [272]:
#Given hybes(encoder matrix) and cell(truth positions for single cell) simulate hybe data
import time
for nlabel_ in [10,12,14,16,18,20]:
    start=time.time()
    print "Label: "+str(nlabel_)
    hybes = code_encoder(nchr=23,ntads=100,nlabel_=nlabel_)
    
    goods_real,bads_real=[],[]
    for icell,cell in enumerate(real_matrix[:10]):
        print "Cell:"+str(icell)
        hybes_points = simulated_imdata(hybes,cell)
        goods,bads,chromosome_ids_all = decoder(hybes_points,hybes)
        goods_real.append(goods)
        bads_real.append(bads)
    #pickle.dump([goods_real,bads_real],open('decoded_'+str(nlabel_)+'.pkl','wb'))
    print "Elapsed time: "+str(time.time()-start)

Label: 10
No. of tads:
[100]
No. of chrms labeled/hybe:
[ 8  9 10 11 12] 9.95670995671 0.636437254628
Cell:0
Cell:1


KeyboardInterrupt: 

In [152]:
goods,bads = pickle.load(open('decoded_16.pkl','rb'))
np.mean(bads)/2300.*100.

0.82173913043478253

In [162]:
import glob
files = glob.glob('decoded*.pkl')
erros_mean,erros_std=[],[]
nlabels=[]
for file_ in files:
    goods,bads = pickle.load(open(file_,'rb'))
    goods = np.array(goods,dtype=float)
    bads = np.array(bads,dtype=float)
    ratios = bads/(goods+bads)
    nlabels.append(int(file_.split('_')[-1].replace('.pkl','')))
    erros_mean.append(np.mean(ratios))
    erros_std.append(np.std(ratios))

In [175]:
x,y=np.log(np.array(nlabels)/23.),np.log(erros_mean)
A = np.vstack([x, np.ones(len(x))]).T
m, c = np.linalg.lstsq(A, y)[0]
#error=exp(c)*n^m
m,c

(6.6878810599782703, -1.9817292501066128)

In [178]:
plt.errorbar(nlabels, erros_mean, yerr=erros_std, fmt='-o')
plt.show()

In [142]:
#Distances between points and how close they are on average to tads of same chromosome
cell = real_matrix[9]
wrong = []
for cell in real_matrix:
    chr_id = 10
    ref_chr = cell[chr_id]
    estimator = []
    for ref_point in ref_chr:
        dist_ref = np.sqrt(np.sum((ref_chr - [ref_point])**2,axis=-1))
        chr_dists=[]
        for chr_ in cell:
            dist = np.sqrt(np.sum((chr_ - [ref_point])**2,axis=-1))
            chr_dists.append(np.mean(dist))
        estimator.append(np.argmin(chr_dists))
    wrong.append(np.sum(np.array(estimator) != chr_id))

In [143]:
np.mean(wrong)

8.3300000000000001

In [114]:
ntads/0.66

151.5151515151515

In [60]:
import glob
import numpy as np
files = glob.glob('*.csv')
#files= ['chr21.csv', 'chr22.csv']
#file_ =files[0]
def file_to_mat(file_):
    lines = [ln for ln in open(file_,'r')]
    def refine_line(ln):
        splits = ln[:-1].split(',')
        return [np.nan if ln_=='' else float(ln_)for ln_ in splits]
    lines = map(refine_line,lines[1:])
    return np.array(lines)
def data_to_dists(data):
    icell_prev=np.nan
    iTAD_prev=np.nan
    dists = []
    for icell,iTAD,x,y,z in data:
        if icell_prev==icell:
            xyz = np.array([x,y,z])
            dist = np.sqrt(np.sum((xyz-xyz_prev)**2))
            dists.append(dist)
        icell_prev=icell
        xyz_prev = np.array([x,y,z])
    dists = np.array(dists)
    dists = dists[np.isnan(dists)==False]
    return dists
def count_missed(data):
    counter=0
    z=data[:,-1]
    print len(z)
    for i in range(len(z)):
        if np.isnan(z[i])==True:
            counter=counter+1
    return float(float(counter)/float(len(z)))

missed_per=[]
for j in range(len(files)):
    missed_per.append(count_missed(file_to_mat(files[j])))
#dists0 = data_to_dists(file_to_mat(files[0]))
#dists1 = data_to_dists(file_to_mat(files[1]))
#dists = np.concatenate([dists0,dists1])


3330
4080
4077
3800


In [66]:
np.mean(missed_per)

0.032504222398951552

In [43]:
import matplotlib.pylab as plt
#plt.hist(np.log(dists0),bins=30,alpha=0.7)
#plt.hist(np.log(dists1),bins=30,alpha=0.7)
#plt.hist(dists0,bins=30,alpha=0.7)
#plt.hist(dists1,bins=30,alpha=0.7)
plt.hist(dists,bins=40)
plt.show()

In [49]:
np.std(dists)

0.45172828742659688

In [48]:
np.mean(dists)

0.75397658663865952