In [172]:
import numpy as np
import glob
import cPickle as pickle
import matplotlib.pyplot as plt
import scipy as sci
from sklearn.decomposition import PCA

def file_to_mat(file_,sep_str=',',d_type=None,skip_first=False):
    """
    Converts .csv files to a list of its entries
    Inputs:
    file_ - the location of a .csv file
    sep_str - the separator between data points
    d_type - the datatype in the file
    skip_first - an option to skip the first component (e.g. if there's a menu)
    Returns:
    lines - a list of the lines in the file, each of which itself a list of all entries in the line
    """
    lines = [ln for ln in open(file_,'r')]
    start_=(1 if skip_first==True else 0) #skip first line if option selected
    def refine_line(ln,d_type=None,skip_first=False): #separates the data into its constituents
        splits = ln[:-1].split(sep_str)
        if d_type is None:
            return [ln_ for ln_ in splits[start_:]]
        if d_type=='int':
            return [np.nan if ln_=='' else int(ln_) for ln_ in splits[start_:]]
        if d_type=='float':
            return [np.nan if ln_=='' else float(ln_) for ln_ in splits[start_:]]
    d_type_arr,skip_arr=[],[]
    for i in range(len(lines[start_:])):
        d_type_arr.append(d_type)
        skip_arr.append(skip_first)
    lines = map(refine_line,lines[start_:],d_type_arr,skip_arr)
    return lines

def data_to_tads(data):
    """
    Takes split data read by file_to_mat and converts it into TAD boundaries
    Inputs:
    data - the data returned from file_to_mat
    Returns:
    tads - a list of TADs and their boundaries
    """
    iTAD,prev_chr=0,-1
    tads=[]
    for chr_,start,end in data:
        if chr_[3:]=='X':
            ichr=0
        else:
            ichr=int(chr_[3:])
        if ichr!=prev_chr:
            iTAD=1
        else:
            iTAD+=1
        prev_chr=ichr
        start_=int(start)
        end_=int(end)
        mid_=int((start_+end_)/2)
        tad=[ichr,iTAD,start_,end_,mid_]
        tads.append(tad)
    return tads

def binner(raw_data,bin_sz,method='sites',cuts=None):
    """
    Converts list of interction frequencies (from file_to_mat) into a binned list
    Inputs:
    raw_data - data from file_to_mat
    method - 'sites' or 'kb_size', specifies binning method
    cuts - if binning by genomic distance, a list of cutsites
    Returns:
    binned - a binned version of raw_data
    """
    if method=='sites': #calculate number of bins and indices depending on 
        nbins=int(np.ceil(float(len(raw_data))/bin_sz))
        bins_ind=[[a*bin_sz for a in range(nbins)],len(raw_data)]
    elif method=='kb_size':
        if cuts is None:
            print 'No distances provided'
            assert False
        bins_ind=[0]
        for cur_ind in range(len(cuts)):
            cut_loc=cuts[cur_ind]
            bin_tracker=len(bins_ind)
            if cut_loc>=bin_sz*bin_tracker:
                bins_ind.extend([cur_ind])
        if bins_ind[-1]!=len(raw_data):
            bins_ind.extend([len(raw_data)])
        nbins=len(bins_ind)-1
    binned=[]
    for i in range(nbins):
        row_ind=range(int(bins_ind[i]),int(bins_ind[i+1]))
        binned_row=np.zeros(nbins)
        for j in row_ind:
            row_=[]
            for k in range(nbins): 
                col_ind=range(int(bins_ind[k]),int(bins_ind[k+1]))
                sum_=0.
                for l in col_ind:
                    sum_+=float(raw_data[j][l])
                row_.extend([sum_])
            binned_row+=row_
        binned.append(binned_row)
    return binned

def pearson_corr_mat(bin_data):
    nbins=len(bin_data)
    cij=[]
    for row_i in bin_data:
        c_row=[]
        for jcol in range(len(bin_data)):
            col_j=[row_[jcol] for row_ in bin_data]
            p_cor,p_val=sci.stats.pearsonr(row_i,col_j)
            c_row.extend([p_cor])
        cij.append(c_row)
    return cij
    
#def corr_to_PCA(corr_data):
    
def seq_to_cutsites(seq_file,cut_seq):
    """
    Inputs:
    file_ - a text or FASTA sequence file
    cut_seq - the sequence of the cut site for a restriction enzyme
    Returns:
    seq - a string version of the DNA sequence
    cuts - an array of the location of the beginning of each cut site
    dists - the distance between each cutsite and teh next, or the end of the chromosome
    """
    lines = [ln for ln in open(seq_file,'r')]
    seq=''
    for ln in lines[1:]:
        seq+=ln[:-1]
    seq=seq.lower()
    index=0
    cuts=[]
    while index<len(seq):
        index=seq.find(cut_seq,index)
        if index==-1:
            break
        cuts.extend([index])
        index+=len(cut_seq)
    difs=[(cuts[i]-cuts[i-1]) for i in range(1,len(cuts))]
    dists=[cuts[0],difs,len(seq)-cuts[-1]]
    return seq,cuts,dists

def bedFile_to_npArray(file_):
    TAD_list=file_to_mat(file_,sep_str='\t',d_type=None,skip_first=False)
    TADs=[]
    for row_ in TAD_list:
        chr_end=row_[0][3:]
        nchr_=22 if chr_end=='X' else (int(chr_end)-1)
        start_,end_=int(row_[1]),int(row_[2])
        TADs.append([nchr_,start_,end_])
    TADs=np.array(TADs)
    return TADs

In [199]:
pearson=[]
for ichr in range(23):
    loc='pearson/chr'+(str(ichr+1) if ichr<22 else 'X')+'_pear'
    chr_=file_to_mat(file_=loc,sep_str=' ',d_type='float',skip_first=False)
    new_chr_=[]
    for j in range(len(chr_)):
        row_=chr_[j][:-1]
        new_chr_.append(row_)
    new_chr_=np.array(new_chr_)
    new_chr_[np.isnan(new_chr_)]=0
    pearson.append(new_chr_)
TADs_19=bedFile_to_npArray(file_='TADs_hg19.bed')
TADs_38=bedFile_to_npArray(file_='TADs_hg38.bed')
TADs=np.concatenate([TADs_19,np.zeros([len(TADs_19),1])],axis=-1)
tot_proj=[]
for jchr in range(len(pearson)):
    cur_chr=pearson[jchr]
    pca_= PCA(n_components=1)
    pca_.fit(cur_chr)
    proj_=np.squeeze(np.dot(cur_chr,pca_.components_.T))
    tot_proj.append(proj_)
    ind_loc=500000*np.arange(len(proj_))
    for tad_id in np.where(TADs[:,0]==jchr)[0]:
        tad_=TADs[tad_id]
        tad_[-1]=np.mean(np.interp(np.linspace(tad_[1],tad_[2],100),ind_loc,proj_))
tot_proj=np.array(tot_proj)
picks=[]
for kchr in range(23):
    chr_tad_locs=np.where(TADs[:,0]==kchr)[0]
    chr_tads=TADs[chr_tad_locs]
    dif = np.percentile(np.abs(chr_tads[:,-1]),30)
    ids_A,ids_B = np.where(chr_tads[:,-1]>dif)[0], np.where(chr_tads[:,-1]<-dif)[0]
    pickA = ids_A[np.array(np.linspace(0,len(ids_A)-1,5),int)]
    pickB = ids_B[np.array(np.linspace(0,len(ids_B)-1,5),int)]
    picks.extend(chr_tads[pickA])
    picks.extend(chr_tads[pickB])
rename_dic = {i:'chr'+str(i+1) if i<22 else 'chrX' for i in range(23)}
dic_19to38={str(tad19):[rename_dic[tad38[0]]]+list(tad38[1:]) for tad19,tad38 in zip(TADs_19,TADs_38)}
fid = open("selected_10TADs_hg38.csv",'w')
fid.write('chr,start,end,compartment_score\n')
for pick in picks:
    str_ = str(list(dic_19to38[str(np.array(pick[:-1],dtype=int))])+[pick[-1]])
    str_ = str_.replace(' ','').replace('[','').replace(']','').replace("'",'')+'\n'
    fid.write(str_)
fid.close()

{0: 'chr1',
 1: 'chr2',
 2: 'chr3',
 3: 'chr4',
 4: 'chr5',
 5: 'chr6',
 6: 'chr7',
 7: 'chr8',
 8: 'chr9',
 9: 'chr10',
 10: 'chr11',
 11: 'chr12',
 12: 'chr13',
 13: 'chr14',
 14: 'chr15',
 15: 'chr16',
 16: 'chr17',
 17: 'chr18',
 18: 'chr19',
 19: 'chr20',
 20: 'chr21',
 21: 'chr22',
 22: 'chrX'}

In [158]:
plt.plot(TADs[TADs[:,0]==20][:,-1],'o-')
plt.show()

In [157]:
plt.plot(tot_proj[20],'o-')
plt.show()

In [182]:
dic_19to38={str(tad19):tad38 for tad19,tad38 in zip(TADs_19,TADs_38)}

In [186]:
for pic in picksB[0]:
    print list(dic_19to38[str(np.array(pic[:-1],dtype=int))])+[pic[-1]]

[0, 13440946, 15120917, -5.4105199758819911]
[0, 80241727, 83881729, -8.0955134352013118]
[0, 158123586, 159123586, -9.5871860430596083]
[0, 198724248, 199564249, -5.0145333971969119]
[0, 246890075, 247370075, -6.4340678399404192]


In [174]:
TADs_19.shape,TADs_38.shape

((2250L, 3L), (2250L, 3L))

In [125]:
np.mean(np.interp(np.linspace(Tad_coords[0],Tad_coords[1],100),coords_pear_gen,proj_))

array([ 1.2,  1.3])

In [132]:
np.zeros(len(TADs)).shape

(2250L,)

In [117]:
TAD_list=file_to_mat(file_='TADs_hg19.bed',sep_str='\t',d_type=None,skip_first=False)
TADs=[]
for row_ in TAD_list:
    chr_end=row_[0][3:]
    nchr_=22 if chr_end=='X' else (int(chr_end)-1)
    start_,end_=int(row_[1]),int(row_[2])
    TADs.append([nchr_,start_,end_])

In [95]:
import glob
a=glob.glob('TADS*')
print a

['TADs_hg19.bed', 'TADs_hg38.bed']


In [88]:
pca = PCA(n_components=1)
pca.fit(mat)
print(pca.explained_variance_ratio_)
proj = np.squeeze(np.dot(mat,pca.components_.T))
dif = np.percentile(np.abs(proj),30)
ids_A,ids_B = np.where(proj>dif)[0], np.where(proj<-dif)[0]
pickA = ids_A[np.array(np.linspace(0,len(ids_A)-1,5),int)]
pickB = ids_B[np.array(np.linspace(0,len(ids_B)-1,5),int)]
np.diff(pickA)

[ 0.74011229]


array([ 53, 118, 196, 125], dtype=int64)

4.9466261216998051

In [89]:
plt.plot(np.squeeze(np.dot(mat,pca.components_.T)),'o-')
plt.show()

In [53]:
plt.imshow(mat,interpolation='nearest')
plt.show()

[ 0.74011229  0.13742805]


In [42]:
pca_chr=[]
for ichr in range(len(pearson)):
    pear_=pearson[ichr]
    chr_pca=mlab.PCA(pear_)
    print chr_pca[0]
    pca_chr.append(chr_pca)

LinAlgError: SVD did not converge

In [20]:
chr1_pear=file_to_mat(file_='pearson/chr1_pear',sep_str=' ',d_type='float',skip_first=False)

In [34]:
a=(1 if False else 2)
print a

2


In [15]:
import cPickle as pickle
import glob
loc='hIMR90/nij/'
files_=glob.glob(loc+'*')
for num_fl,file_ in enumerate(files_):
    chr_=np.array(file_to_mat(file_,sep_str='\t',d_type='float',skip_first=True))
    pickle.dump(chr_,open('IMR90_norm_chr'+str(num_fl+1)+'.pkl','wb'))

In [6]:
import matplotlib.pyplot as plt
raw=file_to_mat('hIMR90/nij/nij.chr5',sep_str='\t',d_type='float',skip_first=True)
plt.imshow(raw,interpolation='nearest')
plt.show()

In [8]:
raw = np.array(raw)

In [13]:
min_nonzero = np.min(raw[raw!=0])
raw_=raw+min_nonzero
plt.imshow(np.log(raw_),interpolation='nearest')
plt.show()

In [5]:
binned=binner(raw,bin_sz=50)
plt.imshow(binned,cmap='hot',interpolation='nearest')
plt.show()

TypeError: int() argument must be a string or a number, not 'list'

In [44]:
lines = [ln for ln in open('genome/chr1.fa','r')]
seq=''
for ln in lines[1200:2000]:
    seq+=ln[:-1]
seq=seq.lower()
index=0
cuts=[]
while index<len(seq):
    index=seq.find('aagctt',index)
    if index==-1:
        break
    cuts.extend([index])
    index+=len('aagctt')
print seq[:100]
print cuts
difs=[(cuts[i]-cuts[i-1]) for i in range(1,len(cuts))]
print difs

gtaattcagacattaattgcttttgttttggaattgctcttataagatgaaatatcactttcatgatgagagtcctagagtgcttggtttatatattgta
[15414, 23408, 24098, 30791, 31368, 31635, 32506, 37972]
[7994, 690, 6693, 577, 267, 871, 5466]
