In [2]:
import numpy as np
import glob

def file_to_mat(file_,sep_str=',',d_type=None,skip_first=False):
    """
    Converts .csv files to a list of its entries
    Inputs:
    file_ - the location of a .csv file
    Returns:
    lines - a list of the lines in the file, each of which itself a list of all entries in the line
    """
    lines = [ln for ln in open(file_,'r')]
    def refine_line(ln,d_type=None,skip_first=False): ###separates the data into its constituents
        splits = ln[:-1].split(sep_str)
        start_=(1 if skip_first else 0)
        if d_type is None:
            return [ln_ for ln_ in splits[start_:]]
        if d_type=='int':
            return [np.nan if ln_=='' else int(ln_) for ln_ in splits[start_:]]
        if d_type=='float':
            return [np.nan if ln_=='' else float(ln_) for ln_ in splits[start_:]]
    d_type_arr,skip_arr=[],[]
    for i in range(len(lines[1:])):
        d_type_arr.append(d_type)
        skip_arr.append(skip_first)
    lines = map(refine_line,lines[1:],d_type_arr,skip_arr)
    return lines

def data_to_tads(data):
    """
    Takes split data read by file_to_mat and converts it into TAD boundaries
    Inputs:
    data - the data returned from file_to_mat
    Returns:
    tads - a list of TADs and their boundaries
    """
    iTAD,prev_chr=0,-1
    tads=[]
    for chr_,start,end in data:
        if chr_[3:]=='X':
            ichr=0
        else:
            ichr=int(chr_[3:])
        if ichr!=prev_chr:
            iTAD=1
        else:
            iTAD+=1
        prev_chr=ichr
        start_=int(start)
        end_=int(end)
        mid_=int((start_+end_)/2)
        tad=[ichr,iTAD,start_,end_,mid_]
        tads.append(tad)
    return tads

def binner(raw_data,bin_sz):
    binned=[]
    for i in range(len(raw_data)/bin_sz):
        row_ind=range(i*bin_sz,(i+1)*bin_sz)
        binned_row=[]
        for j in row_ind:
            row_=[]
            for k in range(len(raw_data)/bin_sz): 
                col_ind=range(k*bin_sz,(k+1)*bin_sz)
                bin_=np.array(raw_data[j][col_ind])
                sum_=np.sum(bin_)
                row_.extend([sum_])
            binned_row.append([row_])
            binned_row=list(np.sum(binned_row,axis=0))
        binned.append(binned_row)
    return np.array(binned)

In [68]:
import cPickle as pickle
file_='S3_TADs.csv'
mat=file_to_mat(file_)
tad_list=data_to_tads(mat)
pickle.dump(tad_list,open('IMR90_TADs.pkl','wb'))

TypeError: file_to_mat() takes at least 2 arguments (1 given)

In [6]:
import cPickle as pickle
import glob
loc='../../../../../Documents/IMR90.norm/hIMR90/nij/'
files_=glob.glob(loc+'/*')
for num_fl,file_ in enumerate(files_):
    chr_=np.array(file_to_mat(file_,sep_str='\t',d_type='float',skip_first=True))
    pickle.dump(chr_,open(loc+'IMR90_norm_chr'+str(num_fl)+'.pkl','wb'))

KeyboardInterrupt: 

In [3]:
import matplotlib.pyplot as plt
binned=binner(np.array(chr1),bin_sz=60)
binned.shape
#plt.imshow(binned,cmap='hot',interpolation='nearest')
#plt.show()

NameError: name 'chr1' is not defined

In [151]:
binned

array([[[ 21578.002757,   2105.369668,   1277.571807, ...,    143.515024,
            225.319072,    187.064804]],

       [[  1794.974587,  23108.665807,   9246.453414, ...,    248.729108,
            354.610537,    405.67323 ]],

       [[  1245.414632,   8711.726707,  26407.491934, ...,    268.960397,
            427.472461,    385.155534]],

       ..., 
       [[   139.347987,    246.83814 ,    266.485907, ...,  22428.875117,
           6183.751837,   3526.095448]],

       [[   228.283919,    358.938158,    432.431418, ...,   5600.20432 ,
          35806.940142,   5329.232168]],

       [[   174.691408,    393.594608,    378.657853, ...,   3479.654586,
           4725.045093,  22567.214729]]])

In [109]:
arr=np.array([[0,1,2],[3,4,5],[6,7,8]])
block_ind=[0,1]
arr[1:3]
#np.sum(np.array(arr[block_ind][block_ind]))

array([6, 7, 8])

In [138]:
a=np.array([1,2,3,4])
b=np.sum(a)
#i.extend(np.array([3,4])
c=[]
c.extend([b])
c

[10]