In [43]:
import sys
import os.path
import itertools
import pandas as pd
import numpy as np



In [79]:
def is_int(s):
    try:
        val = int(s)
        if val < 2:
            raise ValueError('nmer should be greater than 1')
        return True
    except ValueError:
        return False 
    
class ErvSummary:
    """summary for nmer motifs
    requires pandas, numpy"""
    def _countERV(self, filename, center):
        """count nERV from target file,
        center: center base position in the summary data (starting from 0)"""
        print('counting ERV...')
        moves_start = list(range(0,-nmer,-1))
        moves_skip = list(range(0,nmer))
        
        with open(filename) as f:
            for line in itertools.islice(f, 1, None):
                s , m = str.split(line)[4:6]
                for i in range(len(self.data)): 
                    temp = s
                    s = s[center+moves_start[i]:center+moves_start[i]+nmer] # skip center@@
                    s = s[:moves_skip[i]]+s[moves_skip[i]+1:]
                    # print('i={},s={},m={}'.format(i,s,m))
                    self.data[i]['nERVs'][df.mtypes.index(m)*6+df.subtypes.index(s)+1]+=1
                    s = temp
    
 
        
        
    def __init__(self, nmer, ervfile, reffile, center):
        if is_int(nmer) == False:
            raise ValueError('nmer should be integer greater than 1')
        
        self.patterns = set([''.join(i) for i in itertools.permutations('X'*(nmer-1)+'*')])
        self.mtypes = ['AT_CG', 'AT_GC', 'AT_TA', 'GC_AT', 'GC_CG', 'GC_TA']
        self.subtypes = [''.join(i) for i in itertools.product('ACGT', repeat = (nmer-1))]
        self.data = []
        for i in range(0, nmer):
            self.data.append(pd.DataFrame(np.zeros((4 ** (nmer-1) * 6, 4),dtype=np.int32),
                                        columns=['mtype', 'subtype', 'nERVs', 'nMotifs']))                                          
            self.data[i]['mtype'] = list(itertools.chain.from_iterable(itertools.repeat(x,4 ** (nmer-1)) for x in self.mtypes))
            self.data[i]['subtype'] = self.subtypes * 6
        
        if center is None:
            center = 3
        
        if ervfile is not None:
            if os.path.isfile(ervfile)==False:
                raise ValueError('{} is not a file'.format(ervfile))
                
            self._countERV(ervfile, center)
            print('counting ERV completed')
        else:
            print('erv not counted as ervfile is None')
        
        if reffile is not None:
            #####################################
            #### count rel rate from relfile ####
            #####################################
            pass
        else:
            print('reference motifs not counted as reffile is None')
            
        if ((ervfile is not None) & (reffile is not None)):
            print('counting relrate and wt ...')
            total_motifs = np.sum(self.data[0].nMotifs)
            for i in range(0, nmer):
                self.data[i]['ERV_rel_rate'] = self.data[i].nERVs / self.data[i].nMotifs
                self.data[i]['wt'] = self.data[0].nERVs / total_motifs
            print('counting relrate and wt completed')
     
    def writeERV(self, dir):
        print('writing data to {}...'.format(dir))
        if os.path.isdir(dir)==False:
            raise ValueError('{} is not a directory'.format(dir))
        if dir.endswith('/')==False:
            dir = dir+'/'
        for i in range(0, len(self.data)):
            self.data[i].to_csv(dir+'{}mer_{}.txt'.format(len(self.data),i), sep=' ', index=False, header=True)
        print('writing data to {} complete'.format(dir))
    
        

In [85]:
test = ErvSummary(3, 'erv_sample_chr1.sites.txt')
test.writeERV('data')


counting ERV...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


counting ERV completed
reference motifs not counted as reffile is None
writing data to data...
writing data to data/ complete


In [71]:
if t == '':
    t = None
t is None

True

In [26]:

df = test
center = 3 # center base position in the summary data (starting from 0)
moves_start = [0, -1, -2]
moves_skip = [0, 1, 2]
nmer = 3
 
with open('erv_sample_chr1.sites.txt') as f:
    for line in itertools.islice(f, 1, None):
        s , m = str.split(line)[4:6]
        for i in range(len(test.data)): ##editing!!
            temp = s
            s = s[center+moves_start[i]:center+moves_start[i]+nmer] # skip center@@
            s = s[:moves_skip[i]]+s[moves_skip[i]+1:]
            # print('i={},s={},m={}'.format(i,s,m))
            df.data[i]['nERVs'][df.mtypes.index(m)*6+df.subtypes.index(s)+1]+=1
            s = temp
df.data[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,mtype,subtype,nERVs,nMotifs
0,AT_CG,AA,0,0
1,AT_CG,AC,0,0
2,AT_CG,AG,6,0
3,AT_CG,AT,4,0
4,AT_CG,CA,8,0
5,AT_CG,CC,0,0
6,AT_CG,CG,0,0
7,AT_CG,CT,0,0
8,AT_CG,GA,4,0
9,AT_CG,GC,0,0


In [19]:
 for i in range(len(test.data)):
        print('{}: {}'.format(i,df.data[i]['nERVs'][0]))

0: 0
1: 0
2: 0
