In [20]:
# Package imports
import os
import math
import pickle
import pandas as pd
# math.pow always returns a float; the built-in pow returns an int for int arguments
import numpy as np
from collections import Counter
# count = Counter(seq)

In [45]:
# Default parameters read by GetFeature.__init__.
# Names assigned at module level are already global, so no `global` statements are needed.

# Constant added once to the summed residue masses in GetFeature.get_mass.
# NOTE(review): 18.105 may be a transposition of 18.015 (average mass of water) - confirm.
MASSINI = 18.105
# pKa values used for the N-terminal and C-terminal groups in the net-charge calculation.
MDPKANTERM = 9.69
MDPKACTERM = 2.34
# Pi, used for the degree -> radian conversion in the hydrophobic-moment calculation.
MDPI = math.pi
# One-letter residue alphabet; its order defines the column order of per-residue features.
AASEQ = "ACDEFGHIKLMNPQRSTVWY"
# Per-residue mass lookup keyed by one-letter code.
MASSDIC = {
    'A': 71.0788, 'C': 103.1388, 'D': 115.0886, 'E': 129.1155, 'F': 147.1766,
    'G': 57.0519, 'H': 137.1411, 'I': 113.1594, 'K': 128.1741, 'L': 113.1594,
    'M': 131.1926, 'N': 114.104, 'P': 97.1167, 'Q': 128.1307, 'R': 156.1875,
    'S': 87.0782, 'T': 101.1051, 'V': 99.1326, 'W': 186.2132, 'Y': 163.1760,
}
# Position of every residue letter inside AASEQ.
INDEXDIC = {residue: position for position, residue in enumerate(AASEQ)}
# Side-chain pKa values of the residues that contribute to the net charge.
PKADIC = {'R': 12.4, 'K': 10.5, 'H': 6.0, 'C': 8.33, 'D': 3.86, 'E': 4.25, 'Y': 10.0}

In [64]:
class GetFeature:
    """Compute numeric descriptors of an amino-acid sequence.

    The constructor copies its default parameters from the module-level
    constants MASSINI, MDPKANTERM, MDPKACTERM, MDPI, MASSDIC, INDEXDIC and
    PKADIC, so those must be defined before an instance is created.

    Args:
        seq: sequence written with one-letter residue codes.
        seq_with_flanking_region: optional longer sequence; when non-empty,
            ``gen_feature`` appends the flexibility, hydrophobic-moment and
            VL2 features computed on it.
    """

    # Residue alphabet; the per-residue tables below are in this order.
    _RESIDUES = "ACDEFGHIKLMNPQRSTVWY"
    # Per-residue values written to matrix column 20 by proteinFlatFilter
    # (identical to the first 20 flexibility scales used by get_vihinen_flexblity).
    _PROPERTY_20 = [0.984, 0.906, 1.068, 1.094, 0.915, 1.031, 0.950, 0.927, 1.102, 0.935,
                    0.952, 1.048, 1.049, 1.037, 1.008, 1.046, 0.997, 0.931, 0.904, 0.929]
    # Per-residue values written to matrix column 21.
    # NOTE(review): these look like Kyte-Doolittle hydropathy values - confirm.
    _PROPERTY_21 = [1.8, 2.5, -3.5, -3.5, 2.8, -0.4, -3.2, 4.5, -3.9, 3.8,
                    1.9, -3.5, -1.6, -3.5, -4.5, -0.8, -0.7, 4.2, -0.9, -1.3]
    # Per-residue values written to matrix column 22.
    _PROPERTY_22 = [4.97, 7.49, 3.99, 3.97, 5.99, 4.72, 6.11, 5.92, 4.47, 6.00,
                    4.89, 5.02, 3.71, 4.82, 4.75, 4.31, 5.00, 5.60, 5.81, 6.13]

    def __init__(self,seq,seq_with_flanking_region=""):
        self.mass_ini = MASSINI
        self.m_dPKa_N_TERM = MDPKANTERM
        self.m_dPKa_C_TERM = MDPKACTERM
        self.m_dPi = MDPI
        self.mass_dic = MASSDIC
        self.index_dic = INDEXDIC
        self.pka_dic = PKADIC
        self.AAseq = "ACDEFGHIKLMNPQRSTVWY"
        self.seq = seq
        self.seq_with_flanking_region = seq_with_flanking_region
        self.len = len(seq)
        self.counter = Counter(seq)

        print("Used to generate the corresponding feature of the AA sequence")

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    def _count(self, residues):
        """Return how many residues of ``self.seq`` belong to ``residues``."""
        return sum(self.counter[residue] for residue in residues)

    @staticmethod
    def _entropy_bits(values):
        """Return sum(v * log2(1 / v)) over the non-zero entries of ``values``."""
        entropy = 0.0
        for value in values:
            if value == 0:
                continue
            entropy += value * math.log(1. / value) / math.log(2.)
        return entropy

    def _edge_scale(self, length, filt_onesided, filt_size):
        """Filter an all-ones signal to obtain the edge normalisation.

        Returns ``(denominator, scale)`` where ``denominator`` is the
        [1, 0, 0, ...] coefficient list handed to ``filter`` and ``scale`` is
        the symmetrised response of the filter to ``length`` ones.
        """
        bigger = max(filt_size, length)
        denominator = [0.0] * filt_size
        denominator[0] = 1
        ones = [1] * length + [0.0] * (bigger - length)
        scale = [0.0] * bigger
        self.filter(filt_size - 1, denominator, filt_onesided, length, ones, scale)
        # Average the response with its mirror image so both ends are scaled alike.
        for i in range(length // 2):
            mirrored = length - 1 - i
            scale[i] = (scale[i] + scale[mirrored]) / 2.0
            scale[mirrored] = scale[i]
        return denominator, scale

    def _forward_backward(self, data, length, denominator, filt_onesided, filt_size):
        """Average the filter output of ``data[:length]`` run forwards and backwards.

        ``data`` itself is not modified.  The returned list has
        ``max(filt_size, length)`` entries; only the first ``length`` hold the
        forward/backward average.
        """
        bigger = max(filt_size, length)
        order = filt_size - 1
        padded = list(data[:length]) + [0.0] * (bigger - length)
        forward = [0.0] * bigger
        self.filter(order, denominator, filt_onesided, length, padded, forward)
        flipped = padded[:length][::-1] + padded[length:]
        backward = [0.0] * bigger
        self.filter(order, denominator, filt_onesided, length, flipped, backward)
        for i in range(length):
            forward[i] = (forward[i] + backward[length - 1 - i]) / 2.
        return forward

    # ------------------------------------------------------------------
    # Simple composition features
    # ------------------------------------------------------------------
    def get_mass(self):
        """Return the summed residue masses plus ``self.mass_ini``.

        Raises:
            KeyError: if the sequence contains a residue missing from ``mass_dic``.
        """
        return sum([self.mass_dic[residue] for residue in self.seq]) + self.mass_ini

    def get_missing_cleavage_number(self):
        """Return the number of "KP" plus "RP" occurrences in the sequence."""
        return self.seq.count("KP") + self.seq.count("RP")

    def get_net_charge(self,pH):
        """Return the net charge of the sequence at the given pH.

        Each ionisable group contributes 10**pKa / (10**pH + 10**pKa) when it
        is basic (N-terminus, R, K, H) and -10**pH / (10**pH + 10**pKa) when it
        is acidic (C-terminus, C, D, E, Y).  Side-chain pKa values come from
        ``self.pka_dic``; they are raised to a power of ten exactly like the
        terminal pKa values (the raw pKa was previously used by mistake).
        """
        ten2pH = math.pow(10, pH)

        ten2PKa = math.pow(10, self.m_dPKa_N_TERM)
        netZ = ten2PKa / (ten2pH + ten2PKa)
        ten2PKa = math.pow(10, self.m_dPKa_C_TERM)
        netZ -= ten2pH / (ten2pH + ten2PKa)

        for residue in self.seq:
            if residue in ('R', 'K', 'H'):
                ten2PKa = math.pow(10, self.pka_dic[residue])
                netZ += ten2PKa / (ten2pH + ten2PKa)
            elif residue in ('C', 'D', 'E', 'Y'):
                ten2PKa = math.pow(10, self.pka_dic[residue])
                netZ -= ten2pH / (ten2pH + ten2PKa)
        return netZ

    def get_sum_neutral_residues(self):
        """Return the sequence length minus the number of E, D, R, K and H residues."""
        return self.len - self._count(['E', 'D', 'R', 'K', 'H'])

    def get_sum_basic_residues(self):
        """Return the number of R, K and H residues."""
        return self._count(['R', 'K', 'H'])

    def get_proportion_large_sized_residues(self):
        """Return the fraction of residues in {M, I, L, K, R, F, Y, W}."""
        return float(self._count(['M', 'I', 'L', 'K', 'R', 'F', 'Y', 'W'])) / float(self.len)

    def get_proportion_small_sized_residues(self):
        """Return the fraction of residues in {C, P, T, D, N, A, G, S}."""
        return float(self._count(['C', 'P', 'T', 'D', 'N', 'A', 'G', 'S'])) / float(self.len)

    def get_masslen_ratio(self):
        """Return ``get_mass()`` divided by the sequence length."""
        return self.get_mass() / float(self.len)

    # ------------------------------------------------------------------
    # Windowed per-residue scales
    # ------------------------------------------------------------------
    def get_vihinen_flexblity(self, seq=None):
        """Return the mean windowed flexibility score of a sequence.

        Every position is scored with a 9-wide triangular window centred on
        it.  Near the ends the window is clipped to the sequence and the
        remaining weights are rescaled so that they still add up to 5.25.

        Args:
            seq: sequence to score; defaults to ``self.seq``.

        Raises:
            ValueError: if a residue is not in "ACDEFGHIKLMNPQRSTVWYXBZ".
            ZeroDivisionError: if the sequence is empty.
        """
        if seq is None:
            seq = self.seq
        alphabet = "ACDEFGHIKLMNPQRSTVWYXBZ"
        flex_scales = [0.984, 0.906, 1.068, 1.094, 0.915, 1.031, 0.950, 0.927, 1.102, 0.935, 0.952, 1.048, 1.049, 1.037, 1.008, 1.046, 0.997, 0.931, 0.904, 0.929, 0.9906, 1.068, 1.094]
        window = [0.25, 0.4375, 0.625, 0.8125, 1, 0.8125, 0.625, 0.4375, 0.25]
        window_sum = 5.25
        half = 4
        length = len(seq)
        scales = [flex_scales[alphabet.index(residue)] for residue in seq]

        result = 0.0
        for centre in range(length):
            first = max(0, centre - half)
            last = min(centre + half, length - 1)
            # Index of the window weight that lines up with position `first`.
            # Deriving it from the clipped range keeps weights and residues
            # aligned even when both ends are clipped (sequences shorter than 9).
            begin = half - (centre - first)
            weights = window[begin:begin + (last - first + 1)]
            proportion = window_sum / sum(weights)
            temp = 0.0
            for scale, weight in zip(scales[first:last + 1], weights):
                temp += scale * weight * proportion
            result += temp
        return result / length

    def get_hydrophobic_moment(self, window, angle, seq=None):
        """Return the mean windowed hydrophobic moment of a sequence.

        Args:
            window: window width in residues; floor(window / 2) residues are
                taken on each side of the centre.
            angle: rotation per residue in degrees.
            seq: sequence to score; defaults to ``self.seq``.

        Raises:
            ValueError: if a residue is not in "ACDEFGHIKLMNPQRSTVWY".
            ZeroDivisionError: if the sequence is empty.
        """
        if seq is None:
            seq = self.seq
        alphabet = "ACDEFGHIKLMNPQRSTVWY"
        hydrophobic_indices = [0.25, 0.04, -0.72, -0.62, 0.61, 0.16, -0.40, 0.73, -1.10, 0.53, 0.26, -0.64, -0.07, -0.69, -1.76, -0.26, -0.18, 0.54, 0.37, 0.02]
        half_window = int(math.floor(window / 2.0))
        radians = angle * self.m_dPi / 180
        length = len(seq)
        values = [hydrophobic_indices[alphabet.index(residue)] for residue in seq]

        result = 0
        for centre in range(length):
            first = max(0, centre - half_window)
            last = min(centre + half_window, length - 1)
            hm_sin = hm_cos = 0
            for position in range(first, last + 1):
                hm_sin += math.sin(radians * position) * values[position]
                hm_cos += math.cos(radians * position) * values[position]
            covered = last - first + 1
            result += math.sqrt(hm_sin * hm_sin + hm_cos * hm_cos) / min(covered, window)
        return result / length

    # ------------------------------------------------------------------
    # Linear filtering
    # ------------------------------------------------------------------
    def filter(self,ord,a,b,nup,x,y):
        """Apply the recursive linear filter described by ``b`` / ``a`` to ``x``.

        Writes ``y[i] = sum_j b[j] * x[i-j] - sum_j a[j+1] * y[i-j-1]`` for
        ``i`` in ``range(max(ord + 1, nup))``, using at most ``ord`` past
        samples.  ``x`` and ``y`` must therefore hold at least
        ``max(ord + 1, nup)`` entries.  ``y`` is modified in place; nothing is
        returned.
        """
        for i in range(max(ord + 1, nup)):
            taps = min(i, ord)
            acc = 0.0
            for j in range(taps + 1):
                acc += b[j] * x[i - j]
            for j in range(taps):
                acc -= a[j + 1] * y[i - j - 1]
            y[i] = acc

    def filter_twosided(self,matrix_protein,matrixRow,matrixColumn,filt_onesided,filtOnesideSize):
        """Smooth the first ``matrixColumn`` columns of ``matrix_protein`` in place.

        Each column is filtered forwards and backwards over its first
        ``matrixRow`` rows, the two passes are averaged, and the result is
        divided by the filter's response to an all-ones column.
        """
        denominator, scale = self._edge_scale(matrixRow, filt_onesided, filtOnesideSize)
        for column in range(matrixColumn):
            series = [matrix_protein[row][column] for row in range(matrixRow)]
            smoothed = self._forward_backward(series, matrixRow, denominator, filt_onesided, filtOnesideSize)
            for row in range(matrixRow):
                matrix_protein[row][column] = smoothed[row] / scale[row]

    def filter_error(self,pfilt,pfiltRow,pfiltColumn,data,dataLength,filt_onesided,filtOnesidedSize):
        """Smooth ``data[:dataLength]`` and store it in column ``pfiltColumn`` of ``pfilt``.

        Uses the same forward/backward filtering and edge normalisation as
        ``filter_twosided``.  ``data`` is left untouched (it used to be
        reversed in place as a side effect).
        """
        denominator, scale = self._edge_scale(dataLength, filt_onesided, filtOnesidedSize)
        smoothed = self._forward_backward(data, dataLength, denominator, filt_onesided, filtOnesidedSize)
        for row in range(pfiltRow):
            pfilt[row][pfiltColumn] = smoothed[row] / scale[row]

    def filter_prot(self,pred,pfilt,pfiltRow,pfiltColumn,filt_onesided,filtOnesidedSize):
        """Smooth the first ``pfiltRow`` predictions and store them in ``pfilt``."""
        bigger = max(filtOnesidedSize, pfiltRow)
        padded = [pred[i] for i in range(pfiltRow)] + [0.0] * (bigger - pfiltRow)
        self.filter_error(pfilt, pfiltRow, pfiltColumn, padded, pfiltRow, filt_onesided, filtOnesidedSize)

    # ------------------------------------------------------------------
    # VL2 linear models
    # ------------------------------------------------------------------
    def predict_linear(self, pfilt, pfiltRow, pfiltColumn, seq=None):
        """Fill ``pfilt`` with the smoothed output of four linear models.

        Args:
            pfilt: matrix with at least ``pfiltRow`` rows and 4 columns; column
                ``m`` receives the smoothed prediction of model ``m``.
            pfiltRow: number of rows to write; expected to equal ``len(seq)``.
            pfiltColumn: accepted for interface compatibility; not used.
            seq: sequence to predict on; defaults to ``self.seq``.
        """
        if seq is None:
            seq = self.seq
        length = len(seq)
        lst = [-25.360122772689530, -0.696903041342054, 1.759821058633487, 0.088936843652238, 1.816910774835402, 2.580379123489578, 1.199846997885154, -0.095699661003024, 1.098553523477732, 2.636616803052733, -1.665418976414453, 1.352861445754487, -0.580508778536282, 2.745993680647693, -1.085823401526897, 0.740717492440777, 0.533128494727820, -4.586324163702638, 0.739585357799028, 24.425480517325200, 0.230030178184243,-14.300860682065565, 1.434701221055386, -1.028707033124128, -3.013912732313829, -2.376741271298584, 1.273834113496491, -0.964875311764001, -1.094035314236643, -1.044061215564152, 2.773847613995727, -0.553544312680591, -0.152639027401581, 0.678673838499450, 1.016487071159430, -0.165345043318697, 0.014607300630189, 0.925090573506023, 2.515930202256339, 0.974841031279123, 15.431464279382766, -0.221008947365943,-30.337058421955790, 0.854440689066855, 5.339030779290762, 1.010144293991523, 3.381492895434795, 1.206914784027385, 2.371572902795911, -0.850297513246717, 3.987712522463211, 4.790343477250348, 1.081410650555920, -0.169321385683294, 1.554378338506579, 3.520055521634506, 2.432348578718518, 1.738359018604442, 3.223434358612503, 1.679678304063250, -1.456869987959105, 30.284784500965750, -0.354270870941984,-25.970718651350940, 0.683574136190159, 3.082311205666289, -0.370673832896873, 1.626657146800731, 2.486017822353065, 1.360178852772070, -0.732837806618001, 1.769823119646163, 4.086509434704226, -0.328578025097438, 0.252295819229421, 0.431583241825593, 2.009388278166486, 0.456480475170492, 0.666658611714632, 1.805648436817136, -0.545838679329645, -0.127664204931993, 26.489721415658103, -0.257148340210382]
        # One row per model: intercept followed by 20 attribute weights.
        modelsBeta = np.array(lst).reshape(4,21)
        W_IN = 21
        W_OUT = 21

        filt_onesided = [1.0] * W_IN
        filt_onesided[0] = 0.5

        data = [[0.0] * 26 for _ in range(length)]
        self.proteinFlatFilter(data, length, 26, seq, filt_onesided, W_IN)
        # Column 21 is overwritten with the entropy of the 20 smoothed residue columns.
        for row in data:
            row[21] = self._entropy_bits(row[:20])

        filt_onesided = [1] * W_OUT
        filt_onesided[0] = 0.5

        attributes = [0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
        predictions = [0.0] * length
        for model in range(4):
            beta = modelsBeta[model]
            for position in range(length):
                total = beta[0]
                for k, column in enumerate(attributes):
                    total += data[position][column] * beta[k + 1]
                predictions[position] = total
            self.filter_prot(predictions, pfilt, pfiltRow, model, filt_onesided, W_OUT)

    def getVL2Disorder(self, seq=None):
        """Return the per-model mean of the four ``predict_linear`` outputs.

        Args:
            seq: sequence to predict on; defaults to ``self.seq``.

        Returns:
            A list of four numbers, one per model.
        """
        if seq is None:
            seq = self.seq
        length = len(seq)
        pfilt = [[0.0 for _ in range(4)] for _ in range(length)]
        self.predict_linear(pfilt, length, 4, seq)
        means = []
        for model in range(4):
            total = 0
            for position in range(length):
                total += pfilt[position][model]
            means.append(total / length)
        return means

    # ------------------------------------------------------------------
    # B-factor prediction
    # ------------------------------------------------------------------
    def get_Bfactor_prediction(self,Win,Wout):
        """Return the mean smoothed logistic prediction over ``self.seq``.

        Args:
            Win: filter length used by ``make_attribute``.
            Wout: width of the moving average applied to the raw predictions.

        The 20 attribute columns (0-17, 20 and 23) are centred with the
        matching ``meanv`` entry, walking through ``meanv`` column by column
        (previously every column was centred with ``meanv[0]``).
        NOTE(review): the columns are also divided by ``stdv``, which the
        earlier code declared but never used; z-scoring is assumed to be the
        intended normalisation - confirm against the reference model.
        """
        meanv=[0.082968313140730, 0.011736564150357, 0.067809878844363, 0.064588381484934, 0.033886300093196, 0.088146940043496, 0.022098477788133, 0.045552966759862, 0.064153463808637, 0.074777881329608, 0.020540540540540, 0.051916744330536, 0.053502640571605, 0.039798073936004, 0.044307238272754, 0.067135756446103, 0.061397949673812, 0.061351351351352, 1.006507096924509, 2.053169508147902]
        stdv=[0.130331719672796, 0.049309161430501, 0.112881088280996, 0.110767901251959, 0.081279119138629, 0.128216021352435, 0.066946723059315, 0.094605357612097, 0.112588332693937, 0.119775107696711, 0.065689847423736, 0.101495390561351, 0.101360742817996, 0.089508175324723, 0.093472444265776, 0.116208004856971, 0.110493688968059, 0.109700608158085, 0.026172472801445, 0.309130677346108]
        resultsBeta=[-0.127150878905927, 0.051137322783183, -0.284651567902609, -0.430461262094386, -0.021608943274078, -0.154792867992833, 0.049308110391981, -0.104625789276418, -0.467049531281995, 0.009179581249833, -0.005886075350205, -0.219643270163331, -0.176474696960102, -0.220613294186365, -0.150917721214844, -0.253207400169961, -0.121781926993112, -0.056917028712712, 1.137733284555524, -0.185855943167836, 0.0003732630317847297]

        length = self.len
        xtemp = [[0.0] * 26 for _ in range(length)]
        self.make_attribute(xtemp, length, 26, self.seq, Win)

        used_columns = [column for column in range(24) if column not in (18, 19, 21, 22)]
        for row in xtemp:
            for k, column in enumerate(used_columns):
                row[column] = (row[column] - meanv[k]) / stdv[k]

        raw_prediction = [0.0] * length
        for position, row in enumerate(xtemp):
            total = 0
            for k, column in enumerate(used_columns):
                total += row[column] * resultsBeta[k]
            total += resultsBeta[len(used_columns)]  # intercept
            raw_prediction[position] = 1. / (1 + math.exp(-total))

        prediction = [0.0] * length
        self.moving_average(raw_prediction, Wout, prediction, length)

        result = 0
        for value in prediction:
            result += value
        return result / length

    def moving_average(self,x,window,y,length):
        """Write the centred moving average of ``x[:length]`` into ``y``.

        ``y[i]`` is the mean of ``x`` over positions ``i - floor(window / 2)``
        to ``i + floor(window / 2)``, clipped to ``[0, length - 1]``.  The same
        rule is now used for every window size; the earlier running-sum path
        for ``window <= length`` left one element of ``y`` unset and
        accumulated a shifted sum.
        """
        half = int(math.floor(window / 2.))
        for i in range(length):
            first = max(i - half, 0)
            last = min(i + half, length - 1)
            total = 0
            for j in range(first, last + 1):
                total += x[j]
            y[i] = total / (last - first + 1)
        return

    def make_attribute(self,data,dataRow,dataColumn,seq,W_IN):
        """Fill ``data`` with smoothed residue attributes and an entropy column.

        Calls ``proteinFlatFilter`` with a one-sided filter of length ``W_IN``
        and then stores the entropy of the first 20 columns of each row in
        column 23.
        """
        filt_onesided = [1.0] * W_IN
        filt_onesided[0] = 0.5

        self.proteinFlatFilter(data, dataRow, dataColumn, seq, filt_onesided, W_IN)

        for i in range(dataRow):
            data[i][23] = self._entropy_bits(data[i][:20])

    # ------------------------------------------------------------------
    # Residue-class counts
    # ------------------------------------------------------------------
    def get_number_nonpolarhydrophobic_residues(self):
        """Return the number of F, A, L, M, I, W, P and V residues."""
        return self._count(['F', 'A', 'L', 'M', 'I', 'W', 'P', 'V'])

    def get_number_polarhydrophobic_residues(self):
        """Return the number of D, E, R, K, H, C, G, Q, N, S, Y and T residues."""
        return self._count(['D', 'E', 'R', 'K', 'H', 'C', 'G', 'Q', 'N', 'S', 'Y', 'T'])

    def get_number_unchargedpolarhydrophilic_residues(self):
        """Return the number of C, G, Q, N, S, Y and T residues."""
        return self._count(['C', 'G', 'Q', 'N', 'S', 'Y', 'T'])

    def get_number_chargedpolarhydrophilic_residues(self):
        """Return the number of D, E, R, K and H residues."""
        return self._count(['D', 'E', 'R', 'K', 'H'])

    def get_aafreq(self):
        """Return the relative frequency of each residue, in ``self.AAseq`` order."""
        return [self.counter[residue] / self.len for residue in self.AAseq]

    def get_seq_complexity(self,frequency):
        """Return 1000 times the entropy (in bits) of ``frequency``; zeros are skipped."""
        entropy = 0
        for value in frequency:
            if value == 0:
                continue
            entropy += value * math.log(value) / math.log(2.0)
        return -entropy * 1000

    def proteinFlatFilter(self,matrix_protein,matrixRow,matrixColumn,seq,filt_onesided,filtOnesidedSize):
        """Encode ``seq`` into ``matrix_protein`` and smooth columns 0-22 in place.

        For row ``k`` the column of residue ``seq[k]`` (0-19) is set to 1 and
        columns 20-22 receive that residue's three tabulated property values.
        Columns 23, 24 and 25 are set to 0, 1 and ``k + 1``.  Residues outside
        "ACDEFGHIKLMNPQRSTVWY" leave their row's first 23 columns untouched
        (they used to be encoded as 'Y' by accident).

        Args:
            matrix_protein: matrix with at least ``matrixRow`` rows and 26 columns.
            matrixRow: number of rows to process; expected to equal ``len(seq)``.
            matrixColumn: accepted for interface compatibility; not used.
            seq: sequence to encode.
            filt_onesided: one-sided filter coefficients.
            filtOnesidedSize: number of filter coefficients.
        """
        for row, residue in enumerate(seq):
            column = self._RESIDUES.find(residue)
            if column < 0:
                continue
            matrix_protein[row][column] = 1
            matrix_protein[row][20] = self._PROPERTY_20[column]
            matrix_protein[row][21] = self._PROPERTY_21[column]
            matrix_protein[row][22] = self._PROPERTY_22[column]

        for row in range(matrixRow):
            matrix_protein[row][23] = 0
            matrix_protein[row][24] = 1
            matrix_protein[row][25] = (row + 1)

        self.filter_twosided(matrix_protein, matrixRow, 23, filt_onesided, filtOnesidedSize)

    # ------------------------------------------------------------------
    # Feature vector
    # ------------------------------------------------------------------
    def gen_feature(self, aaindex_path="/home/guruichu/dprotein/script/aaindex.pkl", aaindex_dic=None):
        """Return the full feature vector of the sequence as a list.

        Layout: length, KP/RP count, mass, 20 residue frequencies, 544
        per-residue index averages, net charge at pH 7, four residue-class
        counts, basic and neutral counts, large and small proportions,
        mass/length ratio and sequence complexity.  When
        ``seq_with_flanking_region`` is non-empty, the flexibility, two
        hydrophobic moments (window 11, angles 160 and 120) and four VL2
        values of that longer sequence are appended.

        Args:
            aaindex_path: pickle file holding the per-residue index table;
                only read when ``aaindex_dic`` is None.
            aaindex_dic: optional pre-loaded table such that
                ``aaindex_dic[residue][i]`` is the i-th index value.

        NaN index values are skipped when averaging (the former
        ``!= np.nan`` test was always true); an index whose values are all
        NaN yields NaN.
        """
        feature = [self.len, self.get_missing_cleavage_number(), self.get_mass()]
        self.frequency = self.get_aafreq()
        feature.extend(self.frequency)

        if aaindex_dic is None:
            # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
            with open(aaindex_path, 'rb') as handle:
                aaindex_dic = pickle.load(handle)

        for i in range(544):
            count = 0
            total = 0.0
            for residue in self.seq:
                value = aaindex_dic[residue][i]
                if value != value:  # NaN is the only value not equal to itself
                    continue
                total += value
                count += 1
            feature.append(total / count if count else float("nan"))

        feature.append(self.get_net_charge(7))
        feature.append(self.get_number_nonpolarhydrophobic_residues())
        feature.append(self.get_number_polarhydrophobic_residues())
        feature.append(self.get_number_unchargedpolarhydrophilic_residues())
        feature.append(self.get_number_chargedpolarhydrophilic_residues())

        feature.append(self.get_sum_basic_residues())
        feature.append(self.get_sum_neutral_residues())
        feature.append(self.get_proportion_large_sized_residues())
        feature.append(self.get_proportion_small_sized_residues())

        feature.append(self.get_masslen_ratio())
        feature.append(self.get_seq_complexity(self.frequency))

        flanked = self.seq_with_flanking_region
        if flanked != "":
            feature.append(self.get_vihinen_flexblity(flanked))
            feature.append(self.get_hydrophobic_moment(11, 160, seq=flanked))
            feature.append(self.get_hydrophobic_moment(11, 120, seq=flanked))
            feature.extend(self.getVL2Disorder(flanked))
        return feature

In [65]:
# Build the extractor for a sample sequence and display its feature vector.
x = GetFeature(seq="ACATCATTTCATC")
x.gen_feature()

Used to generate the corresponding feature of the AA sequence


[13,
 0,
 1220.5009,
 0.3076923076923077,
 0.3076923076923077,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.38461538461538464,
 0.0,
 0.0,
 0.0,
 4.4423076923076925,
 0.5361538461538462,
 1.2676923076923077,
 1.2084615384615387,
 0.4953846153846154,
 0.7561538461538462,
 0.6607692307692308,
 0.3870769230769231,
 64.58461538461539,
 42.0,
 20.46153846153846,
 -0.2769230769230768,
 -2.7769230769230777,
 -0.4,
 0.6565384615384615,
 8.263384615384616,
 4.451538461538461,
 7.023076923076924,
 0.31569230769230777,
 0.40184615384615385,
 0.5430769230769232,
 0.09507692307692309,
 1.2806153846153845,
 1.0,
 0.053076923076923084,
 1.0769230769230769,
 0.0,
 0.0,
 0.6923076923076923,
 0.0,
 0.3076923076923077,
 111.33076923076922,
 130.76923076923077,
 31.615384615384617,
 0.3438461538461539,
 0.16000000000000003,
 0.9384615384615382,
 0.9715384615384615,
 1.079230769230769,
 0.8999999999999999,
 1.0153846153846156,
 0.9992307692307691,
 0.83461538461538

[13,
 None,
 1220.5009,
 0.3076923076923077,
 0.3076923076923077,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.38461538461538464,
 0.0,
 0.0,
 0.0,
 4.451666666666666,
 1.1024999999999998,
 0.9558333333333332,
 0.8958333333333334,
 0.3766666666666667,
 0.6491666666666666,
 0.7716666666666666,
 0.432,
 76.78333333333335,
 7.166666666666667,
 -20.666666666666668,
 -0.6583333333333333,
 -1.583333333333334,
 -0.5025,
 0.6967500000000001,
 6.23,
 4.468916666666666,
 5.241666666666666,
 0.29450000000000004,
 0.3729166666666666,
 0.5433333333333333,
 0.14216666666666666,
 0.42533333333333334,
 1.1616666666666666,
 0.030583333333333337,
 0.6666666666666666,
 0.25,
 0.25,
 1.4166666666666667,
 0.0,
 0.5,
 132.5583333333333,
 152.91666666666666,
 49.0,
 0.24583333333333332,
 0.105,
 1.1149999999999998,
 0.8983333333333334,
 0.8733333333333334,
 1.1175,
 1.1191666666666666,
 0.9491666666666667,
 0.8416666666666667,
 1.14,
 0.8650000000000001,
 0.763333333

In [28]:
# Count the E, D, R, K and H residues of `seq` (which must already exist in the kernel).
sum(Counter(seq)[residue] for residue in "EDRKH")

3

In [40]:
# Build a 5 x 4 matrix of zeros; the comprehension already gives every row its
# own list, so no second pass over the rows is needed.
pfilt = [[0.0 for _ in range(4)] for _ in range(5)]
# matrix =  np.zeros((len(seq), 4))

In [41]:
# Display the matrix built in the previous cell.
pfilt

[[0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0]]