In [3]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from multiprocessing import Pool
from Bio import SeqIO
from Bio import motifs
from Bio.Seq import Seq
import os 

In [15]:
from os import getpid
from functools import partial

num_partitions = 24 #number of partitions to split dataframe 22
num_cores = 24 #number of cores on your machine 22

def convert_amp(df):
    process_id =str(getpid())
    path = "/mnt/vdb/Bat/pws/tranpi/reps"
    pro_plk= path + "chunk"+process_id+".pkl"
    if not os.path.exists(path):
        os.makedirs(path)
    
    print(process_id+" Start")

    char_code_list = [char for char in "ACDEFGHIKLMNPQRSTVWY"] 

    ready_df = pd.DataFrame(columns = ['ID', 'Sequence','reps', 'length'])
    for seq_record in df.iterrows():
        #print(seq_record)
        m = motifs.create([ seq_record[1]["Sequence"] ] , alphabet="ACDEFGHIKLMNPQRSTVWY")  
        index_pattern = [char for char in m.consensus]
        pwm = m.counts.normalize(pseudocounts=0.5)
        pssm_df = pd.DataFrame.from_dict(pwm)
        pssm_df.index = index_pattern
        result_df = pd.DataFrame(0, columns=['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'],
        index = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'])
        for  char in char_code_list: # column
            for index, row in pssm_df.iterrows():
                result_df.loc[index, char] += row[char]
        _flat = result_df.values.flatten()
 
        ready_df = ready_df.append({'ID': seq_record[1]["ID"], 'Sequence': seq_record[1]["Sequence"],
                                   'reps': _flat, 'length': seq_record[1]["length"]}, ignore_index=True) 
    print(process_id+" Finish")
    output = open(pro_plk, 'wb')
    pickle.dump(ready_df, output)
    output.close()

def parallelize_dataframe(df, func):
    _tmp_df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    pool.map(func, _tmp_df_split)

    pool.close()
    pool.join()

In [9]:
def get_df(fastas):
    with open(fastas) as fasta_file:  # Will close handle cleanly
        identifiers = []
        lengths = []
        seqs = []
        for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
            identifiers.append(seq_record.id)
            # Remove leading and trailing characters from a string
            seqs.append(str(seq_record.seq.strip('*')))
            lengths.append(len(seq_record.seq))
    # dictionary of lists  
    dict = {'ID': identifiers, 'Sequence': seqs, 'length': lengths}  
    df = pd.DataFrame(dict) 
    return df

## CD100

In [4]:
fastas = "/home/ubuntu/data/bk_fasta/SRR12103592.assembly.len10.cd100.fasta"
with open(fastas) as fasta_file:  # Will close handle cleanly
    identifiers = []
    lengths = []
    seqs = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        identifiers.append(seq_record.id)
        # Remove leading and trailing characters from a string
        seqs.append(str(seq_record.seq.strip('*')))
        lengths.append(len(seq_record.seq))
# dictionary of lists  
dict = {'ID': identifiers, 'Sequence': seqs, 'length': lengths}  
df = pd.DataFrame(dict) 
#df["Sequence"] =  seqs
#df.to_pickle(plk)    
df  

Unnamed: 0,ID,Sequence,length
0,9,EDTGFYPSEPMLCSESEEGQVPHSLETLYQSADCSSPSDALIVCIH...,275
1,19,ALGPSLWDRRRSLHLLLQEAFPVAQSLAQVIHHQFQTVSKQGGPLP,47
2,32,VGICGSDVHYWQHGRIGDFIVKKPMVLGHEASGTVVKVGSLVKHLQ...,151
3,54,EKPCNSNQQPLENLVEDTLINYSQFGSPKDHEHNGCKLCQTDRYCE...,199
4,61,AAKFVFRHNDPDHLEKLLKKSNSETPKIVAFETVHSMDGAICPLEE...,152
...,...,...,...
422791,12568817,GAWTEVGLPSQDVSVASCNCCRRPMHFELMSEWERSYFGNMGPQYV...,51
422792,12568833,LLLPLKVLGFLGGQLSVVVLHQPVHIIIIQLQAMDLEIFSSFAP,45
422793,12568849,GGQTPRTAPNPQNPPLPPFWAWSSQNVQFSFKFLILGAEPLDRFLA...,117
422794,12568850,VEESCTIENNSDSTKPKMAAEVDFGDLELFEAFDHPEESLPKPVHT...,174


In [None]:
parallelize_dataframe(df, convert_amp)

In [None]:
# merge


## Tranpi

In [17]:
df_tranpi = get_df("/mnt/vdb/Bat/transpi/SRR12103592.combined.okay.fa.transdecoder.pep")
NON_CODE = "B|Z|J|U|O|X"
# remove ambigous seqeunce and non-canonical amino acids codes
df_tranpi = df_tranpi[~df_tranpi["Sequence"].str.contains(NON_CODE, regex=True)]
df_tranpi

Unnamed: 0,ID,Sequence,length
0,SOAP.k25.C303246.p1,QKVLQAAGPSTTTETETIAKYEIMDGAPVKGESIPIRLFLAGYDPT...,101
1,SOAP.k25.C303708.p1,PITWGRKWNIENGCARTHSQDDYSPGSQAQGESGTASHPRRGHLEM...,102
2,SOAP.k25.C304032.p1,LRNHSPLMSFGASFVSFLNAMMTFEEEKMQLACDDLRTTEKLCESE...,103
3,SOAP.k25.C304284.p1,ESTDQISPYGNSTVTQPSDSGWQYNETHTSLKQNTPRNTSKLYIGL...,104
4,SOAP.k25.C304386.p1,LGPDLSWAWEAKQPWGQETSLRRGEGSGLCKVGGVRVCAPPLLTPK...,104
...,...,...,...
11000,Velvet.k61.NODE_783_length_328_cov_36.378048.p1,MSPSQAVYIVPSKGRLIGGLRDTPSYEHFQEDFSTCSLCTFRDLCA...,106
11001,Velvet.k61.NODE_8470_length_279_cov_15.172043.p1,QDLENAATGDAAVHQRIASLPVEVQEVSLLDKITDKESGERLSKMV...,105
11002,Velvet.k61.NODE_9141_length_765_cov_44.260132.p2,ARAALAMPVKGGTKCIKYLLLGFNFVFWLAGIAVLAIGLWLRFDSQ...,233
11003,Velvet.k61.NODE_9415_length_1250_cov_53.216000.p1,MSCKPQCSLNHLPTPCARQSAPFRIPAEFLYLVLLLVEGAPFSNFS...,367


In [18]:
parallelize_dataframe(df_tranpi, convert_amp)

5629 Start5632 Start5630 Start5631 Start5633 Start

5634 Start
5635 Start


5637 Start5636 Start
5638 Start


5639 Start
5640 Start
5641 Start
5642 Start5643 Start
5644 Start

5649 Start
5650 Start5651 Start5652 Start


5646 Start5647 Start

5648 Start
5645 Start
5652 Finish
5630 Finish
5643 Finish
5642 Finish
5641 Finish
5644 Finish
5649 Finish
5629 Finish
5636 Finish
5647 Finish
5645 Finish
5633 Finish
5639 Finish
5638 Finish
5640 Finish
5637 Finish
5646 Finish
5648 Finish
5632 Finish
5634 Finish
5631 Finish
5650 Finish
5651 Finish
5635 Finish


In [4]:
## Test
infile = "/mnt/vdb/Bat/pws/cd100/reps/_pro.9013.pkl"
data = pd.read_pickle(infile)
data

Unnamed: 0,ID,Sequence,reps,length
0,9,EDTGFYPSEPMLCSESEEGQVPHSLETLYQSADCSSPSDALIVCIH...,"[1.6363636363636358, 0.5454545454545455, 0.545...",275
1,19,ALGPSLWDRRRSLHLLLQEAFPVAQSLAQVIHHQFQTVSKQGGPLP,"[0.5454545454545454, 0.18181818181818182, 0.18...",47
2,32,VGICGSDVHYWQHGRIGDFIVKKPMVLGHEASGTVVKVGSLVKHLQ...,"[1.227272727272727, 0.40909090909090917, 0.409...",151
3,54,EKPCNSNQQPLENLVEDTLINYSQFGSPKDHEHNGCKLCQTDRYCE...,"[0.5454545454545454, 0.18181818181818182, 0.18...",199
4,61,AAKFVFRHNDPDHLEKLLKKSNSETPKIVAFETVHSMDGAICPLEE...,"[1.9090909090909083, 0.6363636363636364, 0.636...",152
...,...,...,...,...
17612,189903,HISGTPHAPIVRRYLSLLDTAVELELPGYQGPRLPRRQKVPIFPQP...,"[1.3636363636363633, 0.45454545454545464, 0.45...",207
17613,189921,LEDRLSPGVLGCSALCRSGVRTKFGINMVTSRERGGHQVASGGVN,"[0.2727272727272727, 0.09090909090909091, 0.09...",45
17614,189935,YASQWFLTLFTAKFPGLNIIFHVALALGKDSAQESVITRDIHRTFP...,"[8.590909090909099, 2.863636363636362, 2.86363...",905
17615,189956,HFYHGEEFNILVHLKFIDGLNLKQSSLVYNISSVYGLKIPYPNPSI,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",47


In [5]:
data[[ "reps"]]

Unnamed: 0,reps
0,"[1.6363636363636358, 0.5454545454545455, 0.545..."
1,"[0.5454545454545454, 0.18181818181818182, 0.18..."
2,"[1.227272727272727, 0.40909090909090917, 0.409..."
3,"[0.5454545454545454, 0.18181818181818182, 0.18..."
4,"[1.9090909090909083, 0.6363636363636364, 0.636..."
...,...
17612,"[1.3636363636363633, 0.45454545454545464, 0.45..."
17613,"[0.2727272727272727, 0.09090909090909091, 0.09..."
17614,"[8.590909090909099, 2.863636363636362, 2.86363..."
17615,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
