In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from multiprocessing import Pool
from Bio import SeqIO
from Bio import motifs
from Bio.Seq import Seq
import os 

In [2]:
from os import getpid
from functools import partial

# Number of pieces the input dataframe is split into before being handed to
# the worker pool.
# NOTE(review): the original inline notes ended with a bare "22", which looks
# like an earlier setting for both values - confirm before relying on it.
num_partitions = 24

# Number of worker processes in the pool (set to the core count of the machine).
num_cores = 24

def convert_amp(df, path="/mnt/vdb/DECockroach/pws/cd100/"):
    """Encode every sequence of ``df`` as a flattened 20x20 matrix and pickle the result.

    For each row a single-sequence Biopython motif is created over the 20
    letters ``ACDEFGHIKLMNPQRSTVWY`` and its counts are normalised with a
    pseudocount of 0.5.  The per-position rows of that normalised matrix are
    then summed according to the residue found at each position, which gives
    a 20x20 table (row: residue at the position, column: alphabet letter).
    The table is flattened row by row into a 400-value vector stored in the
    ``reps`` column.

    The resulting frame (columns ``ID``, ``Sequence``, ``reps``, ``length``)
    is pickled to ``<path>/chunk<pid>.plk`` where ``<pid>`` is the id of the
    process running this call.  Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``ID``, ``Sequence`` and ``length``.
    path : str, optional
        Directory the pickle is written to; created if missing.  Defaults to
        the location that used to be hard-coded, so existing single-argument
        calls behave as before.  Use ``functools.partial`` to bind another
        directory when mapping this function over a pool.
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    row_of = {letter: position for position, letter in enumerate(alphabet)}

    process_id = str(getpid())
    # NOTE: the file name depends on the process id only, so a process that
    # handles more than one chunk overwrites the file it wrote earlier.  The
    # name is kept as-is so that whatever reads these files is unaffected.
    pro_plk = os.path.join(path, "chunk" + process_id + ".plk")
    # exist_ok avoids the check-then-create race between workers that start
    # at the same time.
    os.makedirs(path, exist_ok=True)

    print(process_id+" Start")

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []
    for seq_id, sequence, length in zip(df["ID"], df["Sequence"], df["length"]):
        m = motifs.create([sequence], alphabet=alphabet)
        pwm = m.counts.normalize(pseudocounts=0.5)

        # (positions x 20) matrix, columns in alphabet order.
        per_position = np.array([pwm[letter] for letter in alphabet], dtype=float).T
        # Row of the 20x20 table each position contributes to.
        residue_rows = np.array([row_of[letter] for letter in m.consensus], dtype=int)

        # Unbuffered add: positions sharing a residue are accumulated one
        # after another, in sequence order, into that residue's row.
        summed = np.zeros((len(alphabet), len(alphabet)))
        np.add.at(summed, residue_rows, per_position)

        records.append({'ID': seq_id, 'Sequence': sequence,
                        'reps': summed.flatten(), 'length': length})

    ready_df = pd.DataFrame(records, columns=['ID', 'Sequence', 'reps', 'length'])
    print(process_id+" Finish")

    # Context manager so the handle is closed even if pickling fails.
    with open(pro_plk, 'wb') as output:
        pickle.dump(ready_df, output)

def parallelize_dataframe(df, func):
    """Split ``df`` into ``num_partitions`` row chunks and run ``func`` on each in a pool.

    ``func`` is called once per chunk in a worker process and must be
    picklable; its return values are discarded.  The call blocks until every
    chunk has been processed and returns nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split by row position.
    func : callable
        Function taking a single DataFrame chunk.
    """
    # Split by row position.  Chunk sizes are the ones np.array_split
    # produces, but the DataFrame itself is never handed to NumPy, which
    # triggers deprecation warnings on recent pandas versions.
    row_groups = np.array_split(np.arange(len(df)), num_partitions)
    chunks = [df.iloc[rows] for rows in row_groups]

    # maxtasksperchild=1 together with chunksize=1 makes every chunk run in a
    # freshly started worker.  This matters for workers that name their
    # output after their process id: a worker reused for a second chunk
    # would otherwise overwrite the file it wrote for the first one.
    with Pool(num_cores, maxtasksperchild=1) as pool:
        pool.map(func, chunks, chunksize=1)
        # Normal path: let the workers exit cleanly.  If map raises, the
        # context manager terminates the pool instead of leaking it.
        pool.close()
        pool.join()

In [3]:
def get_df(fastas):
    """Read a FASTA file into a DataFrame with ``ID``, ``Sequence`` and ``length`` columns.

    Parameters
    ----------
    fastas : str
        Path of the FASTA file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per record: the record id, the sequence as a string with
        leading/trailing ``*`` characters removed, and the length of that
        stored sequence.
    """
    identifiers = []
    seqs = []
    with open(fastas) as fasta_file:  # handle is closed when the block exits
        for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # generator of records
            identifiers.append(seq_record.id)
            # Drop '*' at either end of the sequence (presumably stop-codon
            # markers left by the translation step - confirm).
            seqs.append(str(seq_record.seq.strip('*')))

    # Length is taken from the stored (stripped) sequence so the two columns
    # always agree; it used to be measured before the '*' was removed.
    lengths = [len(sequence) for sequence in seqs]

    return pd.DataFrame({'ID': identifiers, 'Sequence': seqs, 'length': lengths})

In [None]:
# CD100

In [6]:

# Load the CD100 FASTA into a DataFrame (ID / Sequence / length columns).
df = get_df(
    fastas="/home/ubuntu/data/bk_fasta/SRR1552488.assembly.len15.cd100.fasta"
)

In [7]:
# Encode the CD100 sequences in parallel; every worker pickles its own chunk
# (see the Start/Finish lines printed per process id below).
parallelize_dataframe(df=df, func=convert_amp)

6927 Start
6928 Start
6929 Start
6930 Start
6931 Start
6932 Start
6933 Start
6934 Start
6935 Start
6937 Start
6938 Start
6939 Start
6940 Start
6941 Start
6942 Start
6943 Start
6944 Start
6945 Start
6946 Start
6947 Start
6948 Start
6949 Start6950 Start
6951 Start

6944 Finish
6945 Finish
6943 Finish
6939 Finish
6941 Finish
6935 Finish
6938 Finish
6937 Finish
6940 Finish
6946 Finish
6942 Finish
6934 Finish
6933 Finish
6948 Finish
6931 Finish
6947 Finish
6932 Finish
6949 Finish
6930 Finish
6929 Finish
6950 Finish
6928 Finish
6927 Finish
6951 Finish


In [None]:
# Transpi

In [4]:
# Letters treated as ambiguous / non-canonical amino-acid codes.
NON_CODE = "B|Z|J|U|O|X"

# Load the TransDecoder peptide file, then drop every sequence that contains
# at least one of the codes above.
df_tranpi = get_df(
    fastas="/mnt/vdb/DECockroach/transpi/SRR1552488.combined.okay.fa.transdecoder.pep"
)
has_non_code = df_tranpi["Sequence"].str.contains(NON_CODE, regex=True)
df_tranpi = df_tranpi[~has_non_code]
df_tranpi

Unnamed: 0,ID,Sequence,length
0,SOAP.k25.C372231.p1,VYYRRDGKGDKEYWTCQKKPECKATAITIRTGDTVTILKESDHWHA...,102
1,SOAP.k25.C373809.p1,KTRLTVVGTKVINEKNNVKLKGVSKVVSLHVYRLAPDTTIEELTEY...,104
2,SOAP.k25.C379695.p1,MLRDYREIGNLVLCFDTPFTVDFKVIQDAALQKELIEFRCDRRLRE...,95
3,SOAP.k25.C382451.p1,PSPCGANAVCREQNGAGSCTCLPDYVGNPYEGCRPECVLNTDCPSN...,116
4,SOAP.k25.C383053.p1,FKMLTMPRRDICQIETLNLADPLMFLVRNRVCTSTMFHLLPFSYTS...,104
...,...,...,...
17554,Velvet.k37.NODE_9552_length_1035_cov_12.333333.p1,EPKLVNEVNLTFHEKDGEEFMALDKNLKVTTTVKRVYMHLTNLFNG...,102
17555,Velvet.k37.NODE_9713_length_2207_cov_17.057997.p1,MIRRWWKLFMFIMAMLLDVREAFYVPGVAPVEFRKGARIDVKAVKM...,627
17556,Velvet.k37.NODE_9748_length_894_cov_40.512302.p1,MLNFSHHVSKTIRKKKSSKITGRFSRYKKMRTCSSLNEIYIVYIYI...,124
17557,Velvet.k37.NODE_9800_length_726_cov_45.530304.p1,MASFEQAIQQNVMQVAKKVEEHLDAELEKLEKLDSDDLDKLREKRL...,221


In [5]:
# Encode the filtered Transpi sequences in parallel.
# NOTE(review): convert_amp writes to /mnt/vdb/DECockroach/pws/cd100/ unless
# told otherwise, so these chunks land in the same directory as the CD100
# ones - confirm that is intended.
parallelize_dataframe(df=df_tranpi, func=convert_amp)

6631 Start
6632 Start6633 Start6634 Start6635 Start

6636 Start

6638 Start6639 Start6640 Start6637 Start




6646 Start
6647 Start6645 Start6648 Start
6649 Start6651 Start
6642 Start6650 Start


6643 Start
6641 Start6654 Start6653 Start



6644 Start

6652 Start
6639 Finish
6637 Finish
6638 Finish
6642 Finish
6654 Finish
6652 Finish
6635 Finish
6653 Finish
6636 Finish
6643 Finish
6651 Finish
6650 Finish
6634 Finish
6644 Finish
6649 Finish
6631 Finish
6646 Finish
6645 Finish
6648 Finish
6633 Finish
6647 Finish
6641 Finish
6632 Finish
6640 Finish
