In [1]:
import pandas as pd
from Bio import SeqIO

In [2]:
def get_seq_lens(fasta_file):
    """Map every FASTA record ID to the length of its sequence.

    Records are consumed one at a time straight from ``SeqIO.parse`` instead
    of being collected into a list first, so only the ID -> length mapping
    is kept in memory.

    Parameters
    ----------
    fasta_file
        Path or handle passed unchanged to ``SeqIO.parse(..., "fasta")``.

    Returns
    -------
    dict
        ``{record.id: len(record.seq)}``. When an ID occurs more than once,
        the length of its first occurrence is kept.

    Notes
    -----
    Every repeated ID is collected and reported on stdout as
    ``Problem children: [...]``; the line is printed even when the list
    is empty.
    """
    len_dct = {}
    probs = []
    for rec in SeqIO.parse(fasta_file, "fasta"):
        if rec.id in len_dct:
            # Duplicate ID: keep the first length, remember the offender.
            probs.append(rec.id)
        else:
            len_dct[rec.id] = len(rec.seq)
    print(f'Problem children: {probs}')
    return len_dct

def sum_df(file):
    """Sum the ``count`` column per protein ID.

    The CSV must provide a ``proteinid`` column, holding one or more IDs
    separated by ``', '``, and a ``count`` column. Each row's ``count`` is
    credited to every ID listed on that row and the credits are summed
    per ID.

    Parameters
    ----------
    file
        Path or file-like object passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``protein_id`` and ``edge_sum``, sorted by ``edge_sum`` in
        descending order. The index labels are each ID's position in order
        of first appearance in the input (they are not reset after sorting).

    Notes
    -----
    Rows whose ``proteinid`` is missing list no IDs and are skipped.
    """
    df = pd.read_csv(file)

    # One row per (protein ID, count) pair: split the ID list, then explode.
    per_protein = (
        df[['proteinid', 'count']]
        .dropna(subset=['proteinid'])
        .assign(proteinid=lambda tbl: tbl['proteinid'].str.split(', '))
        .explode('proteinid')
    )

    # sort=False keeps groups in first-appearance order, so the index labels
    # produced by reset_index() reflect where each ID was first seen.
    df_sum = (
        per_protein
        .groupby('proteinid', sort=False)['count']
        .sum()
        .rename('edge_sum')
        .rename_axis('protein_id')
        .reset_index()
        .sort_values('edge_sum', ascending=False)
    )
    return df_sum

def match_attr(key, dct):
    """Return the value stored under ``key`` in ``dct``.

    Raises ``KeyError`` when ``key`` is not present.
    """
    return dct[key]

In [3]:
# Inputs for the g0 run: the FASTA read by get_seq_lens and the CSV read by sum_df.
fasta_file = '../guaymas/data/phylo_stratified/deBruijn_g0.fasta'
infile = '../guaymas/results/phylo_stratified/nylonase_g0_7mers_unique_ids.csv'

# Destination path for the g0 edge-count table.
outfile = '../guaymas/results/phylo_stratified/nylonase_g0_7mers_ids_edge_counts.csv'

In [4]:
# Score the g0 table: attach each protein's sequence length, then divide the
# summed count by that length.
len_dct = get_seq_lens(fasta_file)
df = sum_df(infile).assign(
    seq_len=lambda tbl: [match_attr(pid, len_dct) for pid in tbl['protein_id']],
    norm_edge_score=lambda tbl: tbl['edge_sum'] / tbl['seq_len'],
)
# Display only; `df` itself keeps the order returned by sum_df.
df.sort_values('norm_edge_score', ascending=False)

Problem children: []


Unnamed: 0,protein_id,edge_sum,seq_len,norm_edge_score
13,D4991_C11_H1_Bin_168_scaffold_47485_2,515,435,1.183908
22,D4998_C2223_H2_Bin_243_scaffold_398291_3,307,264,1.162879
8,D4998_C2223_H2_Bin_268_scaffold_277080_1,488,420,1.161905
19,D4998_C2223_H2_Bin_55_scaffold_26178_20,500,433,1.154734
21,D4991_C11_H1_Bin_150_scaffold_61797_66,486,421,1.154394
28,D4998_C2223_H2_Bin_393_scaffold_185503_2,473,412,1.148058
29,D4998_C1112_H1_Bin_326_scaffold_125082_3,472,412,1.145631
1,D4998_C2223_H2_Bin_55_scaffold_442237_2,496,433,1.145497
3,D4994_C39_H1_Bin_779_scaffold_91723_2,434,379,1.145119
12,D4994_C39_H1_Bin_393_scaffold_437666_6,190,166,1.144578


In [5]:
# Repeat the scoring for the files numbered g0 through g4 and write one
# sorted table per group to its `outfile`.
for i in range(5):
    fasta_file = f'../guaymas/data/phylo_stratified/deBruijn_g{i}.fasta'
    infile = f'../guaymas/results/phylo_stratified/nylonase_g{i}_7mers_unique_ids.csv'
    outfile = f'../guaymas/results/phylo_stratified/nylonase_g{i}_7mers_ids_edge_counts.csv'

    len_dct = get_seq_lens(fasta_file)
    df = (
        sum_df(infile)
        .assign(
            # `pid` keeps the comprehension variable distinct from the loop index `i`.
            seq_len=lambda tbl: [match_attr(pid, len_dct) for pid in tbl['protein_id']],
            norm_edge_score=lambda tbl: tbl['edge_sum'] / tbl['seq_len'],
        )
        .sort_values('norm_edge_score', ascending=False)
    )
    df.to_csv(outfile, index=False)

Problem children: []
Problem children: []
Problem children: []
Problem children: []
Problem children: []
