## development for multi-consensus seq calculation

In [2]:
import pandas as pd

In [50]:
""" recursive function for growing kmer sequence in N-terminal/5' direction """
def grow_seq_n(seed_kmer, input_df, nterm_kmers=None, length=None, cons_seq_n=None):
    
    # if a list has not been passed as argument create an empty one
    if(nterm_kmers == None):
        nterm_kmers = []
    
    # if an length has not been passed as argument, initialize one from 0
    if (length == None):
        length = 0
            
    # choose next kmer in the N-terminal direction
    if input_df['Node2'].str.contains(seed_kmer).any()\
        and seed_kmer not in nterm_kmers:  # greedy path only, no cycles
        
        length += 1
        nterm_kmers.append(seed_kmer)
        
        poss_paths_nterm = input_df[input_df['Node2'] == seed_kmer]
        #print("possible future directions =",poss_paths_nterm)

        # find edge with highest counts 
        most_common_kmer_n = poss_paths_nterm['Weight'] == poss_paths_nterm['Weight'].max()
        
        # if more than one most common edge, randomly pick one
        next_n_edge_df = poss_paths_nterm[most_common_kmer_n].sample(1)
        next_n_kmer = str(next_n_edge_df.iloc[0]['Node1'])
        
        # recursively add kmers to the consensus sequence
        grow_seq_n(next_n_kmer, input_df, nterm_kmers, length)
        
    # after the kmer finding terminates
    else:
        #nterm_kmers.append(seed_kmer) #for locating where the first repeat is found

        print(f"Found beginning of sequence; total downstream k-mers traveled = {len(nterm_kmers)}")
        return nterm_kmers        

    # calculate the consensus sequence from the most common kmer list
    cons_seq_n = nterm_kmers[0]
    for kmer in nterm_kmers[1:]:
        cons_seq_n = kmer[0]+cons_seq_n       

    # exit function and return consensus n-terminal kmers
    return cons_seq_n

""" function for recursively growing kmer sequence in C-terminal/3' direction """

def grow_seq_c(seed_kmer, input_df, cterm_kmers=None, length=None):

    # if a list has not been passed as argument create an empty one
    if(cterm_kmers == None):
        cterm_kmers = []

    # if an length has not been passed as argument, initialize one from 0
    if (length == None):
        length = 0

    # choose next kmer in the N-terminal direction
    if input_df['Node1'].str.contains(seed_kmer).any()\
        and seed_kmer not in cterm_kmers:  # greedy path only, no cycles

        length += 1
        cterm_kmers.append(seed_kmer)
        poss_paths_cterm = input_df[input_df['Node1'] == seed_kmer]

        # find edge with highest counts
        most_common_kmer_c = poss_paths_cterm['Weight'] == poss_paths_cterm['Weight'].max()

        # if more than one most common edge, randomly pick one
        next_c_edge_df = poss_paths_cterm[most_common_kmer_c].sample(1)
        next_c_kmer = str(next_c_edge_df.iloc[0]['Node2'])

        # recursively add kmers to the consensus sequence
        grow_seq_c(next_c_kmer, input_df, cterm_kmers, length)

    # after the kmer finding terminate
    else:
        #cterm_kmers.append(seed_kmer) #for locating where the first repeat is found                                     
        print(f"Found end of sequence; total upstream k-mers traveled = {len(cterm_kmers)}")
        return cterm_kmers
    
    # calculate the consensus sequence from the most common kmer list
    cons_seq_c = cterm_kmers[0]
    for kmer in cterm_kmers[1:]:
        cons_seq_c = cons_seq_c + kmer[-1]
    
    # exit function and return consensus n-terminal kmers
    return cons_seq_c

In [None]:
""" WRAPPER """
def grow_cons_seq(seed_kmer_1, seed_kmer_2, input_df):
    # grow sequence recursively in both directions
    nterm_seq = grow_seq_n(seed_kmer_1, input_df)
    print(f"Upstream sequence [length={len(nterm_seq)}]:\n{nterm_seq}")
    cterm_seq = grow_seq_c(seed_kmer_2, input_df)
    print(f"Downstream sequence [length={len(cterm_seq)}]:\n{cterm_seq}")
    # format output in the event connecting edges are not found
    try:
        consensus_seq = nterm_seq+cterm_seq
        if len(nterm_seq)==0 and len(cterm_seq)==0:
            print('WARNING: No connecting paths found.')
            consensus_seq = None
    except:
        if len(cterm_seq) < len(seed_kmer_1) and len(nterm_seq) > len(seed_kmer_1):
            consensus_seq = nterm_seq+seed_kmer_2[-1]
        elif len(nterm_seq) < len(seed_kmer_1) and len(cterm_seq) > len(seed_kmer_1):
            consensus_seq = seed_kmer_1[0]+cterm_seq
    if consensus_seq:
        print(f"Consensus [length={len(consensus_seq)}]: {consensus_seq}")
    return consensus_seq

In [20]:
file = '../oligo/results/thresholded/oligo_1_10mer_graph_fdr1e-10_3w.csv'
df = pd.read_csv(file)
cols=['Node1','Node2','Weight']
df.columns = cols
df.sort_values('Weight', ascending=False, inplace=True)
df.reset_index(inplace=True, drop=True)
input_df = df.copy()

In [11]:
ranked_edges = df.nlargest(10, 'Weight')
ranked_edges

Unnamed: 0,Node1,Node2,Weight
0,ACAAAAAAA,CAAAAAAAA,10.978532
1,CAAAAAAAA,AAAAAAAAA,10.577317
2,GTTTTTTTT,TTTTTTTTT,9.499601
3,TGTTTTTTT,GTTTTTTTT,8.540747
4,GGACAAAAA,GACAAAAAA,7.741307
5,CTGTTTTTT,TGTTTTTTT,7.722771
6,ATGAGGTAG,TGAGGTAGG,7.253238
7,CAGTGGGGG,AGTGGGGGT,7.004341
8,GTGAGGTAG,TGAGGTAGG,6.9926
9,TGGGAGGTG,GGGAGGTGG,6.988173


In [51]:
rows = []
records = []
print()
for idx, row in ranked_edges.iterrows():
    rank = idx+1
    score = row['Weight']
    seed_kmer_1 = row['Node1']
    seed_kmer_2 = row['Node2']
    print(f'{seed_kmer_1} {seed_kmer_2}\trank:{rank}\tscore:{score}')
    seed_kmer_seq = seed_kmer_1[:-1]+seed_kmer_2[-1]
    header = f'[seed={seed_kmer_seq} | rank={rank} | score={score}]'
    cons_seq = grow_cons_seq(seed_kmer_1, seed_kmer_2, input_df)
    record = SeqRecord(Seq(cons_seq), id=header)
    records.append(record)
    rows.append([seed_kmer_seq, rank, score, cons_seq])
    print()
df_out = pd.DataFrame(rows, columns=['seed_kmer','rank','score','consensus_seq'])
print(f'Consensus seqs from top {len(ranked_edges)} kmers:')
df_out
#return df_out


ACAAAAAAA CAAAAAAAA	rank:1	score:10.978532268446
Found beginning of sequence; total downstream k-mers traveled = 2
Upstream sequence [length=10]:
GACAAAAAAA
Found end of sequence; total upstream k-mers traveled = 1
Downstream sequence [length=9]:
CAAAAAAAA
Consensus [length=19]: GACAAAAAAACAAAAAAAA

CAAAAAAAA AAAAAAAAA	rank:2	score:10.5773171028366
Found beginning of sequence; total downstream k-mers traveled = 3
Upstream sequence [length=11]:
GACAAAAAAAA
Found end of sequence; total upstream k-mers traveled = 0
Downstream sequence [length=0]:
[]
Consensus [length=12]: GACAAAAAAAAA

GTTTTTTTT TTTTTTTTT	rank:3	score:9.49960115256854
Found beginning of sequence; total downstream k-mers traveled = 4
Upstream sequence [length=12]:
CCTGTTTTTTTT
Found end of sequence; total upstream k-mers traveled = 0
Downstream sequence [length=0]:
[]
Consensus [length=13]: CCTGTTTTTTTTT

TGTTTTTTT GTTTTTTTT	rank:4	score:8.54074713533857
Found beginning of sequence; total downstream k-mers traveled = 3
Up

Unnamed: 0,seed_kmer,rank,score,consensus_seq
0,ACAAAAAAA,1,10.978532,GACAAAAAAACAAAAAAAA
1,CAAAAAAAA,2,10.577317,GACAAAAAAAAA
2,GTTTTTTTT,3,9.499601,CCTGTTTTTTTTT
3,TGTTTTTTT,4,8.540747,CCTGTTTTTTTGTTTTTTTT
4,GGACAAAAA,5,7.741307,GGACAAAAAAAA
5,CTGTTTTTT,6,7.722771,CCTGTTTTTTTGTTTTTTTT
6,ATGAGGTAG,7,7.253238,TATGAGGTAGTGAGGTAGGGCGA
7,CAGTGGGGT,8,7.004341,
8,GTGAGGTAG,9,6.9926,AATGTGAGGTAGTGAGGTAGGGCGA
9,TGGGAGGTG,10,6.988173,AATGGGAGGTGG


In [52]:
infile_base = file.replace('.csv','')
infile_base

'../oligo/results/thresholded/oligo_1_10mer_graph_fdr1e-10_3w'

In [54]:
import os
path, filename = os.path.split(os.path.realpath(file))
infile_base = filename.replace('.csv','')
infile_base

'oligo_1_10mer_graph_fdr1e-10_3w'

In [63]:
header = f'consensus sequence [{infile_base}] [seed={seed_kmer_1[:-1]+seed_kmer_2[-1]}]'

In [64]:
record = SeqRecord(Seq(cons_seq), id=header)

In [65]:
record

SeqRecord(seq=Seq('AATGGGAGGTGG'), id='oligo_1_10mer_graph_fdr1e-10_3w [seed=TGGGAGGTG]', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [None]:
def write_fasta(seq, header, writefile):
    record = SeqRecord(Seq(seq), id=header)
    with open(writefile, "w") as f:
        SeqIO.write(record, f, "fasta")
    print(f"FASTA record has been written to {writefile}")

# check that graph re-direction is working as intended

In [10]:
# when nodes are in the correct order
n1 = 'ATGCA'
n2 = 'TGCAG'
print(n1[1:], n2[:-1])
print(n1[1:] == n2[:-1])

TGCA TGCA
True


In [13]:
# when nodes are reversed
n1 = 'TGCAG'
n2 = 'ATGCA'
print(n1[1:], n2[:-1])
print(n1[1:] == n2[:-1])

GCAG ATGC
False


In [14]:
# when nodes are in the correct order
n1 = 'ATGCA'
n2 = 'TGCAG'
print(n1[:-1], n2[1:])
print(n1[:-1] == n2[1:])

ATGC GCAG
False


In [15]:
# when nodes are reversed
n1 = 'TGCAG'
n2 = 'ATGCA'
print(n1[:-1], n2[1:])
print(n1[:-1] == n2[1:])

TGCA TGCA
True


#### oh shit.. this doesn't work for highly repetitive sequences

#### solution:

In [41]:
import pandas as pd
from itertools import filterfalse
import numpy as np

In [79]:
def unique_everseen(iterable, key=None):
    "List unique elements, preserving order. Remember all elements ever seen."
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in filterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element

def map_kmer(kmer, dct):
    if kmer in dct.keys():
        return(dct[kmer])
    return(np.nan)

In [89]:
df = pd.read_csv('../oligo/debug/kmer_direction_test.csv')
df.columns = map(str.lower, df.columns)
df['tuple'] = df[['node1','node2']].apply(tuple, axis=1)
print('before:')
print(df)
kmer_pairs = df.tuple.to_list()
count_dct = {n:kmer_pairs.count(n) for n in unique_everseen(kmer_pairs)}
id_df = df.groupby('tuple').agg(lambda x: ', '.join(set(x))).drop(columns=['node1','node2'])
id_dct = id_df.to_dict()['proteinid']
#id_dct = id_dct['ProteinID']
df = df.drop('proteinid',axis=1).drop_duplicates()
df['count'] = [map_kmer(kmer, count_dct) for kmer in df.tuple]
df['ids'] = [map_kmer(kmer, id_dct) for kmer in df.tuple]
df = df.drop('tuple', axis=1)
print('after:')
print(df)

before:
        node1      node2 proteinid                   tuple
0   AGAGAGAGA  GAGAGAGAG         A  (AGAGAGAGA, GAGAGAGAG)
1   AGAGAGAGA  GAGAGAGAG         B  (AGAGAGAGA, GAGAGAGAG)
2   AGAGAGAGA  GAGAGAGAG         A  (AGAGAGAGA, GAGAGAGAG)
3   AGAGAGAGA  GAGAGAGAG         C  (AGAGAGAGA, GAGAGAGAG)
4   AGAGAGAGA  GAGAGAGAG         F  (AGAGAGAGA, GAGAGAGAG)
5   ATATATATA  TATATATAT         R  (ATATATATA, TATATATAT)
6   ATATATATA  TATATATAT         A  (ATATATATA, TATATATAT)
7   ATATATATA  TATATATAT         R  (ATATATATA, TATATATAT)
8   GCGCGCGCG  CGCGCGCGC         A  (GCGCGCGCG, CGCGCGCGC)
9   TGTGAGAGG  GTGAGAGGG         G  (TGTGAGAGG, GTGAGAGGG)
10  GTGTGGTGG  TGTGGTGGG         C  (GTGTGGTGG, TGTGGTGGG)
11  GAGAGAGAG  AGAGAGAGA         O  (GAGAGAGAG, AGAGAGAGA)
12  GAGAGAGAG  AGAGAGAGA         X  (GAGAGAGAG, AGAGAGAGA)
13  TTGGAAGAT  TGGAAGATG         D  (TTGGAAGAT, TGGAAGATG)
14  ATTGCGACT  TTGCGACTC         F  (ATTGCGACT, TTGCGACTC)
15  AAGGTAGGT  AGGTAGGTA         V  (AAGGTAGGT, 

In [78]:

id_dict

{('AAGGTAGGT', 'AGGTAGGTA'): 'V',
 ('AGAGAGAGA', 'GAGAGAGAG'): 'C, B, F, A',
 ('ATATATATA', 'TATATATAT'): 'R, A',
 ('ATTGCGACT', 'TTGCGACTC'): 'F',
 ('CTGGGTAGG', 'TGGGTAGGG'): 'F',
 ('GAGAGAGAG', 'AGAGAGAGA'): 'X, O',
 ('GCGCGCGCG', 'CGCGCGCGC'): 'A',
 ('GTGTGGTGG', 'TGTGGTGGG'): 'C',
 ('TAGGTGAGG', 'AGGTGAGGG'): 'Q',
 ('TGTGAGAGG', 'GTGAGAGGG'): 'G',
 ('TTGGAAGAT', 'TGGAAGATG'): 'D'}

In [72]:
id_dict['ProteinID'][('AAGGTAGGT', 'AGGTAGGTA')]

'V'

KeyError: "['ProteinID'] not found in axis"

In [44]:
count_dct

{('AGAGAGAGA', 'GAGAGAGAG'): 5,
 ('ATATATATA', 'TATATATAT'): 3,
 ('GCGCGCGCG', 'CGCGCGCGC'): 1,
 ('TGTGAGAGG', 'GTGAGAGGG'): 1,
 ('GTGTGGTGG', 'TGTGGTGGG'): 1,
 ('GAGAGAGAG', 'AGAGAGAGA'): 2,
 ('TTGGAAGAT', 'TGGAAGATG'): 1,
 ('ATTGCGACT', 'TTGCGACTC'): 1,
 ('AAGGTAGGT', 'AGGTAGGTA'): 1,
 ('TAGGTGAGG', 'AGGTGAGGG'): 1,
 ('CTGGGTAGG', 'TGGGTAGGG'): 1}

In [46]:
test['count'] = 
test

Unnamed: 0,node1,node2,tuple,count
0,AGAGAGAGA,GAGAGAGAG,"(AGAGAGAGA, GAGAGAGAG)",5
1,AGAGAGAGA,GAGAGAGAG,"(AGAGAGAGA, GAGAGAGAG)",5
2,AGAGAGAGA,GAGAGAGAG,"(AGAGAGAGA, GAGAGAGAG)",5
3,AGAGAGAGA,GAGAGAGAG,"(AGAGAGAGA, GAGAGAGAG)",5
4,AGAGAGAGA,GAGAGAGAG,"(AGAGAGAGA, GAGAGAGAG)",5
5,ATATATATA,TATATATAT,"(ATATATATA, TATATATAT)",3
6,ATATATATA,TATATATAT,"(ATATATATA, TATATATAT)",3
7,ATATATATA,TATATATAT,"(ATATATATA, TATATATAT)",3
8,GCGCGCGCG,CGCGCGCGC,"(GCGCGCGCG, CGCGCGCGC)",1
9,TGTGAGAGG,GTGAGAGGG,"(TGTGAGAGG, GTGAGAGGG)",1


# guaymas checks

In [6]:
import os
import pandas as pd
from Bio import SeqIO
import statistics

In [4]:
def print_stats(records):
    sizes = [len(rec) for rec in records]
    print("Mean read length:", statistics.mean(sizes))
    print("Median:", statistics.median(sizes))
    print("Mode:", statistics.mode(sizes))
    print("Max:", max(sizes))
    print("Min:", min(sizes))

In [2]:
fa_ny = '../guaymas/data/NylonHits_Guaymas2020_ALLBINS.fasta'
fa_pet = '../guaymas/data/PETHits_Guaymas2020_ALLBINS.fasta'

In [7]:
recs_ny = list(SeqIO.parse(fa_ny, "fasta"))
recs_pet = list(SeqIO.parse(fa_pet, "fasta"))

In [8]:
print("Total entries in Nylon hits:", len(recs_ny))
print_stats(recs_ny)

Total entries in Nylon hits: 661
Mean read length: 448.8290468986384
Median: 405
Mode: 367
Max: 1032
Min: 115


In [9]:
print("Total entries in PET hits:", len(recs_pet))
print_stats(recs_pet)

Total entries in PET hits: 503
Mean read length: 408.04771371769385
Median: 365
Mode: 253
Max: 1153
Min: 141
