In [141]:
import glob
import os
import random
import re
import statistics

import Levenshtein
import numpy as np
import pandas as pd
from Bio import Align
from Bio import SeqIO
from scipy.stats import zscore
from tqdm import tqdm
# Shared pairwise aligner used by get_alignment(): global mode, +2 per match,
# -1 per mismatch. No gap scores are passed on this line, so whatever the
# library defaults are will apply unless they are changed elsewhere.
aligner = Align.PairwiseAligner(mode='global', match_score=2, mismatch_score=-1)

In [166]:
def get_truth_set(truth_file):
    """Load the ground-truth node pairs from a CSV.

    Reads `truth_file` with pandas and pairs up its `Node1` and `Node2`
    columns row by row. Each pair is stored as a frozenset, so the order of
    the two nodes does not matter and a row whose two nodes are equal
    collapses to a one-element frozenset.

    Returns a set of those frozensets (duplicate rows collapse).
    """
    table = pd.read_csv(truth_file)
    pairs = zip(table['Node1'], table['Node2'])
    return {frozenset((left, right)) for left, right in pairs}

def get_kmers(seq_str, k):
    """Split a sequence into its overlapping (k-1)-mer node pairs.

    For every window of length `k` in `seq_str`, the window's prefix and
    suffix of length `k - 1` are returned together as a frozenset, in
    left-to-right window order. A window whose prefix equals its suffix
    yields a one-element frozenset.

    Parameters
    ----------
    seq_str : str
        Sequence to decompose.
    k : int
        Window length; each node has length `k - 1`.

    Returns
    -------
    list of frozenset
        One entry per full window. Empty when `seq_str` is shorter than `k`
        or when `k < 2` (nodes would be empty).
    """
    node_len = k - 1
    # With k < 2 there is no non-empty node to pair, so nothing is produced.
    if node_len < 1:
        return []
    # Only start positions that leave room for a full k-length window are
    # visited, so both slices are guaranteed to have exactly node_len
    # characters and no per-iteration length check is needed.
    return [
        frozenset({seq_str[start:start + node_len], seq_str[start + 1:start + k]})
        for start in range(len(seq_str) - k + 1)
    ]

def count_matches(seq_str, k, truth_set):
    """Count how many node pairs of a sequence appear in the truth set.

    Parameters
    ----------
    seq_str : str
        Sequence to evaluate; it is decomposed with get_kmers(seq_str, k).
    k : int
        Window length forwarded to get_kmers().
    truth_set : collection of frozenset
        Known-true node pairs, normally the set built by get_truth_set().

    Returns
    -------
    tuple of (int, int)
        `(tp, fp)`: the number of pairs found in `truth_set` and the number
        not found. Both are 0 when the sequence yields no pairs.
    """
    # Membership used to be tested against list(truth_set), which rebuilt a
    # list for every pair and made each lookup linear. A set is queried
    # directly (hash lookup); any other iterable is materialised once so its
    # membership semantics stay those of a list.
    if not isinstance(truth_set, (set, frozenset)):
        truth_set = list(truth_set)

    pairs = get_kmers(seq_str, k)
    tp = sum(1 for pair in pairs if pair in truth_set)
    fp = len(pairs) - tp
    return (tp, fp)

def get_alignment(true_seq, cons_seq):
    """Align `cons_seq` against `true_seq` with the module-level `aligner`.

    Returns the first alignment in the result of
    `aligner.align(true_seq, cons_seq)`.
    """
    return aligner.align(true_seq, cons_seq)[0]

def random_seq(len):
    """Build a random string over the alphabet 'ACGT'.

    `len` is the requested length; it is passed through int(), so a float
    such as a median is truncated. Characters are drawn one at a time with
    `random.choice`, so the result depends on the state of the global
    `random` generator.
    """
    n_bases = int(len)
    bases = [random.choice('ACGT') for _ in range(n_bases)]
    return ''.join(bases)

def parse_cons_seqs(file, exp, kmer_size, data_dir='../oligo/data'):
    """Score the consensus sequences of one reconstruction CSV against an oligo's truth data.

    Parameters
    ----------
    file : str
        Path to a consensus CSV. It must contain the columns `consensus_seq`
        and `seed_kmer`.
    exp : int or str
        Oligo identifier; interpolated into the truth file names and written
        to the `exp` column as `'oligo {exp}'`.
    kmer_size : int
        k forwarded to count_matches() and interpolated into the truth
        k-mer file name; also written to the `k` column.
    data_dir : str, optional
        Directory containing `bait_kmers/` and `fastas/`. The default is the
        location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The rows of `file` with these columns added, in this order:
        `consensus_seq_len`, `true_pos_kmers`, `false_pos_kmers`, `tpr`,
        `alignment`, `levenshtein_ratio`, `random_levenshtein_ratio`, `k`,
        `exp`.

    Notes
    -----
    `random_levenshtein_ratio` draws from the global `random` generator via
    random_seq(); nothing in this function seeds it, so the column changes
    between runs unless the caller seeds `random` beforehand.
    """
    truth_file = f'{data_dir}/bait_kmers/oligo_{exp}_comp_{kmer_size}mers.csv'
    truth_seq_file = f'{data_dir}/fastas/oligo_{exp}_comp.fasta'

    df = pd.read_csv(file)

    # Rows without a consensus fall back to their seed k-mer. Bracket
    # assignment is used because attribute-style assignment only updates a
    # column when that column already exists.
    df['consensus_seq'] = np.where(
        df['consensus_seq'].isnull(), df['seed_kmer'], df['consensus_seq']
    )

    truth_set = get_truth_set(truth_file)
    # Only the first record of the FASTA file is used as the reference.
    true_seq = list(SeqIO.parse(truth_seq_file, "fasta"))[0].seq
    # NOTE(review): kept as in the original; presumably this labels the
    # reference row when the alignment is rendered as text — confirm before
    # removing.
    true_seq.id = f'oligo {exp}'
    # The string form of the reference is loop-invariant; convert it once.
    true_seq_str = str(true_seq)

    df['consensus_seq_len'] = [len(seq) for seq in df['consensus_seq']]
    cons_strs = [str(seq) for seq in df['consensus_seq']]

    # One explicit list per column instead of assigning a list of tuples to
    # two new columns at once.
    counts = [count_matches(seq, kmer_size, truth_set) for seq in cons_strs]
    df['true_pos_kmers'] = [tp for tp, _ in counts]
    df['false_pos_kmers'] = [fp for _, fp in counts]
    # tp / (tp + fp); this is NaN for rows where both counts are 0.
    df['tpr'] = df['true_pos_kmers']/(df['true_pos_kmers']+df['false_pos_kmers'])

    df['alignment'] = [get_alignment(true_seq, seq) for seq in cons_strs]
    df['levenshtein_ratio'] = [Levenshtein.ratio(true_seq_str, seq) for seq in cons_strs]
    # Baseline: the same ratio against a random ACGT string of equal length.
    df['random_levenshtein_ratio'] = [
        Levenshtein.ratio(true_seq_str, random_seq(n)) for n in df['consensus_seq_len']
    ]
    df['k'] = kmer_size
    df['exp'] = f'oligo {exp}'
    return(df)

In [167]:
# Score every (oligo, k, filter condition) reconstruction and write one CSV per combination.
oligos = [1,2,3]
kmers = [10]
conditions = ['fdr1e-10', 'fdr1e-10_3log2fc', 'fdr1e-10_sd', 'fdr1e-10_3log2fc_sd']
outdir = '../oligo/results/reconstructions'
for i in oligos:
    for k in kmers:
        for c in conditions:
            # The input name now follows k instead of a hard-coded "10mer",
            # so the file that is read always matches the k used for
            # scoring and for the output file name.
            cons_file = f'../oligo/results/reconstructions/oligo_{i}_{k}mer_graph_{c}_consensus.csv'
            df_out = parse_cons_seqs(file=cons_file, exp=i, kmer_size=k)
            outfile_name = f'oligo_{i}_{k}mer_top100_consensus_alignment_{c}.csv'
            df_out.to_csv(f'{outdir}/{outfile_name}', index=False)

In [134]:
# Display the most recently computed result frame. After the processing loop
# has run, df_out holds the frame from that loop's final iteration only.
df_out

Unnamed: 0,seed_kmer,rank,score,consensus_seq,consensus_seq_len,true_pos_kmers,false_pos_kmers,tpr,alignment,levenshtein_ratio,k,exp
0,AACAAAAAA,1,11.505496,AACAAAAAA,9,0,0,,"(AGAACTTACATCAACTAAACAACAAATGAACAAAAAAAAAA, A-...",0.360000,10,oligo 3
1,AACAAAAAA,2,11.447696,AACAAAAAA,9,0,0,,"(AGAACTTACATCAACTAAACAACAAATGAACAAAAAAAAAA, A-...",0.360000,10,oligo 3
2,AACAAAAAA,3,11.425258,AACAAAAAA,9,0,0,,"(AGAACTTACATCAACTAAACAACAAATGAACAAAAAAAAAA, A-...",0.360000,10,oligo 3
3,AGAGAGAGG,4,11.215166,AGAGAGAGG,9,0,0,,"(AGA-ACTTACATCAACTAAACAACAAATGAACAAAAAAAAAA--,...",0.240000,10,oligo 3
4,AGAGAGAGG,5,10.935673,AGAGAGAGG,9,0,0,,"(AGA-ACTTACATCAACTAAACAACAAATGAACAAAAAAAAAA--,...",0.240000,10,oligo 3
...,...,...,...,...,...,...,...,...,...,...,...,...
95,TGATGTAAT,96,3.777800,AGTTGATGTAAGGATGTAAGTGC,23,0,14,0.0,(AGAACTT-ACATCAAC-TAA--ACAACAAATG-AA---CAAAAAA...,0.468750,10,oligo 3
96,TGTAAGTTT,97,3.731308,AGTTGATGTAAGTTC,15,0,6,0.0,(AGAACTT-ACATCAAC-TAAACAACAAATGAA--CAAAAAAAAAA...,0.392857,10,oligo 3
97,AGGTGATGA,98,3.728734,AGGTGATGA,9,0,0,,"(AGAAC-TT-ACATCAACTAAACAACAAATGAACAAAAAAAAAA, ...",0.280000,10,oligo 3
98,GATGGAAGT,99,3.678269,GATGGAAGTATGGAAGTT,18,0,9,0.0,(AGAACTT--ACA-TCAACTAAACAACAAATG-AACAAAAAAAAAA...,0.372881,10,oligo 3


In [152]:
# Median consensus length in the current df_out. `random` and `statistics`
# are imported in the notebook's top import cell rather than here.
seq_lens = df_out['consensus_seq_len'].to_list()
median = statistics.median(seq_lens)
median

48.0

In [140]:
# Example random ACGT sequence with the median consensus length. This calls
# the shared random_seq() helper, which truncates `median` with int(), rather
# than repeating the helper's expression inline.
random_sequence = random_seq(median)
random_sequence

'GTCGCCATGACAGGGGTCATCTCTATGTTCCCAGCTTC'