In [1]:
import glob
from pathlib import Path

import egglib
import pandas as pd

In [2]:
# Symbol set used when parsing the alignments: the four nucleotides plus
# '-' and 'N' in the first symbol list, an empty second list, matched
# case-insensitively.
# NOTE(review): per the EggLib Alphabet API the first list is presumably the
# exploitable alleles and the second the missing-data symbols, which would make
# gaps and N count as alleles - confirm this is intended.
alphabet = egglib.Alphabet(
    'char',
    ['a', 't', 'c', 'g', '-', 'N'],
    [],
    case_insensitive=True,
)

# Statistics calculator restricted to 'S' (segregating sites).
cs = egglib.stats.ComputeStats()
cs.add_stats('S')

In [3]:
# Calculate the number of segregating sites for all opa alignments.
# Each *.aln file is one opa alignment; the file name without its extension is
# used as the opa name.
ALIGNMENT_GLOB = '../../../data/kit14/subsampled_nanopore_reads/opa_sequences_aligned/*.aln'

# glob gives no ordering guarantee; sort so the row order (and therefore the
# DataFrame index) is identical on every run.
filenames = sorted(glob.glob(ALIGNMENT_GLOB))
if not filenames:
    # Fail loudly: an empty table would otherwise read as "no segregating sites".
    raise FileNotFoundError(f'No alignment files match {ALIGNMENT_GLOB!r}')

opa_names = []
num_different = []
for filename in filenames:
    # Parse with the custom `alphabet` defined earlier in the notebook
    # (the original left egglib.alphabets.DNA commented out as an alternative).
    aln1 = egglib.io.from_fasta(filename, labels=True, alphabet=alphabet)
    stats = cs.process_align(aln1, max_missing=1)

    # Path.stem drops the directory and the '.aln' suffix regardless of the
    # platform's path separator.
    opa_names.append(Path(filename).stem)
    num_different.append(stats['S'])

df = pd.DataFrame({'opa_name': opa_names, 'segregating_sites': num_different})

In [4]:
# Show only the opas whose alignment has at least one segregating site,
# ordered by opa name.
has_variation = df['segregating_sites'] > 0
df.loc[has_variation].sort_values(by='opa_name')

Unnamed: 0,opa_name,segregating_sites
56,DGI_65_opa_7,1
39,EEE029_opa_11,1
18,EEE029a_VI_opa_10,1
9,EEE029a_VI_opa_2,1
58,EEE029a_VI_opa_4,2
19,EEE029b_VI_opa_2,1
40,EEE029b_VI_opa_3,2
41,EEE029b_VI_opa_5,4


* DGI_65_opa_7: 1 SNP in polypolish and pypolca polished assemblies (a>g) at 15% read sampling - same SNP appears with higher coverage
* EEE029_opa_11: 1 deletion in polypolish and pypolca polished assemblies in cgg/ccg repeats (c>-) at 15% read sampling - same change appears with higher coverage
* EEE029a_VI_opa_10: 1 insertion in pypolca polished assembly in cgg/ccg repeats (->c) at 15% read sampling - same change appears with higher coverage
* EEE029a_VI_opa_2: 1 SNP in polypolish and pypolca polished assemblies in cgg/ccg repeats (t>c) at 15% read sampling - same change appears with higher coverage
* EEE029a_VI_opa_4: 1 deletion in medaka and polypolish polished assemblies (c>-) at 15% read sampling - not present at higher coverage; 1 deletion in medaka and polypolish polished assemblies (c>-) at 60% read sampling - not present at other coverage
* EEE029b_VI_opa_2: 1 SNP in polypolish and pypolca polished assemblies (a>c) in cgg/ccg repeats at 15% and 30% coverage - same SNP appears with higher coverage
* EEE029b_VI_opa_3: 1 SNP in medaka polished assembly (g>t) at 15% coverage - same SNP appears with higher coverage; 1 deletion in medaka and polypolish polished assemblies (c>-) at 60% coverage - does not appear in other coverages
* EEE029b_VI_opa_5: SNPs and/or indels (4 total) in flye and medaka assemblies at 15% and 30% coverage in cgg/ccg repeats




Not fixed in autocycler assemblies
- EEE029b_VI_opa_2