In [12]:
import egglib
import glob
import pandas as pd
import os

In [2]:
# Define operator to calculate segregating sites

cs = egglib.stats.ComputeStats()
cs.add_stats('S')
alphabet = egglib.Alphabet('char', ['a', 't', 'c', 'g', '-', 'N'], [], case_insensitive = True)

In [3]:
# Calculate the number of segregating sites for all opa alignments
filenames = glob.glob('../../data/opa_sequences_aligned/*.aln')

opa_names = []
num_different = []
for filename in filenames:
    opa_name = filename.split('/')[-1][:-4]
    aln1 = egglib.io.from_fasta(filename, labels=True, alphabet = alphabet)#egglib.alphabets.DNA)
    struct = egglib.struct_from_labels(aln1, lvl_pop=0, lvl_indiv=1)
    stats = cs.process_align(aln1, max_missing = 1)
    
    opa_names.append(opa_name)
    num_different.append(stats['S'])
    
df = pd.DataFrame({'opa_name':opa_names, 'segregating_sites':num_different})

In [None]:
# These are the opas with non-zero segregating sites
df[df['segregating_sites']>0].sort_values('opa_name')

Unnamed: 0,opa_name,segregating_sites
19,EEE023_opa_4,1
79,FFF007_opa_11,24
118,GCGS0313_opa_1,1
95,GCGS0313_opa_5,1


In [39]:
# Check if there were any assemblies where an opa was missing.
filename = '../../data/opa_locations/UMASS-DGI_65_50x_1.csv'
filenames = glob.glob('../../data/opa_locations/*.csv')

opa_metadata = pd.DataFrame()
for filename in filenames:
    basename = os.path.basename(filename)[:-4]
    basename_split = basename.split('_')
    read_depth = int(basename_split[-2][:-1])
    read_set = int(basename_split[-1])
    strain = '_'.join(basename_split[:-2])
    
    df = pd.read_csv(filename, index_col = 0)
    df['strain'] = strain
    df['depth'] = read_depth
    df['set'] = read_set
    df['id'] = strain + '_opa_' + df['id'].str.split('_opa_', expand = True)[1]
    
    opa_metadata = pd.concat([opa_metadata, df], ignore_index = True)
    
opa_metadata.sort_values(['strain', 'depth', 'set'], inplace = True, ignore_index = True)

In [40]:
opa_metadata

Unnamed: 0,chromosome,strand,start_cr,stop_cr,start_term,stop_term,start,stop,n_terminus,id,in_frame,strain,depth,set
0,1,-1,69624.0,69664.0,68898.0,68916.0,68899.0,69689.0,69610.0,CCC020_opa_1,0.0,CCC020,50,1
1,1,-1,75119.0,75189.0,74396.0,74414.0,74397.0,75214.0,75105.0,CCC020_opa_2,0.0,CCC020,50,1
2,1,1,983486.0,983496.0,984225.0,984243.0,983463.0,984242.0,983510.0,CCC020_opa_3,0.0,CCC020,50,1
3,1,-1,1061455.0,1061525.0,1060705.0,1060723.0,1060706.0,1061548.0,1061441.0,CCC020_opa_4,0.0,CCC020,50,1
4,1,1,1102178.0,1102198.0,1102924.0,1102942.0,1102153.0,1102941.0,1102212.0,CCC020_opa_5,0.0,CCC020,50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4755,1,1,541991.0,542081.0,542792.0,542810.0,541966.0,542809.0,542095.0,UMASS-DGI_65_opa_6,1.0,UMASS-DGI_65,125,10
4756,1,-1,620319.0,620364.0,619575.0,619593.0,619576.0,620390.0,620305.0,UMASS-DGI_65_opa_7,0.0,UMASS-DGI_65,125,10
4757,1,1,1533651.0,1533734.0,1534442.0,1534460.0,1533626.0,1534459.0,1533748.0,UMASS-DGI_65_opa_8,0.0,UMASS-DGI_65,125,10
4758,1,1,1583183.0,1583198.0,1583924.0,1583942.0,1583160.0,1583941.0,1583212.0,UMASS-DGI_65_opa_9,0.0,UMASS-DGI_65,125,10
