In [8]:
import os

import yaml
import pandas as pd

# Root directory that is walked recursively for sample YAML configs.
base_path = '/home/local/work/code/github/project-diploid-assembly/smk_config/samples'

# Individuals that are flagged with 1 in the '2020_SKIP' column.
ignore_samples = {
    'NA19434',
    'HG03721',
    'HG01573',
    'HG02018',
    'NA19036',
    'NA19320',
}

# Collect one flat metadata record per sample description found in the YAML
# files below base_path. Each record is the mapping stored under the
# 'sample_description*' key, extended with three 0/1 flags:
#   HiFi      - a 'pbsq2' long-read readset containing '-ccs' is listed
#   CLR       - a 'pbsq2' long-read readset containing '-clr' is listed
#   2020_SKIP - the individual is a member of ignore_samples
samples = []
for root, _dirs, files in os.walk(base_path):
    # str.endswith accepts a tuple, so both YAML extensions need one call.
    yaml_configs = [f for f in files if f.endswith(('.yml', '.yaml'))]
    for cfg in yaml_configs:
        # Keep the file open only for as long as it takes to parse it.
        with open(os.path.join(root, cfg), 'r', encoding='utf-8') as dump:
            document = yaml.safe_load(dump)
        # safe_load yields None for an empty file and may yield a list or a
        # scalar; only a mapping can carry a 'sample_description*' entry.
        if not isinstance(document, dict):
            continue
        is_sample = [
            k for k in document
            if isinstance(k, str) and k.startswith('sample_description')
        ]
        if not is_sample:
            continue
        # If several keys match, the last one in document order is used.
        metadata = document[is_sample[-1]]
        metadata['HiFi'] = 0
        metadata['CLR'] = 0
        metadata['2020_SKIP'] = 1 if metadata['individual'] in ignore_samples else 0
        # 'data_sources' is only needed to derive the flags, so it is removed
        # from the record while being iterated (KeyError if it is missing).
        for ds in metadata.pop('data_sources'):
            if 'long_reads' not in ds:
                continue
            readset = ds['long_reads']['readset']
            if 'pbsq2' not in readset:
                continue
            if '-ccs' in readset:
                metadata['HiFi'] = 1
            elif '-clr' in readset:
                metadata['CLR'] = 1
        samples.append(metadata)

# Turn the collected sample records into a tab-separated table with a fixed
# column order, sorted by super population, population and individual.
if not samples:
    # Without this check the column selection below fails with a pandas
    # KeyError that does not reveal that simply no sample was collected.
    raise ValueError('no sample descriptions collected; cannot build sample table')

table_columns = [
    'individual',
    'sex',
    'super_population',
    'population',
    'family',
    'member',
    'HiFi',
    'CLR',
    '2020_SKIP',
]
out_path = '/home/local/work/code/github/project-diploid-assembly/annotation/sample_table.tsv'

# Selecting with a list both restricts and orders the columns.
sample_table = pd.DataFrame(samples)[table_columns]
# Rebind to the sorted frame rather than sorting in place.
sample_table = sample_table.sort_values(['super_population', 'population', 'individual'])
sample_table.to_csv(out_path, sep='\t', header=True, index=False)