In [5]:
import pathlib as pl
import operator as op
import collections as col

import yaml
import pandas as pd

# Root of the local project checkout; all other locations are derived from it.
project_folder = pl.Path('/home/local/work/code/github/project-diploid-assembly')

# YAML file holding the data source tags and the excluded samples.
sample_source_tag_yaml = project_folder / 'annotation/samples/sample_tags.yml'

# Fields copied verbatim from each sample description into the output table.
record_sample_info = ['individual', 'sex', 'super_population', 'population']

# Constant (column, value) pairs attached to every sample of a config folder.
folder_tags = {
    folder_id: [('PGAS', pgas_version), ('project_context', context)]
    for folder_id, pgas_version, context in (
        ('v14_folder', 'v14-dev', '2022-ongoing'),
        ('v13_folder', 'v13-dev', 'pangenie.inversions'),
        ('v12_folder', 'v12', 'Science2021'),
    )
}

# Sample config folders to scan; the keys match those of ``folder_tags``.
folders = {
    'v14_folder': project_folder / 'smk_config/samples/v14',
    'v13_folder': project_folder / 'smk_config/samples/hifi_v13',
    'v12_folder': project_folder / 'smk_config/samples/hgsvc',
}

# Manually copied list of the "Yr2" HGSVC samples. It is used to cross-check
# the active sample table created below.
_hgsvc_yr2_listing = """
HG01352
HG02059
NA19434
HG04217
HG03807
NA19836
HG02106
HG00268
GM20355
GM19320
GM19129
HG02769
HG03452
HG03520
HG02282
HG02554
HG02953
GM21487
NA18989
NA19331
HG02666
NA19317
NA19347
HG03248
HG04036
HG01457
NA19384
"""

# One sample per whitespace-separated token; "GM" is rewritten to "NA" so
# that the identifiers can be compared with the 'individual' column.
hgsvc_yr2 = {
    sample_id.replace('GM', 'NA')
    for sample_id in _hgsvc_yr2_listing.split()
}


# Storage prefix removed from every data source folder before it is recorded.
_STORAGE_PREFIX = '/gpfs/project/projects/medbioinf'


def _resolve_source_folder(data_source, no_source_folder):
    """Return the folder to record for one data source entry, or 'N/A'."""
    if no_source_folder:
        return 'N/A'
    folder = data_source['data_source_folder'].replace(_STORAGE_PREFIX, '')
    # the literal placeholder "empty_path" means that no folder is configured
    return 'N/A' if folder.lower() == 'empty_path' else folder


def extract_sample_info(yaml_file, sample_tags, folder_tags, no_source_folder):
    """Collect one row of the active sample table from a sample config file.

    The sample name is derived from the file name: the part before the first
    dot is split at its last underscore and the trailing piece is upper-cased.
    The YAML document must contain the keys ``sample_targets_<NAME>`` and
    ``sample_description_<NAME>``.

    Args:
        yaml_file: ``pathlib.Path`` of the sample config (YAML).
        sample_tags: mapping of sample name to a list of (column, value)
            pairs; samples without an entry get a default ``data_source``.
        folder_tags: iterable of (column, value) pairs that are added to the
            row unchanged.
        no_source_folder: if true, the ``data_source_folder`` entries are not
            read and all ``*_path`` columns are set to 'N/A'.

    Returns:
        dict with the fields listed in ``record_sample_info``, ``is_skipped``,
        the HiFi / CLR / Strand-seq columns and all tag columns.

    Raises:
        KeyError: if an expected key is missing in the YAML document.
        ValueError: if a PacBio readset is neither "pbsq1", "pbsq2-ccs" nor
            "pbsq2-clr".
    """
    sample_name = yaml_file.name.split('.')[0].rsplit('_', 1)[-1].upper()

    with open(yaml_file, 'rb') as config:
        sample_metadata = yaml.load(config, Loader=yaml.SafeLoader)
    sample_targets = sample_metadata[f'sample_targets_{sample_name}']
    sample_desc = sample_metadata[f'sample_description_{sample_name}']

    sample_info = {key: sample_desc[key] for key in record_sample_info}
    sample_info['is_skipped'] = sample_targets[0].get('ignore', False)

    hifi_source = 'N/A'
    has_hifi = 'no-hifi'
    clr_source = 'N/A'
    has_clr = 'no-clr'
    sseq_source = 'N/A'

    for data_source in sample_desc['data_sources']:
        for source_type, source_spec in data_source.items():
            if source_type == 'strandseq':
                sseq_source = _resolve_source_folder(source_spec, no_source_folder)
                continue
            if source_type != 'long_reads':
                # all other kinds of input data are not part of the table
                continue
            if source_spec['technology'].lower() != 'pacbio':
                continue
            lr_source = _resolve_source_folder(source_spec, no_source_folder)
            readset_id = source_spec['readset']
            if 'pbsq1' in readset_id:
                # "pbsq1" readsets are neither counted as HiFi nor as CLR
                continue
            if 'pbsq2-ccs' in readset_id:
                has_hifi = 'yes-HIFI'
                hifi_source = lr_source
            elif 'pbsq2-clr' in readset_id:
                has_clr = 'yes-CLR'
                clr_source = lr_source
            else:
                raise ValueError(f'read type unknown: {readset_id}')

    sample_info['HIFI_data'] = has_hifi
    sample_info['HIFI_path'] = hifi_source
    sample_info['CLR_data'] = has_clr
    sample_info['CLR_path'] = clr_source
    sample_info['SSEQ_path'] = sseq_source

    # .get() avoids inserting empty entries into a defaultdict passed by the
    # caller and also makes plain dicts usable as tag lookup
    source_tags = sample_tags.get(sample_name, [])

    for tag_name, tag_value in folder_tags:
        sample_info[tag_name] = tag_value
    for tag_name, tag_value in source_tags:
        sample_info[tag_name] = tag_value

    if not source_tags:
        # untagged samples: the data source is chosen by the name prefix only
        sample_info['data_source'] = 'hcHGSVC' if sample_name.startswith('HC') else 'HGSVC'

    return sample_info


def load_sample_source_tags(yaml_config):
    """Read the excluded samples and the data source tags from a YAML file.

    The document must provide ``sample_excludes`` (a mapping; only its keys
    are used) and ``data_source_tags`` (entries with ``tag`` and ``samples``).

    Returns:
        A 2-tuple: the sorted list of excluded sample keys, and a
        ``defaultdict(list)`` mapping each sample to a list of
        ``('data_source', tag)`` pairs.
    """
    with open(yaml_config, 'rb') as handle:
        content = yaml.load(handle, Loader=yaml.SafeLoader)

    forbidden_samples = sorted(content['sample_excludes'].keys())

    sample_to_tags = col.defaultdict(list)
    for tag_entry in content['data_source_tags']:
        tag_pair = ('data_source', tag_entry['tag'])
        for sample in tag_entry['samples']:
            sample_to_tags[sample].append(tag_pair)

    return forbidden_samples, sample_to_tags

# Load the excluded samples and the per-sample data source tags once;
# both are shared by all config folders processed below.
forbidden_samples, sample_source_tags = load_sample_source_tags(
    sample_source_tag_yaml
)

# Column layout of the written tables, left to right.
column_sort_order = [
    # sample description
    'individual', 'sex', 'super_population', 'population',
    # tags
    'project_context', 'data_source',
    # data availability and status
    'HIFI_data', 'CLR_data', 'is_skipped', 'PGAS',
    # input data locations
    'HIFI_path', 'CLR_path', 'SSEQ_path',
]

# Build and write one "active_samples.tsv" table per sample config folder.
for folder_id, folder_path in folders.items():
    context_tags = folder_tags[folder_id]
    # the v12 configs are processed without reading data source folders
    nsf = folder_id == 'v12_folder'

    sample_records = []
    for s in folder_path.glob('**/*.yml'):
        config_name = s.name.upper()
        # skip every config whose file name contains an excluded sample
        if any(x in config_name for x in forbidden_samples):
            continue
        sample_records.append(
            extract_sample_info(s, sample_source_tags, context_tags, nsf)
        )

    df = pd.DataFrame.from_records(sample_records)
    df = df.sort_values(['super_population', 'population', 'individual'])
    df = df[column_sort_order]

    if folder_id == 'v14_folder':
        # cross-check: all manually listed Yr2 samples must be in the table
        collected_samples = set(df['individual'])
        missing = hgsvc_yr2 - collected_samples
        if missing:
            raise ValueError(f'Yr2 missing: {sorted(missing)}')

    out_table = folder_path / 'active_samples.tsv'
    df.to_csv(out_table, header=True, index=False, sep='\t')
    print(out_table)
    print(df.head())
    print('====================')


/home/local/work/code/github/project-diploid-assembly/smk_config/samples/v14/active_samples.tsv
   individual     sex super_population population project_context data_source  \
18    HG01891  female              AFR        ACB    2022-ongoing        HPRC   
22    HG02257  female              AFR        ACB    2022-ongoing        HPRC   
21    HG02282  female              AFR        ACB    2022-ongoing       HGSVC   
20    HG02486    male              AFR        ACB    2022-ongoing        HPRC   
17    HG02554    male              AFR        ACB    2022-ongoing       HGSVC   

   HIFI_data CLR_data  is_skipped     PGAS                 HIFI_path CLR_path  \
18  yes-HIFI   no-clr       False  v14-dev   /data/hprc_hifi/HG01891      N/A   
22  yes-HIFI   no-clr       False  v14-dev   /data/hprc_hifi/HG02257      N/A   
21  yes-HIFI   no-clr       False  v14-dev  /data/hgsvc_hifi/HG02282      N/A   
20  yes-HIFI   no-clr       False  v14-dev   /data/hprc_hifi/HG02486      N/A   
17  yes-HIFI