# In [1]:

import pathlib as pl
import pandas as pd

# Input folder holding the 1000 Genomes annotation tables, and the folder
# that receives the merged sample table.
path = '/home/local/work/code/github/project-diploid-assembly/annotation/samples/1000g'
out_path = '/home/local/work/code/github/project-diploid-assembly/annotation/samples'

# File names of the two tab-separated input tables (both have a header row).
sample_info_1kgp = '20130606_sample_info.sheet_sample-info.tsv'
pop_info_1kgp = '20210728_igsr_populations.tsv'

df_sample = pd.read_csv(pl.Path(path) / sample_info_1kgp, sep='\t', header=0)
df_pop = pd.read_csv(pl.Path(path) / pop_info_1kgp, sep='\t', header=0)

# population code -> superpopulation code
spop_map = {
    pop_code: superpop_code
    for pop_code, superpop_code in zip(
        df_pop['Population code'], df_pop['Superpopulation code']
    )
}

# superpopulation code -> display colour, upper-cased
color_map = {
    superpop_code: colour.upper()
    for superpop_code, colour in zip(
        df_pop['Superpopulation code'], df_pop['Superpopulation display colour']
    )
}

# Lower-case the free-text relationship labels; norm_relationship matches
# on lower-case strings only.
df_sample['Relationship'] = df_sample['Relationship'].str.lower()

#print(df_sample['Relationship'].value_counts(dropna=False))

def norm_relationship(rel):
    """Map a raw relationship label onto a small controlled vocabulary.

    The lookup keys are all lower-case, so ``rel`` is expected to be
    lower-cased by the caller.

    Args:
        rel: Relationship label, or a null value (``None`` / NaN).

    Returns:
        One of 'unspecified' (null input), 'mother', 'father', 'child',
        'unrelated', 'related', 'mat_grandmother', 'mat_grandfather',
        'pat_grandmother' or 'pat_grandfather'.

    Raises:
        ValueError: If the label is not covered by any rule; the offending
            label is the exception argument.
    """
    if pd.isnull(rel):
        return 'unspecified'

    # normalized label -> all raw spellings that collapse onto it
    spellings = {
        'mother': ('mother', 'mother; child'),
        'father': ('father', 'father; child'),
        'child': ('child', 'child2', 'daughter'),
        'unrelated': ('unrel', 'unrels', 'not father'),
        'mat_grandmother': (
            'mat grandmother',
            'maternal grandmother',
            'mat grandmother; mother',
        ),
        'mat_grandfather': ('mat grandfather', 'mat grandfather; father'),
        'pat_grandmother': (
            'pat grandmother',
            'paternal grandmother',
            'pat grandmother; mother',
        ),
        'pat_grandfather': ('pat grandfather', 'pat grandfather; father'),
    }
    exact = {
        raw: label
        for label, raw_forms in spellings.items()
        for raw in raw_forms
    }
    if rel in exact:
        return exact[rel]

    # Any label mentioning "child of" describes a child.
    if 'child of' in rel:
        return 'child'

    # Remaining exact matches, checked only after the substring rule.
    stragglers = {
        'paternal father': 'pat_grandfather',
        'paternal brother': 'related',
        'husband of child': 'unrelated',
        'wife of child': 'unrelated',
    }
    if rel in stragglers:
        return stragglers[rel]

    # Fail loudly so that new, unhandled labels get noticed.
    raise ValueError(rel)

# Columns of the final sample table, in output order.
keep_columns = [
    'sample',
    'sex',
    'family_id',
    'relationship',
    'population',
    'super_population',
    'color_hex_superpop',
    'population_description',
]

# Switch the sample sheet headers to the names used in keep_columns.
df_sample = df_sample.rename(
    columns={
        'Sample': 'sample',
        'Gender': 'sex',
        'Family ID': 'family_id',
        'Relationship': 'relationship',
        'Population': 'population',
    }
)

# Collapse the relationship labels; unknown labels raise ValueError.
df_sample['relationship'] = df_sample['relationship'].map(norm_relationship)
#print(df_sample['relationship'].value_counts())

# Direct dict indexing is deliberate: a population or superpopulation
# that is missing from the lookup tables raises KeyError.
df_sample['super_population'] = df_sample['population'].map(
    lambda pop_code: spop_map[pop_code]
)
df_sample['color_hex_superpop'] = df_sample['super_population'].map(
    lambda superpop_code: color_map[superpop_code]
)

# Wrap the free-text description in literal double quotes.
df_sample['population_description'] = '"' + df_sample['Population Description'] + '"'

# Drop every column that is not part of the output layout.
df_sample = df_sample[keep_columns].copy()

# manually add GIAB trio
# family ID taken from Coriell
# Each record lists its values in the order of keep_columns.
giab_desc = '"Eastern European Ashkenazi Jewish Ancestry"'
df_giab = pd.DataFrame.from_records(
    [
        ('NA24385', 'male', '3140', 'child', 'ASK', 'EUR', color_map['EUR'], giab_desc),
        ('NA24149', 'male', '3140', 'father', 'ASK', 'EUR', color_map['EUR'], giab_desc),
        # NA24143 is the mother of the trio, so the sex must be 'female'
        ('NA24143', 'female', '3140', 'mother', 'ASK', 'EUR', color_map['EUR'], giab_desc),
    ],
    columns=keep_columns
)

# Append the GIAB records, then order the rows by superpopulation,
# population and sample name.
df_sample = pd.concat([df_sample, df_giab], axis=0).sort_values(
    ['super_population', 'population', 'sample'],
    ascending=True,
)

import csv

# QUOTE_NONE writes every field verbatim, so the double quotes that are
# already part of the description strings are not escaped or doubled.
samples_tsv = pl.Path(out_path, 'samples.tsv')
df_sample.to_csv(
    samples_tsv,
    sep='\t',
    header=True,
    index=False,
    quoting=csv.QUOTE_NONE,
)