In [1]:
!ls -lh scratch/

total 466M
lrwxrwxrwx 1 jk21 team118   54 Jan 11 14:45 raw -> /lustre/scratch115/projects/ancientgen/dg11/data_final
-rw-r--r-- 1 jk21 team118 465M Jan 11 14:46 subsample.frq.strat
-rw-r--r-- 1 jk21 team118  32K Jan 11 14:41 subsample-individuals.txt
-rw-r--r-- 1 jk21 team118 1.2K Jan 11 14:46 subsample.log
-rw-r--r-- 1 jk21 team118 2.6K Jan 11 14:46 subsample.nosex


In [2]:
%%bash
# Restrict the dataset to the individuals listed in subsample-individuals.txt
# (--keep) and compute allele frequencies stratified by the clusters that the
# same file defines (--within). Output files are written with the prefix
# scratch/subsample (e.g. scratch/subsample.frq.strat, scratch/subsample.log).
plink --bfile scratch/raw/newhumori_ancSA_lazar_norel_crgr10_Tvonly \
    --keep scratch/subsample-individuals.txt \
    --within scratch/subsample-individuals.txt \
    --freq --out scratch/subsample --allow-no-sex

PLINK v1.90b3.44 64-bit (17 Nov 2016)      https://www.cog-genomics.org/plink2
(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to scratch/subsample.log.
Options in effect:
  --allow-no-sex
  --bfile scratch/raw/newhumori_ancSA_lazar_norel_crgr10_Tvonly
  --freq
  --keep scratch/subsample-individuals.txt
  --out scratch/subsample
  --within scratch/subsample-individuals.txt

257853 MB RAM detected; reserving 128926 MB for main workspace.
Allocated 4083 MB successfully, after larger attempt(s) failed.
96623 variants loaded from .bim file.
2363 people (1489 males, 718 females, 156 ambiguous) loaded from .fam.
Ambiguous sex IDs written to scratch/subsample.nosex .
42 phenotype values loaded from .fam.
--keep: 1530 people remaining.
--within: 78 clusters loaded, covering a total of 1530 people.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 1530 founders and 0 nonfounders present.
Calculating allele frequencies... 

In [3]:
!wc -l scratch/subsample.frq.strat
!head scratch/subsample.frq.strat

7536595 scratch/subsample.frq.strat
 CHR           SNP     CLST   A1   A2      MAF    MAC  NCHROBS
   1      1.842013    !Xuun    G    T   0.4231     11       26 
   1      1.842013       AA    G    T     0.15      3       20 
   1      1.842013   Adygei    G    T   0.3125     10       32 
   1      1.842013 Anatolia_N    G    T   0.2143      6       28 
   1      1.842013 Armenian    G    T     0.35      7       20 
   1      1.842013 Assyrian    G    T   0.1364      3       22 
   1      1.842013   Balkar    G    T      0.2      4       20 
   1      1.842013  Balochi    G    T   0.2273     10       44 
   1      1.842013 BantuKenya    G    T      0.1      2       20 


In [4]:
import itertools as it
import pandas as pd
import numpy as np
import momi
import logging, sys
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [5]:
def read_plink_frq_strat(fname, ancestral_pop, chunk_size = 10000):
    """Read a PLINK ``.frq.strat`` file into polarized allele counts for momi.

    The file is streamed row by row. Rows are grouped by their first column
    (``CHR``) into loci, and each locus is processed ``chunk_size`` SNPs at a
    time. For every SNP the per-population counts ``(MAC, NCHROBS - MAC)`` are
    polarized with the help of ``ancestral_pop``: the allele that is the only
    one observed in ``ancestral_pop`` is moved to index 0. SNPs for which
    ``ancestral_pop`` shows both alleles or no observation, and SNPs that are
    monomorphic among the remaining populations, are dropped.

    Parameters
    ----------
    fname : str
        Path to a whitespace-delimited file whose header starts with
        ``CHR SNP`` and also contains ``CLST``, ``A1``, ``A2``, ``MAC`` and
        ``NCHROBS`` columns.
    ancestral_pop : str
        ``CLST`` label of the population used to polarize alleles. It is
        removed from the returned data.
    chunk_size : int, optional
        Number of SNPs converted to a DataFrame at once.

    Returns
    -------
    The result of ``momi.seg_site_configs(sampled_pops, loci)``, where
    ``sampled_pops`` is the list of remaining population labels and ``loci``
    is an iterable with one iterator of ``(n_pops, 2)`` site arrays per locus.

    Raises
    ------
    ValueError
        If the file has no header or no data rows, or if ``ancestral_pop`` is
        not among the populations of a chunk.
    AssertionError
        If the header does not start with ``CHR SNP``, if A1/A2 labels differ
        between rows of one SNP, or if the population list changes between
        chunks.
    """
    def get_chunks(locus_id, locus_rows):
        """Yield one ``(n_snps, n_pops, 2)`` count array per chunk of SNPs."""
        snps_grouped = it.groupby(locus_rows, lambda r: r[1])
        snps_enum = enumerate(list(snp_rows) for snp_id, snp_rows in snps_grouped)

        for chunk_num, chunk in it.groupby(snps_enum, lambda idx_snp_pair: idx_snp_pair[0] // chunk_size):
            chunk = pd.DataFrame(list(it.chain.from_iterable(snp for snp_num, snp in chunk)),
                                 columns = header)
            # every field was read as text; the two count columns must be integers
            for col_name in ("MAC", "NCHROBS"):
                chunk[col_name] = [int(x) for x in chunk[col_name]]

            # check A1, A2 agrees for every row of every SNP
            for a in ("A1","A2"):
                assert all(len(set(snp[a])) == 1 for _,snp in chunk.groupby(["CHR","SNP"]))

            # replace allele name with counts
            chunk["A1"] = chunk["MAC"]
            chunk["A2"] = chunk["NCHROBS"] - chunk["A1"]

            ## convert to 3d array with axes (SNP, population, allele)
            # pivot() orders both SNP labels and population labels by sorted
            # value, so the two tables line up cell for cell.
            a1_counts = chunk.pivot(index="SNP", columns="CLST", values="A1")
            a2_counts = chunk.pivot(index="SNP", columns="CLST", values="A2")
            populations = list(a1_counts.columns)
            chunk = np.stack([a1_counts.values, a2_counts.values], axis=2)
            assert chunk.shape[2] == 2

            ## polarize
            # remove ancestral population
            anc_pop_idx = populations.index(ancestral_pop)
            anc_counts = chunk[:,anc_pop_idx,:]
            chunk = np.delete(chunk, anc_pop_idx, axis=1)
            populations.pop(anc_pop_idx)
            # check populations are same as sampled_pops
            if not sampled_pops:
                sampled_pops.extend(populations)
            assert sampled_pops == populations

            # is_ancestral[k][s] is True when allele k is the only allele
            # observed in the ancestral population at SNP s
            is_ancestral = [(anc_counts[:,allele] > 0) & (anc_counts[:,other_allele] == 0)
                           for allele, other_allele in ((0,1),(1,0))]

            assert np.all(~(is_ancestral[0] & is_ancestral[1]))
            # swap the allele axis where allele 1 is ancestral, so that
            # index 0 always holds the ancestral count
            chunk[is_ancestral[1],:,:] = chunk[is_ancestral[1],:,::-1]
            # drop SNPs whose ancestral state could not be determined
            chunk = chunk[is_ancestral[0] | is_ancestral[1],:,:]

            # remove monomorphic sites
            polymorphic = (chunk.sum(axis=1) > 0).sum(axis=1) == 2
            chunk = chunk[polymorphic,:,:]

            yield chunk
        logging.info("Finished reading CHR {}".format(locus_id))

    with open(fname) as f:
        # skip blank lines so that a stray empty line cannot break the grouping keys
        rows = (fields for fields in (l.split() for l in f) if fields)
        header = next(rows, None)
        if header is None:
            raise ValueError("{} is empty".format(fname))
        assert header[:2] == ["CHR", "SNP"]

        # Each locus chains its chunks together, so iterating a locus yields
        # individual (n_pops, 2) site arrays rather than whole chunks.
        loci = (it.chain.from_iterable(get_chunks(locus_id, locus_rows))
                   for locus_id, locus_rows in it.groupby(rows, lambda r: r[0]))

        # sampled_pops is not filled in until the first chunk is processed
        sampled_pops = []
        first_loc = next(loci, None)
        if first_loc is None:
            raise ValueError("{} contains no data rows".format(fname))
        # Peek at most one site to force processing of the first chunk.
        # islice (rather than next) tolerates a first locus with no usable site.
        first_sites = list(it.islice(first_loc, 1))

        # add the peeked site/locus back onto the iterators
        first_loc = it.chain(first_sites, first_loc)
        loci = it.chain([first_loc], loci)

        # NOTE(review): loci is lazy and reads from f, so seg_site_configs has
        # to consume it before this with-block closes the file — confirm.
        return momi.seg_site_configs(sampled_pops, loci)

In [6]:
seg_sites = read_plink_frq_strat("scratch/subsample.frq.strat", "Chimp")

INFO:root:Finished reading CHR 1
INFO:root:Finished reading CHR 2
INFO:root:Finished reading CHR 3
INFO:root:Finished reading CHR 4
INFO:root:Finished reading CHR 5
INFO:root:Finished reading CHR 6
INFO:root:Finished reading CHR 7
INFO:root:Finished reading CHR 8
INFO:root:Finished reading CHR 9
INFO:root:Finished reading CHR 10
INFO:root:Finished reading CHR 11
INFO:root:Finished reading CHR 12
INFO:root:Finished reading CHR 13
INFO:root:Finished reading CHR 14
INFO:root:Finished reading CHR 15
INFO:root:Finished reading CHR 16
INFO:root:Finished reading CHR 17
INFO:root:Finished reading CHR 18
INFO:root:Finished reading CHR 19
INFO:root:Finished reading CHR 20
INFO:root:Finished reading CHR 21
INFO:root:Finished reading CHR 22


In [8]:
# Dump column 0 of every site in seg_sites to a text file: a header row
# ("CHR" followed by the sampled population names), then one row per site.
with open("scratch/ancestral_counts.txt","w") as f:
    print("CHR", *seg_sites.sampled_pops, file=f)
    # i is the 0-based position of the locus in seg_sites; i+1 is written in
    # the CHR column (the loci were read in the order CHR 1..22).
    for i,loc in enumerate(seg_sites):
        for site in loc:
            # site[:,0] is one value per population; read_plink_frq_strat puts
            # the ancestral allele at index 0 — presumably momi keeps that
            # layout, TODO confirm.
            print(i+1, *site[:,0], file=f)

In [9]:
# Dump column 1 of every site in seg_sites to a text file: a header row
# ("CHR" followed by the sampled population names), then one row per site.
with open("scratch/derived_counts.txt","w") as f:
    print("CHR", *seg_sites.sampled_pops, file=f)
    # i is the 0-based position of the locus in seg_sites; i+1 is written in
    # the CHR column (the loci were read in the order CHR 1..22).
    for i,loc in enumerate(seg_sites):
        for site in loc:
            # site[:,1] is one value per population; read_plink_frq_strat puts
            # the non-ancestral (derived) allele at index 1 — presumably momi
            # keeps that layout, TODO confirm.
            print(i+1, *site[:,1], file=f)

In [10]:
!diff scratch/ancestral_counts.txt scratch/ancestral_counts.txt.bak

In [11]:
!diff scratch/derived_counts.txt scratch/derived_counts.txt.bak