In [43]:
%run imports.ipynb
from os.path import join, isdir, isfile
import sh

In [44]:
# Project root and the working directory under it that receives the PLINK input files.
# NOTE(review): the root is a hard-coded absolute path — confirm it matches the
# environment this notebook is run in.
root_dir = "/home/jovyan/notebooks/admixture"
wd = join(root_dir, "preprocessed_data", "PLINK_v1.07")
# `mkdir -p` creates missing parents and does not fail when `wd` already exists.
sh.mkdir("-p", wd)



In [45]:
def estimate_gdist(pos, table):
    """Estimate the genetic position of ``pos`` from a recombination map.

    The map entry at or immediately before ``pos`` is used as the anchor; the
    physical gap to that anchor is divided by 1e6, multiplied by the anchor's
    ``rrate`` and added to the anchor's ``gposition``.

    Parameters
    ----------
    pos : int or numpy array of int
        Physical position(s) to convert.
    table : pandas.DataFrame
        Map with columns ``pposition``, ``rrate`` and ``gposition``.
        ``pposition`` must be sorted ascending for the search to be meaningful.

    Returns
    -------
    Genetic position rounded to 7 decimals (a scalar for scalar ``pos``, a
    pandas Series for array ``pos``). Positions before the first map entry
    receive the first entry's ``gposition``; positions beyond the last entry
    are extrapolated with the last entry's rate.
    """
    # Searching for `pos + 1` from the left counts the entries located at or
    # before `pos` (for integer coordinates).
    which = np.searchsorted(table.pposition.values, pos + 1)
    # Clamp at the first row: without this, `which == 0` indexes row -1 and
    # silently anchors on the *last* map entry.
    anchor = table.iloc[np.maximum(which - 1, 0)]
    # A negative gap only arises for positions before the first entry; treat
    # those as sitting exactly on it.
    gap = np.maximum(pos - anchor.pposition, 0)
    return np.round(anchor.gposition + ((gap / 1e6) * anchor.rrate), 7)

In [46]:
def loadmap(path):
    """Read a tab-separated map file into a DataFrame.

    The first line of the file supplies the column names and no column is
    promoted to the index, so rows carry the default integer index.

    Parameters
    ----------
    path : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(path, sep="\t", index_col=None)

In [47]:
# Path template for the per-chromosome map file; `{chrom}` is a str.format placeholder.
# NOTE(review): the path is relative and is not anchored at `root_dir` — confirm
# that this is intended.
mapfn = "admixture/map/Ag_{chrom}.map"

In [48]:
from anhima import loc, gt
import gzip

In [49]:
# Per-sample metadata: a tab-separated file whose first column becomes the index.
# The export loop looks rows up by sample id and reads their `sex` column.
# NOTE(review): the file name is relative, so loading depends on the kernel's
# current working directory.
meta_fn = "samples.meta.txt"
dat = pd.read_csv(meta_fn, sep="\t", index_col=0)

In [50]:
# Code written to the sex column of each .ped line, keyed by the metadata `sex` value.
sex_d = {
    "F": "2",
    "M": "1",
}

# File-stem template shared by a region's .ped file and its PLINK script.
fstem = "{chrom}_{desc}_{start}_{stop}"

In [51]:
# Alias for the callset read by the export loop, where it is indexed by chromosome name.
# NOTE(review): `callset_biallel` is not defined in the cells shown here — presumably
# it is created by `%run imports.ipynb`. Despite the `_fn` suffix it is used as a
# mapping-like object, not as a file name. Confirm.
callset_fn = callset_biallel

In [52]:
# Directories under `wd`: `_script` receives the generated PLINK shell scripts;
# `_log` is created here but is not written to in this section of the notebook.
script_d = join(wd, "_script")
log_d = join(wd, "_log")
# A single `mkdir -p` call creates both and leaves existing directories untouched.
sh.mkdir("-p", script_d, log_d)



In [53]:
# Shell-script template that LD-prunes one region with PLINK.
# Placeholders filled in with str.format:
#   {wd}    directory the script changes into
#   {file}  stem of the gzipped .ped/.map pair
#   {nsnps} {slide} {r2}  the three arguments given to --indep-pairwise
# The literal begins directly with the shebang: with a leading newline the
# `#!` line would be the second line of the written script and would be ignored
# by the kernel's interpreter lookup when the script is executed directly.
# NOTE(review): the `shuf -n 200000` cap is hard-coded in the template and is
# independent of `downsample_n`.
plink_cmd = """#! /bin/bash

set -e 
set -o pipefail

cd {wd}
gunzip {file}.ped.gz {file}.map.gz
plink --noweb --file {file} --out {file}.ld --indep-pairwise {nsnps} {slide} {r2}

shuf -n 200000 {file}.ld.prune.in | sort > {file}.ld.prune.in.downsample

plink --noweb --file {file} --extract {file}.ld.prune.in.downsample \\
  --out {file}.ld.pruned --make-bed
plink --noweb --file {file} --extract {file}.ld.prune.in.downsample \\
  --out {file}.ld.pruned --recode

gzip {file}.ped {file}.map
"""

In [54]:
# Parameters consumed by the export loop.
# Lowest allele frequency a variant may have (over its alleles) and still be kept.
maf_min = 0.01
# Value substituted for the `{r2}` argument of `plink --indep-pairwise`.
r2_value = 0.1
# Upper bound on the number of variants exported per region; larger sets are
# randomly subsampled down to this size.
downsample_n = 100000

In [55]:
# Genomic windows to export, one tuple per window:
# (chromosome, first position, last position, label used in the file stem).
# Both positions are treated as inclusive by the export loop.
regions = (
    ("3R", 1000000, 37000000, "free"),
    ("3L", 15000000, 41000000, "free"),
)

In [56]:
# For every region: write the PLINK pruning script, filter the genotypes
# (position window -> at most two alleles -> minimum allele frequency -> optional
# random downsample) and export them as a gzipped .ped file.
for chrom, start, stop, description in regions:
    print(chrom, description)
    fh = callset_fn[chrom]
    fh_samples = [str(s) for s in fh["samples"][:]]

    region_stem = fstem.format(chrom=chrom, start=start, stop=stop, desc=description)

    # plink command: rendered and saved to disk only; nothing is executed here.
    cmd = plink_cmd.format(file=region_stem, wd=wd, nsnps=500, slide=100, r2=r2_value)
    sfn = join(script_d, "PLINK_" + region_stem + ".sh")
    with open(sfn, "w") as sf:
        print(cmd, file=sf)

    # Boolean mask of variants whose position lies in [start, stop], ends included.
    positions = fh['variants']['POS'][:]
    loci = (positions >= start) & (positions <= stop)

    g = allel.GenotypeChunkedArray(fh['calldata/GT']).compress(loci, axis=0)

    pos = np.compress(loci, positions)
    positions = None  # drop the reference to the full position array

    alleles = g.count_alleles()

    # Keep variants at which no allele index above 1 is observed.
    biallelic = alleles.max_allele() <= 1
    g = g.compress(biallelic, axis=0)
    pos = np.compress(np.array(biallelic), pos)

    # Keep variants whose smallest per-allele frequency reaches `maf_min`.
    freqs = g.count_alleles().to_frequencies()
    maf_ok = np.min(np.array(freqs), axis=1) >= maf_min
    g = g.compress(maf_ok, axis=0)
    pos = np.compress(maf_ok, pos, axis=0)

    if pos.size > downsample_n:
        print("Downsampling... to {0} from {1}".format(downsample_n, pos.size))
        # Draw without replacement, then sort so the variants keep their original order.
        # NOTE(review): no random seed is set, so the selected subset differs between runs.
        idx = np.random.choice(np.arange(0, pos.size), downsample_n, False)
        idx.sort()
        pos = np.take(pos, idx)
        g = g.take(idx, axis=0)

    # create the ped file; an adjacent ".ok" marker records a completed export so
    # that re-running the cell does not rewrite it.
    fn = join(wd, region_stem + ".ped.gz")
    # `fn` already contains `wd`. The marker used to be created at
    # join(wd, fn + ".ok"), which equals the path checked below only while `wd`
    # is absolute; the same path is now used for the check and the touch.
    ok_fn = fn + ".ok"
    if not isfile(ok_fn):
        with gzip.open(fn, "wb") as gz:
            for i, sid in enumerate(fh_samples):
                sex = sex_d[dat.loc[sid].sex]
                # Samples coded "1" (metadata sex "M") are left out of chromosome X exports.
                if (chrom == "X") and (sex == "1"):
                    continue
                # Shift the allele indices up by one, so 0/1 are written as 1/2.
                # Presumably a missing call is stored as -1 and thus becomes 0 — confirm.
                geno = np.array(g[:, i] + 1)
                geno_str = " ".join(np.apply_along_axis(" ".join, 1, geno.astype("str")))
                # Six leading columns (sample id twice, "0", "0", sex code, "0")
                # followed by the space-separated allele pairs.
                line = " ".join([sid, sid, "0", "0", sex, "0", geno_str]) + "\n"
                gz.write(line.encode())

        sh.touch(ok_fn)

3R free
Downsampling... to 100000 from 1349635
3L free
Downsampling... to 100000 from 938716


------------------------------