In [217]:
import numpy as np
import pandas as pd
import zarr
import allel
import pybedtools

In [218]:
# Per-sample metadata table (tab-separated); the `population` column is used
# further down to pick the individuals of each population.
samples = pd.read_csv(
    "~/ag1000g/data/samples.meta.txt",
    delimiter="\t",
)

Running LDNe in the current Snakemake pipeline requires subsetting the VCF, which is very slow. Instead, this notebook converts the zarr genotype store directly to the .dat format that LDNe reads.

Steps:
- remove pericentromeric regions
- select non-coding regions (using the GFF3 annotation)
- randomly downsample the remaining SNPs
- convert to .dat format

In [236]:
# Chromosome arm processed by the cells below.
chrom = '3L'

# Open the genotype calls and the variant positions of this arm read-only.
Ag_store = zarr.open_array(
    f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/calldata/GT/",
    mode='r',
)
positions = allel.SortedIndex(
    zarr.open_array(
        f"/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass/{chrom}/variants/POS",
        mode='r',
    )
)

# Wrap the on-disk genotype array so it can be filtered with scikit-allel.
ag_geno = allel.GenotypeChunkedArray(Ag_store)

In [237]:
# Load the GFF3 annotation and keep, for the current chromosome arm, every
# feature whose type is NOT in the list below (the listed types are dropped).
# NOTE(review): the surviving features are treated as "coding" downstream;
# confirm that excluding CDS/exon/UTR rows here is intended (presumably the
# remaining gene-level rows already span them).
df = allel.gff3_to_dataframe("/home/sanj/ag1000g/data/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3")
coding_reg_df = (
    df.loc[~df.type.isin(['chromosome', 'three_prime_UTR', 'five_prime_UTR',
                          'mRNA', 'CDS', 'exon'])]
    .drop(columns=['source', 'strand', 'phase', 'score'])
    .loc[lambda features: features.seqid == chrom]
)

# Despite its name, `centromere` is True for the sites that are KEPT
# (positions beyond the cut-off), not for the sites that are removed.
# NOTE(review): the 50 Mb cut-off is applied to every arm other than 3L;
# verify it is appropriate for each of them.
if chrom == '3L':
    centromere = positions > 2000000
else:
    centromere = positions > 50000000
positions = positions[centromere]

# Boolean array over the retained positions: True where a position falls
# inside one of the annotated feature ranges.
coding = positions.locate_ranges(coding_reg_df.start, coding_reg_df.end, strict=False)

# Apply both filters to the genotypes: first the position cut-off, then drop
# the sites that overlap an annotated feature.
# NOTE: this cell overwrites `positions` and `ag_geno`, so it must be run
# exactly once after the cell that loads them.
ag_geno = ag_geno.compress(centromere, axis=0).compress(~coding, axis=0)

Remove centromeric regions of low recombination

Converting to .dat format for LDNe

In [238]:
# Distinct population labels, in order of first appearance in the metadata.
pops = samples["population"].unique()

In [239]:
# Restrict the run to a single population (the third label in the metadata).
# Derived from `samples` rather than from `pops` itself so that re-running
# this cell always gives the same one-element array; slicing `pops` in place
# would return an empty array on the second run.
pops = samples.population.unique()[2:3]

In [255]:
# Upper bound on the number of SNPs written per population.
N_SNPS = 10000
# Seed for the random downsampling so that a rerun writes the same loci.
DOWNSAMPLE_SEED = 42


def genotypes_to_ldne_codes(alleles):
    """Convert diploid allele indices to four-digit genotype strings.

    Parameters
    ----------
    alleles : array-like of int, shape (..., 2)
        Allele indices per call; a negative value marks a missing allele.

    Returns
    -------
    numpy.ndarray of str, shape (...)
        Each allele index ``a`` is written as the two-digit code ``a + 1``
        (0 -> '01', 1 -> '02', ...) and the two codes are concatenated, e.g.
        0/1 -> '0102'. A call with any missing allele becomes '0000'.
    """
    alleles = np.asarray(alleles).astype(int)
    missing = (alleles < 0).any(axis=-1)
    shifted = alleles + 1
    # Pack both two-digit allele codes into one integer, e.g. 0/1 -> 102.
    packed = shifted[..., 0] * 100 + shifted[..., 1]
    packed[missing] = 0
    return np.char.zfill(packed.astype(str), 4)


rng = np.random.default_rng(DOWNSAMPLE_SEED)

for pop in pops:
    # Genotypes of the individuals belonging to this population.
    pop_bool = samples.population == pop
    pop_geno = ag_geno.compress(pop_bool, axis=1)

    # Keep only the sites that are segregating within the population.
    ac = pop_geno.count_alleles()
    seg = ac.is_segregating()
    pop_geno = pop_geno.compress(seg, axis=0)

    # Random downsample, capped at the number of segregating sites so that
    # sampling without replacement cannot fail on small populations.
    n = min(N_SNPS, pop_geno.shape[0])
    vidx = rng.choice(pop_geno.shape[0], n, replace=False)
    vidx.sort()
    gnr = np.array(pop_geno.take(vidx, axis=0)[:])

    # Apply the same three filters to a LOCAL copy of the positions; the
    # shared `positions` array must stay intact for the next population.
    pop_positions = positions[~coding]
    pop_positions = pop_positions[seg]
    pop_positions = pop_positions[vidx]
    prefix = f'{chrom}_'
    pos_string = [prefix + pos for pos in pop_positions.astype(str)]

    # One four-digit genotype string per (SNP, individual).
    dat = genotypes_to_ldne_codes(gnr)

    # NOTE(review): each row written below is one SNP with one column per
    # individual, labelled with the population/arm name; confirm that this
    # orientation matches what LDNe expects for the header that is written.
    popnames = np.repeat(f"{pop}_{chrom}", dat.shape[0])
    dat = np.column_stack((popnames, dat))

    with open(f'{pop}_{chrom}.dat', 'w') as datfile:
        # Header: number of individuals, number of SNPs, then the constants
        # 4 and 2 (alleles are coded 01-04 using two digits each).
        datfile.write(f'{gnr.shape[1]}\t{gnr.shape[0]}\t4\t2\n')
        datfile.write("\n".join(pos_string))
        datfile.write("\n")
        datfile.write("\n".join(" ".join(map(str, x)) for x in dat))