In [1]:
import malariagen_data
import numpy as np
import pandas as pd
import allel
from datetime import date

import sys
# make the local workflow scripts directory importable (it provides probetools)
sys.path.insert(0, '/home/sanj/projects/gaardian/workflow/scripts/')
import probetools as probe

#### MVNcall - preparing data

In [2]:
# Label used in the output file names.
dataset = "coeae1f"

# Genomic window to extract.
contig, start, end = "2L", 28_520_000, 28_580_000

# Sample sets to include in the analysis.
cohorts = [
    # Ag1000G phase 3 sample sets in Ag3.0
    "AG1000G-GH",
    "AG1000G-ML-A",
    "AG1000G-BF-A",
    "AG1000G-BF-B",
    "AG1000G-GN-A",
    "AG1000G-GN-B",
    "AG1000G-TZ",
    # Amenta-Etego sample sets in Ag3.3 (none listed)
    # GAARDIAN sample set in Ag3.4
    "1244-VO-GH-YAWSON-VMF00149",
    # GAARD Ghana sample set in Ag3.2
    "1244-VO-GH-YAWSON-VMF00051",
    "1245-VO-CI-CONSTANT-VMF00054",
    "1253-VO-TG-DJOGBENOU-VMF00052",
    "1237-VO-BJ-DJOGBENOU-VMF00050",
]

# Region string in "contig:start-end" form, displayed as the cell output.
region = "{}:{}-{}".format(contig, start, end)
region

'2L:28520000-28580000'

In [3]:
# Data-access client used by every loading cell below.
# NOTE(review): pre=True presumably enables pre-release sample sets — confirm
# against the malariagen_data documentation.
ag3 = malariagen_data.Ag3(pre=True)

Load SNP calls. 

In [4]:
# SNP calls for the region across all selected sample sets.
calls = ag3.snp_calls(region=region, sample_sets=cohorts)

# Materialise the genotypes once and reuse the in-memory array for both the
# allele counts and the segregating-site subset, instead of loading the
# call data a second time after filtering.
geno_all = allel.GenotypeArray(calls['call_genotype'])
ac = geno_all.count_alleles()
seg = ac.is_segregating()

# Restrict every per-variant array to segregating sites so they stay aligned.
calls = calls.sel(variants=seg)
alleles = calls['variant_allele'].compute()
geno = geno_all.compress(seg, axis=0)
pos = allel.SortedIndex(calls['variant_position'])

# The unfiltered genotype array is no longer needed; release the memory.
del geno_all

In [5]:
# Collect the positions of non-synonymous SNPs whose maximum cohort allele
# frequency exceeds 5%, for each transcript of interest.
# Positions are gathered in a plain list and converted once at the end:
# repeatedly np.append-ing onto an empty float array re-copies the data on
# every iteration and silently turns integer positions into floats.
nonsyn_positions = []
for i in range(27,28):
    if i == 30: continue  # transcript 30 is deliberately skipped if the range is widened
    transcript_id = f"AGAP0062{i}-RA"

    snp_allele_freqs_df = ag3.snp_allele_frequencies(
        transcript=transcript_id,
        cohorts="admin1_year",
        sample_sets=cohorts,
        drop_invariant=False,
    )
    df = snp_allele_freqs_df.query("max_af > 0.05 & effect == 'NON_SYNONYMOUS_CODING'")
    nonsyn_positions.extend(df.reset_index().loc[:, 'position'].to_list())

# Sorted, de-duplicated integer positions (a position can occur once per
# alternate allele in the frequency table).
nonsynons = np.unique(np.asarray(nonsyn_positions, dtype=int))

Load sample metadata:   0%|          | 0/12 [00:00<?, ?it/s]

Load SNP genotypes:   0%|          | 0/106 [00:00<?, ?it/s]

Compute allele frequencies:   0%|          | 0/35 [00:00<?, ?it/s]

Compute SNP effects:   0%|          | 0/7629 [00:00<?, ?it/s]

In [11]:
# Per-sample metadata for the selected sample sets; its 'sample_id' column is
# what GenoToPandasToVCF uses to name the VCF sample columns.
metadata = ag3.sample_metadata(sample_sets=cohorts)

# Index of each unordered diploid genotype "j/k" (j <= k, alleles 0..3) within
# a per-genotype likelihood vector, using the VCF ordering
# F(j/k) = k*(k+1)/2 + j, i.e. 0/0, 0/1, 1/1, 0/2, 1/2, 2/2, 0/3, ...
gl_mapping = {
    f"{first}/{second}": second * (second + 1) // 2 + first
    for second in range(4)
    for first in range(second + 1)
}

def code_genotype_likelihoods(genotype, gl_mapping):
    """
    Encode a hard genotype call as a 10-value, comma-separated likelihood string.

    Every genotype gets the value 25 except the called one, which gets 0.
    A fully missing call ('./.') yields 25 for all ten genotypes.

    Parameters
    ----------
    genotype : str
        Genotype string such as '0/1' or './.'. Equivalent spellings of the
        same unordered genotype ('1/0', '0|1') are normalised to the sorted,
        '/'-separated form before lookup.
    gl_mapping : dict
        Maps a genotype string to its index in the likelihood vector.

    Returns
    -------
    str
        Ten comma-separated integers.

    Raises
    ------
    KeyError
        If the (normalised) genotype is not a key of `gl_mapping`.
    """
    # Ten slots: one per unordered diploid genotype over four alleles.
    values = ['25'] * 10
    if genotype == './.':
        return ','.join(values)

    key = genotype
    if key not in gl_mapping:
        # The likelihoods refer to unordered genotypes, so allele order and
        # the phasing separator carry no information here.
        called = key.replace('|', '/').split('/')
        if all(allele.isdigit() for allele in called):
            key = '/'.join(sorted(called, key=int))

    values[gl_mapping[key]] = '0'
    return ','.join(values)


def write_vcf_header(vcf_file, contig):
    """
    Write the VCF meta-information lines (the '##' lines) to an open file.

    The '#CHROM ...' column header row is not written here.

    Parameters
    ----------
    vcf_file : writable text file object
        Destination for the header lines.
    contig : str
        Contig being written. A '##contig' line is emitted only when it is
        one of 2R, 3R, 2L, 3L or X; any other value produces no contig line.
    """
    contig_lengths = {
        '2R': 61545105,
        '3R': 53200684,
        '2L': 49364325,
        '3L': 41963435,
        'X': 24393108,
    }

    header = [
        '##fileformat=VCFv4.1',
        # today's date
        '##fileDate=%s' % date.today().strftime('%Y%m%d'),
        # source
        '##source=scikit-allel-%s + ZarrToVCF.py' % allel.__version__,
        # reference and contig
        '##reference=resources/reference/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa',
    ]
    if contig in contig_lengths:
        header.append('##contig=<ID=%s,length=%d>' % (contig, contig_lengths[contig]))
    header.append('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
    # The VCF specification requires Description values to be double-quoted;
    # the previous single-quoted form is rejected by strict parsers.
    header.append('##FORMAT=<ID=PL,Number=.,Type=Integer,Description="Phred-scaled Genotype Likelihoods">')

    for line in header:
        print(line, file=vcf_file)


def GenoToPandasToVCF(vcf_file, geno, positions, alleles, contig, nchunks=4, sample_ids=None):
    """
    Write genotypes to a VCF file with GT:PL fields, building the table in chunks.

    `geno`, `positions` and `alleles` must describe the same variants in the
    same order: row i of each is written as one VCF record. The function
    writes the file as a side effect and returns None.

    Parameters
    ----------
    vcf_file : str or path-like
        Output path. Any existing file is overwritten.
    geno : genotype array
        Object with `.shape`, row slicing and `.to_gt()` (e.g. a scikit-allel
        GenotypeArray), variants on the first axis.
    positions : array-like
        Variant positions.
    alleles : array-like, shape (n_variants, n_alleles)
        Column 0 is REF, the remaining columns are joined with ',' as ALT.
        Anything accepted by np.asarray works (xarray or numpy).
    contig : str
        Value of the #CHROM column, also passed to write_vcf_header.
    nchunks : int, default 4
        Number of roughly equal chunks used to build and append the table.
    sample_ids : sequence of str, optional
        Sample column names. Defaults to the notebook-level
        metadata['sample_id'], as before.

    Raises
    ------
    ValueError
        If `geno` contains no variants.
    """
    import inspect  # function-scope import: only needed for the pandas keyword check below

    if sample_ids is None:
        # Backward-compatible default: rely on the notebook-level metadata frame.
        sample_ids = metadata['sample_id']

    n_variants = geno.shape[0]
    if n_variants == 0:
        raise ValueError("no variants to write to VCF")

    allele_arr = np.asarray(alleles).astype(str)
    refs = allele_arr[:, 0]
    alts = [",".join(row) for row in allele_arr[:, 1:]]

    probe.log("calculating chunks sizes...")
    # Integer chunk boundaries from 0 to n_variants. np.unique drops the
    # duplicate boundaries that appear when there are fewer variants than
    # chunks, so no empty chunk is ever processed.
    nchunks = max(1, int(nchunks))
    bounds = np.unique(np.linspace(0, n_variants, nchunks + 1).round().astype(int))

    # pandas renamed `line_terminator` to `lineterminator` in 1.5 and removed
    # the old spelling in 2.0; use whichever the installed version accepts.
    if "lineterminator" in inspect.signature(pd.DataFrame.to_csv).parameters:
        eol_kwarg = "lineterminator"
    else:
        eol_kwarg = "line_terminator"

    # Start the file with the meta-information lines; chunks are appended below.
    with open(vcf_file, 'w') as vcfheader:
        write_vcf_header(vcfheader, contig)

    for idx, (lo, hi) in enumerate(zip(bounds[:-1], bounds[1:])):

        gn = geno[lo:hi]

        # Construct SNP info DF
        vcf_df = pd.DataFrame({'#CHROM': contig,
                               'POS': np.asarray(positions[lo:hi]),
                               'ID': '.',
                               'REF': refs[lo:hi],
                               'ALT': alts[lo:hi],
                               'QUAL': '.',
                               'FILTER': '.',
                               'INFO': '.',
                               'FORMAT': 'GT:PL'})

        probe.log(f"Pandas SNP info DataFrame constructed...{idx}")

        # Geno to VCF: encode each distinct genotype string once and broadcast
        # the result back, rather than calling the encoder for every cell.
        gt = gn.to_gt().astype(str)
        distinct, inverse = np.unique(gt, return_inverse=True)
        encoded = np.array([g + ":" + code_genotype_likelihoods(g, gl_mapping)
                            for g in distinct])
        vcf = pd.DataFrame(encoded[inverse].reshape(gt.shape), columns=sample_ids)

        probe.log("Concatenating info and genotype dataframes...")
        vcf = pd.concat([vcf_df, vcf], axis=1)

        probe.log(f"Pandas Genotype data constructed...{idx}")

        probe.log("Writing to .vcf")

        # Only the first chunk carries the '#CHROM ...' column header row.
        vcf.to_csv(vcf_file,
                   sep="\t",
                   index=False,
                   mode='a',
                   header=(idx == 0),
                   **{eol_kwarg: "\n"})

In [7]:
# Keep only the variants whose position is one of the non-synonymous SNPs.
# The first element of the locate_intersection result is used as the selector
# over `pos`, and therefore over the other per-variant arrays too.
pos_bool = pos.locate_intersection(nonsynons)
nonsyn_selector = pos_bool[0]

geno = geno.compress(nonsyn_selector, axis=0)
alleles = alleles[nonsyn_selector]
pos = pos[nonsyn_selector]

Subset the non-synonymous SNPs to those that are not already present in the haplotype data.

In [8]:
# Phased haplotype calls for the same region and sample sets.
haps = ag3.haplotypes(region=region, sample_sets=cohorts, analysis='gamb_colu_arab')
haps_pos = haps['variant_position'].compute().values

# True where a retained SNP position is already present in the haplotype data.
pos_inhaps_bool = np.isin(pos, haps_pos)

# Drop those sites from every per-variant array. `alleles` must be filtered
# together with `geno` and `pos`: GenoToPandasToVCF pairs them row by row, so
# leaving it unfiltered writes the wrong REF/ALT next to each position.
geno = geno.compress(~pos_inhaps_bool, axis=0)
alleles = alleles[~pos_inhaps_bool]
pos = pos[~pos_inhaps_bool]

In [13]:
# Write the unphased genotypes to VCF. The contig comes from the configuration
# cell rather than a second hard-coded '2L', so #CHROM always matches `region`.
# GenoToPandasToVCF writes the file as a side effect and has no return
# statement, so `vcf_df` is None.
vcf_df = GenoToPandasToVCF(f"../../results/phasing/{dataset}.genotypes.vcf", geno, pos, alleles, contig=contig)

calculating chunks sizes...
Pandas SNP info DataFrame constructed...0
Concatenating info and genotype dataframes...
Pandas Genotype data constructed...0
Writing to .vcf
Pandas SNP info DataFrame constructed...1
Concatenating info and genotype dataframes...
Pandas Genotype data constructed...1
Writing to .vcf
Pandas SNP info DataFrame constructed...2
Concatenating info and genotype dataframes...
Pandas Genotype data constructed...2
Writing to .vcf
Pandas SNP info DataFrame constructed...3
Concatenating info and genotype dataframes...
Pandas Genotype data constructed...3
Writing to .vcf


GL: genotype likelihoods comprised of comma-separated floating-point log10-scaled likelihoods for all possible
genotypes given the set of alleles defined in the REF and ALT fields. In the presence of the GT field the same
ploidy is expected and the canonical order is used; without a GT field, diploidy is assumed. If A is the allele in
REF and B, C, ... are the alleles as ordered in ALT, the ordering of genotypes for the likelihoods is given by:
F(j/k) = (k*(k+1)/2)+j. In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the
ordering is: AA,AB,BB,AC,BC,CC, etc. For example: GT:GL 0/1:-323.03,-99.29,-802.53 (Floats)

Note: the VCF written in this notebook uses the PL field rather than GL, with its values listed in this same
genotype order; `gl_mapping` encodes F(j/k) for genotypes over up to four alleles.

In [78]:
# NOTE(review): exploratory cell, executed out of order (In [78]). It needs
# vcf_df to be a DataFrame of genotype strings, but GenoToPandasToVCF as
# defined in this notebook has no return statement, so the vcf_df assigned
# above is None — confirm whether this cell can be removed.
gl_df = vcf_df.applymap(lambda x: code_genotype_likelihoods(x, gl_mapping))

In [76]:
# Spot check: '1/3' maps to index 7 in gl_mapping, so only the eighth value
# of the encoded string should be 0.
code_genotype_likelihoods("1/3", gl_mapping)

'25,25,25,25,25,25,25,0,25,25'

### Scaffolds file

In [16]:
# Sample IDs in metadata order — the same column GenoToPandasToVCF uses to
# name the VCF sample columns.
sample_id_order_vcf = metadata['sample_id'].to_numpy()

In [33]:
# Reload the phased haplotypes for the region; they form the scaffold.
haps = ag3.haplotypes(
    region=region,
    sample_sets=cohorts,
    analysis='gamb_colu_arab',
)

# Flag the haplotype samples that also occur among the VCF samples.
sample_id_order_haps = haps['sample_id'].compute().values
bool_ = np.isin(sample_id_order_haps, sample_id_order_vcf)

# Segregating sites, determined from the allele counts over all haplotype samples.
hap_genotypes = allel.GenotypeArray(haps['call_genotype'])
hap_ac = hap_genotypes.count_alleles()
hap_seg = hap_ac.is_segregating()

# Apply both selections in one step.
haps = haps.sel(samples=bool_, variants=hap_seg)

In [34]:
# Alleles of the retained haplotype variants, converted to str.
# NOTE: this rebinds `alleles`, replacing the SNP-call alleles that were
# passed to GenoToPandasToVCF earlier in the notebook.
alleles = haps['variant_allele'].compute().astype(str)

In [35]:
# Variant identifiers of the form "SNP<position>", one per haplotype variant.
hap_positions = haps['variant_position'].values
ids = np.array([f"SNP{position}" for position in hap_positions.astype(str)])

In [36]:
# Five leading variant columns, one row per haplotype variant.
variant_columns = {
    '#CHROM': contig,
    'ID': ids,
    'POS': haps['variant_position'].values,
    'REF': alleles[:,0],
    'ALT': alleles[:,1],
}
haps_df = pd.DataFrame(variant_columns)

# Flatten the genotype calls to a variants x haplotypes matrix.
hap_calls = allel.GenotypeArray(haps['call_genotype'])
haps_geno = hap_calls.to_haplotypes()
haps_geno_df = pd.DataFrame(haps_geno)

# Variant columns on the left, haplotype columns on the right.
allhaps = pd.concat((haps_df, haps_geno_df), axis=1)

In [37]:
# Write the haplotype sample IDs, one per line, with no header or index.
# NOTE(review): this path starts at "results/", whereas the VCF earlier in the
# notebook is written under "../../results/" — confirm the intended working
# directory.
pd.Series(haps['sample_id'].values).to_csv(f"results/phasing/{dataset}.sample", index=False, header=False)

In [39]:
# Write the scaffold table tab-separated, with no header row and no index:
# the five variant columns followed by the haplotype columns.
allhaps.to_csv(f"results/phasing/{dataset}.haps", sep="\t", header=False, index=False)

In [40]:
# Display the combined scaffold table as a visual check.
allhaps

Unnamed: 0,#CHROM,ID,POS,REF,ALT,0,1,2,3,4,...,4396,4397,4398,4399,4400,4401,4402,4403,4404,4405
0,2L,SNP28520002,28520002,C,T,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2L,SNP28520008,28520008,A,G,1,1,1,1,0,...,0,0,0,1,1,1,1,1,1,1
2,2L,SNP28520016,28520016,A,C,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2L,SNP28520017,28520017,G,A,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2L,SNP28520022,28520022,T,C,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12063,2L,SNP28579975,28579975,C,T,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12064,2L,SNP28579976,28579976,G,T,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
12065,2L,SNP28579986,28579986,C,T,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12066,2L,SNP28579991,28579991,G,A,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
