## Setup

In [1]:
#!pip install -q --no-warn-conflicts malariagen_data -U

This file accesses MalariaGen Pf7 data QC pass data at the variant level to calculate site missingness across all variants. It accesses Pf7 data stored via the cloud, which requires registration on the malariagen website and subsequent gcloud logins during each session. 

In [None]:
!gcloud auth application-default login

In [2]:
import numpy as np
import dask
import dask.array as da
from dask.diagnostics.progress import ProgressBar
import allel
# silence some warnings
dask.config.set(**{'array.slicing.split_large_chunks': False})
import malariagen_data
import collections
import pandas as pd
import xarray as xr

pf7 = malariagen_data.Pf7()
pf7_metadata = pf7.sample_metadata()
#pf7_metadata = pd.read_csv('Pf7_samples.txt', sep = '\t')
variant_dataset = pf7.variant_calls()
genome_features0 = pf7.genome_features()
# EXTENDED VARIANT DATA -- variant_ANN_Annotation_Impact
extended_variant_dataset = pf7.variant_calls(extended=True)

In [10]:
def pass_rate(stage):
  # read in stage-specific gene set
  #fname = stage + '.txt'
  #gene_set = np.loadtxt(fname, dtype = str)
  gene_set = stage
  print(stage, len(gene_set))
  # mark which genes are in gene set from BOOL
  in_set = [(i in gene_set) for i in genome_features0['ID']]
  # subset gene set features from list of all 3D7 genome features
  genome_features = genome_features0[in_set]
  contigs = genome_features['contig']
  start = genome_features['start']
  end = genome_features['end']

  # APPROACH: PREPARE ALL MASKS AND COMBINE THEM TOGETHER AT END
  # load in filter pass flags
  fpass = variant_dataset['variant_filter_pass'].data
  # variant pass dataset --> keep SNPs only (same for all)
  # first: snp mask (filter array for SNPs only)
  snp_mask = variant_dataset['variant_is_snp'].data

  # coding mask: ensure variant falls in coding region
  coding_mask = variant_dataset['variant_CDS'].data

  pass_rates = []
  # iterate through all gene coordinates pulled from reference genome
  for i, j, k in zip(contigs, start, end):
    # mask those on same contig
    contig_mask = variant_dataset['variant_chrom'].data == i
    # mask for position
    pos_mask = da.isin(variant_dataset['variant_position'].data, np.arange(j, k))
    # combine masks
    gene_mask = da.logical_and(contig_mask, pos_mask)
    gene_mask_coding = da.logical_and(gene_mask, coding_mask)
    mask_final = da.logical_and(snp_mask, gene_mask_coding)
    fpass_gene = fpass[mask_final.compute()]
    pass_rate0 = da.sum(fpass_gene).compute()/fpass_gene.size
    pass_rates.append(pass_rate0)
    #print(pass_rate0)
  return(pass_rates)

pass_rate('PF3D7_0829600')

PF3D7_0829600 13


[0.8448275862068966]

In [None]:
# read in gene sets, e.g. sets of genes expressed / detectable in assay with preliminary breadth labels 
# these labels will be changes later (after analysis)
stages = ['gene_sets_assay/' + str(i) for i in range(1,7)]

# calculate pass rates for each gene set loaded in 
rates_all = []
for i in stages:
  rates_all.append(pass_rate(i))

rates_all[1]
# save all to csv 
rdf = pd.DataFrame(rates_all).T
rdf.columns = stages
rdf.head()
rdf.to_csv('missing_by_life_breadth.csv')

In [263]:
!grep -v -F -f  gene_sets_all/all.txt props_adj_breadth.txt | wc -l

^C


In [None]:
rates_all = pass_rate('gene_sets_all/all')

In [None]:
# runIF new session for analysis
import pandas as pd
rdf = pd.read_csv('missing_by_life_breadth.csv', index_col=0)
#stages = ['ookinete', 'gametocyte', 'sporozoite', 'schizont', 'trophozoite', 'ring']
stages = range(1,7)
rdf.columns = stages
rdf.head(10)

Unnamed: 0,1,2,3,4,5,6
0,0.0,0.0,0.821739,0.0,0.783599,0.826748
1,0.0,0.680769,0.836914,0.83047,0.657718,0.814491
2,0.0,0.794872,0.734072,0.740634,0.820652,0.784768
3,0.76781,0.864286,0.849462,0.79803,0.836263,0.823819
4,0.844311,0.833333,0.80531,0.805556,0.836237,0.775079
5,0.792453,0.808824,0.753425,0.801158,0.837017,0.789062
6,0.684211,0.757353,0.81383,0.829431,0.835667,0.719178
7,0.786325,0.818966,0.619718,0.801762,0.842844,0.769231
8,0.036269,0.861789,0.75188,0.829604,0.781457,0.853211
9,0.0,0.0,0.764228,0.693548,0.762376,0.796316


In [None]:
# combine with DF containing site counts 
props = pd.read_csv('PlasmoDB-61_Pfalciparum3D7_AnnotatedCDSs_SynAndNonsynSiteCount.txt', sep = '\t')
rdfm = pd.melt(rdf)
rdfm = rdfm[~np.isnan(rdfm['value'])] # filter out junk cells from melting procedure 
rdfm['ID'] = genes['value']
props = props.merge(rdfm, left_on='GENE', right_on='ID')
props['coding_length_adj'] = props['TOTAL_CODING_LENGTH'] * props['value']
props.to_csv('props_adj_breadth.txt', index=False, sep='\t')

In [85]:
props = pd.read_csv("props_adj.txt", sep='\t')
props.head()

Unnamed: 0,GENE,TRANS,NAME,NS,SYN,FFD,TOTAL_CODING_LENGTH,PROP_NS,PROP_SYN,PROP_FFD,STOP_CODONS,FULL_LENGTH,COORD,variable,value,ID,coding_length_adj
0,PF3D7_1137800,PF3D7_1137800.1,sporozoite surface protein essential for liver...,342.833333,113.166667,62,456,0.751827,0.248173,0.135965,1,627,location=Pf3D7_11_v3:1480900-1481526(-),sporozoite,0.844828,PF3D7_1137800,385.241379
1,PF3D7_0932200,PF3D7_0932200.1,profilin,403.333333,112.666667,65,516,0.781654,0.218346,0.125969,1,1365,location=Pf3D7_09_v3:1287008-1288372(+),schizont,0.801178,PF3D7_0932200,413.40774
2,PF3D7_0901500,PF3D7_0901500.1,rifin,853.166667,244.833333,137,1098,0.777019,0.222981,0.124772,1,1338,location=Pf3D7_09_v3:74409-75746(+),sporozoite,0.839071,PF3D7_0901500,921.3
3,PF3D7_1116900,PF3D7_1116900.1,"conserved protein, unknown function",499.0,140.0,79,639,0.780908,0.219092,0.123631,1,1438,location=Pf3D7_11_v3:644100-645537(-),sporozoite,0.754098,PF3D7_1116900,481.868852
4,PF3D7_1245800,PF3D7_1245800.1,"epsin-like protein, putative",1008.833333,287.166667,152,1296,0.778421,0.221579,0.117284,1,1616,location=Pf3D7_12_v3:1908149-1909764(+),gametocyte,0.842589,PF3D7_1245800,1091.995587
