Skip to content

Commit

Permalink
Pull in Encore's changes to Manhattan json-maker
Browse files Browse the repository at this point in the history
  • Loading branch information
pjvandehaar committed Feb 16, 2017
1 parent 7fce35c commit 029baa8
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 87 deletions.
196 changes: 109 additions & 87 deletions pheweb/load/make_manhattan.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@

'''
This script creates json files which can be used to render Manhattan plots.
'''

# TODO: copy MAX_NUM_UNBINNED from ENCORE
# TODO: combine with QQ
# TODO: combine with QQ?

'''
copying from Encore:
- Do I want JSONOutFile? Does it make code shorter?
- Can I keep process_file() as-is and make run() Pool.map() it?
- then add a decorator "star_args_kwargs"
'''


from .. import utils
Expand All @@ -14,39 +22,21 @@
import datetime
import multiprocessing
import csv
import collections
import heapq
import boltons.fileutils


BIN_LENGTH = int(3e6)
NEGLOG10_PVAL_BIN_SIZE = 0.05 # Use 0.05, 0.1, 0.15, etc
NEGLOG10_PVAL_BIN_DIGITS = 2 # Then round to this many digits
BIN_THRESHOLD = 1e-4 # pvals less than this threshold don't get binned.
def rounded_neglog10(pval, neglog10_pval_bin_size, neglog10_pval_bin_digits):
return round(-math.log10(pval) // neglog10_pval_bin_size * neglog10_pval_bin_size, neglog10_pval_bin_digits)

# TODO: just switch to pandas.read_csv
def get_variants(f):
reader = csv.DictReader(f, delimiter='\t')
for v in reader:
v['pos'] = int(v['pos'])
v['maf'] = float(v['maf'])
try:
v['pval'] = float(v['pval'])
except ValueError:
v['pval'] = 1
for key in ['beta', 'sebeta']:
try:
v[key] = float(v[key])
except (ValueError, KeyError):
continue
yield v

def rounded_neglog10(pval):
return round(-math.log10(pval) // NEGLOG10_PVAL_BIN_SIZE * NEGLOG10_PVAL_BIN_SIZE, NEGLOG10_PVAL_BIN_DIGITS)

def get_pvals_and_pval_extents(pvals):
def get_pvals_and_pval_extents(pvals, neglog10_pval_bin_size):
# expects that NEGLOG10_PVAL_BIN_SIZE is the distance between adjacent bins.
pvals = sorted(pvals)
extents = [[pvals[0], pvals[0]]]
for p in pvals:
if extents[-1][1] + NEGLOG10_PVAL_BIN_SIZE * 1.1 > p:
if extents[-1][1] + neglog10_pval_bin_size * 1.1 > p:
extents[-1][1] = p
else:
extents.append([p,p])
Expand All @@ -58,90 +48,122 @@ def get_pvals_and_pval_extents(pvals):
rv_pval_extents.append([start,end])
return (rv_pvals, rv_pval_extents)

def bin_variants(variants):
bins = []
unbinned_variants = []

for variant in variants:
if variant['pval'] < BIN_THRESHOLD:
variant['maf'] = utils.round_sig(variant['maf'], 3)
variant['pval'] = utils.round_sig(variant['pval'], 2)
unbinned_variants.append(variant)
# TODO: convert bins from {(chrom, pos): []} to {chrom:{pos:[]}}?
def bin_variants(variant_iterator, bin_length, n_unbinned, neglog10_pval_bin_size, neglog10_pval_bin_digits):
bins = {}
unbinned_variant_heap = []
chrom_n_bins = {}

def box(x):
return (-x.pval, x)

def unbox(x):
return x[1]

def bin_variant(variant):
chrom = variant.chrom
chrom_key = utils.chrom_order[chrom]
pos_bin = variant.pos // bin_length
chrom_n_bins[chrom_key] = max(chrom_n_bins.get(chrom_key,0), pos_bin)
if (chrom_key, pos_bin) in bins:
bin = bins[(chrom_key, pos_bin)]

else:
if len(bins) == 0 or variant['chrom'] != bins[-1]['chrom']:
# We need a new bin, starting with this variant.
bins.append({
'chrom': variant['chrom'],
'startpos': variant['pos'],
'neglog10_pvals': set(),
})
elif variant['pos'] > bins[-1]['startpos'] + BIN_LENGTH:
# We need a new bin following the last one.
bins.append({
'chrom': variant['chrom'],
'startpos': bins[-1]['startpos'] + BIN_LENGTH,
'neglog10_pvals': set(),
})
bins[-1]['neglog10_pvals'].add(rounded_neglog10(variant['pval']))

bins = [b for b in bins if len(b['neglog10_pvals']) != 0]
for b in bins:
b['neglog10_pvals'], b['neglog10_pval_extents'] = get_pvals_and_pval_extents(b['neglog10_pvals'])
b['pos'] = int(b['startpos'] + BIN_LENGTH/2)
del b['startpos']

return bins, unbinned_variants


def label_genes_to_show(variants):
variants_by_gene = {}
for v in variants:
if v['pval'] < 1e-5: # This is my arbitrary cutoff.
for gene in v['nearest_genes'].split(','):
variants_by_gene.setdefault(gene, []).append(v)

for variants_in_gene in variants_by_gene.values():
best_variant = min(variants_in_gene, key=lambda v: v['pval'])
best_variant['show_gene'] = True


def make_json_file(args):
src_filename, dest_filename, tmp_filename = args['src'], args['dest'], args['tmp']
bin = {"chrom": chrom,
"startpos": pos_bin * bin_length,
"neglog10_pvals": set()}
bins[(chrom_key, pos_bin)] = bin
bin["neglog10_pvals"].add(rounded_neglog10(variant.pval, neglog10_pval_bin_size, neglog10_pval_bin_digits))

# put most-significant variants into the heap and bin the rest
for variant in variant_iterator:
heapq.heappush(unbinned_variant_heap, box(variant))
if len(unbinned_variant_heap) > n_unbinned:
old = heapq.heappop(unbinned_variant_heap)
bin_variant(unbox(old))

unbinned_variants = []
for variant in (unbox(x) for x in unbinned_variant_heap):
rec = variant.other
rec['chrom'] = variant.chrom
rec['pos'] = variant.pos
rec['pval'] = utils.round_sig(variant.pval, 2)
unbinned_variants.append(rec)

# unroll bins into simple array (preserving chromosomal order)
binned_variants = []
for chrom_key in sorted(chrom_n_bins.keys()):
for pos_key in range(int(1+chrom_n_bins[chrom_key])):
b = bins.get((chrom_key, pos_key), None)
if b and len(b['neglog10_pvals']) != 0:
b['neglog10_pvals'], b['neglog10_pval_extents'] = get_pvals_and_pval_extents(b['neglog10_pvals'], neglog10_pval_bin_size)
b['pos'] = int(b['startpos'] + bin_length/2)
del b['startpos']
binned_variants.append(b)

return binned_variants, unbinned_variants


AssocResult = collections.namedtuple('AssocResult', 'chrom pos pval other'.split())
def get_variants(f):
reader = csv.DictReader(f, delimiter='\t')
for v in reader:
chrom = v.pop('chrom')
pos = int(v.pop('pos'))
try:
pval = float(v.pop('pval'))
except ValueError:
continue
for key in ['maf', 'beta', 'sebeta']:
try:
v[key] = float(v[key])
except (ValueError, KeyError):
continue
yield AssocResult(chrom, pos, pval, v)


@utils.star_kwargs
def make_json_file(src_filename, dest_filename):

BIN_LENGTH = int(3e6)
NEGLOG10_PVAL_BIN_SIZE = 0.05 # Use 0.05, 0.1, 0.15, etc
NEGLOG10_PVAL_BIN_DIGITS = 2 # Then round to this many digits
N_UNBINNED = 2000

with open(src_filename) as f:
variants = get_variants(f)
variant_bins, unbinned_variants = bin_variants(variants)

label_genes_to_show(unbinned_variants)
variant_bins, unbinned_variants = bin_variants(
variants, BIN_LENGTH, N_UNBINNED, NEGLOG10_PVAL_BIN_SIZE, NEGLOG10_PVAL_BIN_DIGITS)

rv = {
'variant_bins': variant_bins,
'unbinned_variants': unbinned_variants,
}

# Avoid getting killed while writing dest_filename, to stay idempotent despite me frequently killing the program
with open(tmp_filename, 'w') as f:
json.dump(rv, f, sort_keys=True, indent=0)
f.flush()
os.fsync(f.fileno()) # Recommended by <http://stackoverflow.com/a/2333979/1166306>
tmp_fname = os.path.join(conf.data_dir, 'tmp', 'manhattan-' + os.path.basename(dest_filename))
with boltons.fileutils.AtomicSaver(dest_filename, text_mode=True, part_file=tmp_fname) as f:
json.dump(rv, f)

print('{}\t{} -> {}'.format(datetime.datetime.now(), src_filename, dest_filename))
os.rename(tmp_filename, dest_filename)


def get_conversions_to_do():
phenocodes = [pheno['phenocode'] for pheno in utils.get_phenolist()]
for phenocode in phenocodes:
src_filename = os.path.join(conf.data_dir, 'augmented_pheno', phenocode)
dest_filename = os.path.join(conf.data_dir, 'manhattan', '{}.json'.format(phenocode))
tmp_filename = os.path.join(conf.data_dir, 'tmp', 'manhattan-{}.json'.format(phenocode))
if not os.path.exists(dest_filename) or os.stat(dest_filename).st_mtime < os.stat(src_filename).st_mtime:
yield {'src':src_filename, 'dest':dest_filename, 'tmp':tmp_filename}
yield {
'src_filename': src_filename,
'dest_filename': dest_filename,
}


def run(argv):

utils.mkdir_p(conf.data_dir + '/manhattan')
utils.mkdir_p(conf.data_dir + '/tmp')
boltons.fileutils.mkdir_p(conf.data_dir + '/manhattan')
boltons.fileutils.mkdir_p(conf.data_dir + '/tmp')

conversions_to_do = list(get_conversions_to_do())
print('number of phenos to process:', len(conversions_to_do))
Expand Down
8 changes: 8 additions & 0 deletions pheweb/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import imp
import multiprocessing
import csv
import contextlib

conf = attrdict.AttrDict() # this gets populated by `ensure_conf_is_loaded()`, which is run-once and called at the bottom of this module.

Expand Down Expand Up @@ -218,6 +219,13 @@ def f2(*args, **kwargs):
return f2


def star_kwargs(f):
@functools.wraps(f)
def f2(kwargs):
return f(**kwargs)
return f2


def all_equal(iterator):
if isinstance(iterator, list): iterator = iter(iterator)
try:
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,6 @@ def readme():
'attrdict~=2.0',
'requests~=2.13',
'gunicorn~=19.6',
'boltons~=17.0`,
]
)

0 comments on commit 029baa8

Please sign in to comment.