Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
adding simulations working and reformatted
  • Loading branch information
liliblu committed Oct 24, 2019
2 parents 85a8a45 + 4827f08 commit cd09e57
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 27 deletions.
10 changes: 9 additions & 1 deletion blacksheep/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def _make_parser():
"--output_prefix",
type=_check_output_prefix,
default="simulated_pvals",
help="Output prefix for writing files. Default simulated_pvals. ",
help="Output prefix for writing files. Default is 'simulated_pvals'.",
)
simulations.add_argument(
"--molecules",
Expand All @@ -491,6 +491,13 @@ def _make_parser():
help="List of parent molecules of interest. Empty list or absence of "
"argument defaults to all parent molecules in input file.",
)
simulations.add_argument(
"--pval",
type=_bn0and1,
default=0.05,
help="p-value threshold for significant results. Must be between 0 and 1."
"Default is 0.05.",
)

return parser

Expand Down Expand Up @@ -642,6 +649,7 @@ def _main(args: Optional[List[str]] = None):
int(args.reps),
args.output_prefix,
args.molecules,
args.pval,
)

with open(parameters_file_name % args.output_prefix, "w") as fh:
Expand Down
149 changes: 123 additions & 26 deletions blacksheep/simulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,68 @@
'''

import random
import sys
import argparse
import numpy as np
from scipy.stats import ttest_1samp
from scipy.stats import percentileofscore
from scipy.stats import gaussian_kde

# Argparser, when testing on its own
def _make_parser():
parser = argparse.ArgumentParser(prog="blacksheep", description="")
parser.add_argument("--version", "-v", action="version", version="%(prog)s 0.0.1")

parser.add_argument(
"values",
type=str,
help="File path to input values. Samples are columns and genes/sites are rows. Only .tsv "
"and .csv accepted.",
)
parser.add_argument(
"--ind_sep",
type=str,
default="-",
help="Delimiter between the parent molecule (e.g. a gene name such "
"as ATM) and a site identifier (e.g. S365). Default is -",
)
parser.add_argument(
"--iqrs",
type=float,
default=1.5,
help="Number of inter-quartile ranges (IQRs) above or below the "
"median to consider a value an outlier. Default is 1.5.",
)
parser.add_argument(
"--reps",
type=int,
default=1000000,
help="Number of repetitions for the simulation to perform. "
"Default is 1,000,000.",
)
parser.add_argument(
"--output_prefix",
type=str,
default="simulated_pvals",
help="Output prefix for writing files. Default is 'simulated_pvals'.",
)
parser.add_argument(
"--molecules",
nargs='+',
default=[],
help="List of parent molecules of interest. Empty list or absence of "
"argument defaults to all parent molecules in input file.",
)
parser.add_argument(
"--pval",
type=float,
default=0.05,
help="p-value threshold for significant results. Must be between 0 and 1."
"Default is 0.05.",
)

return parser

def get_full_gene_list(ind_sep, infile):
# Gets a list of all molecules in the file
f = open(infile, 'r')
Expand All @@ -27,20 +84,26 @@ def get_full_gene_list(ind_sep, infile):
line = f.readline()
return genes


def get_values(gene, ind_sep, infile):
# Digs around in the input file for the specified gene
f = open(infile,'r')
values = {}
missings = {}
values = {} # only non-missing values
missings = {} # missing values
all_values = {} # all values, including missing and present
line = f.readline()
total = len(line.split())-1
while line:
if line.split(ind_sep)[0] == gene and len(line.split()) > 1:
values[line.split()[0]] = [float(num) for num in line.split()[1:]]
missings[line.split()[0]] = 1-(len(values[line.split()[0]])/float(total))
temp_vals = [float(num) for num in line.split()[1:]]
if len(temp_vals) > 1:
values[line.split()[0]] = [float(num) for num in line.split()[1:]]
missings[line.split()[0]] = 1-(len(values[line.split()[0]])/float(total))
all_values[line.split()[0]] = line.rstrip('\n').split('\t')[1:]
line = f.readline()

return values, missings
return values, missings, all_values


def outlier_thresholds(values, thresh):
# Fixes the value that is (threshold) IQR above the median for each phospho
Expand All @@ -52,6 +115,7 @@ def outlier_thresholds(values, thresh):
o_thresh[val] = outlier_threshold
return o_thresh


def simulate(values, missings, o_thresh, reps):
# Generates a random value for each p-site and counts outliers
# Does this (reps) number of times
Expand All @@ -70,6 +134,7 @@ def simulate(values, missings, o_thresh, reps):
ol_list.append(outliers)
return ol_list


def simulate_kde(values, missings, o_thresh, reps):
# Generates a KDE for each p-site from the existing values,
# generates a random value for each p-site from that KDE, and counts outliers
Expand All @@ -95,41 +160,73 @@ def simulate_kde(values, missings, o_thresh, reps):
ol_list.append(outliers)
return ol_list

def alpha_thresh(ol_dist):
# Spits out outlier (p<0.05) number of outliers
return np.percentile(ol_dist,95)

def alpha_thresh(ol_dist, pval):
# Spits out outlier number of outliers
pv = 100-(100*pval)
return np.percentile(ol_dist,pv)
#return np.mean(ol_dist)+1.64*np.std(ol_dist) # I think this is the 0.05, if you have a normal distribution, which you probably won't.

def run_simulations(infile, ind_sep, thresh, reps, outfile, genes):

def generate_output_line(o_thresh, values, ol_dist, pval):
# Determines the p-value for each sample being significantly hyperphosphorylated
# for the given gene.

output_list = []

for s in range(len(list(values.values())[0])): # for each sample
tot_outliers = 0
for o in o_thresh.keys():
if values[o][s]:
if float(values[o][s]) > o_thresh[o]:
tot_outliers += 1
pv = round(((100-percentileofscore(ol_dist,tot_outliers-1,kind='weak'))/100.0),3) # pval for i+1 outliers
if pv <= pval:
output_list.append(str(pv))
else:
output_list.append('NS')

return output_list

def run_simulations(infile, ind_sep, thresh, reps, outfile, genes, pval):
w = open(outfile+"_pvals.tsv", 'w')

# Write header line - not actually a thing
#f = open(infile, 'r')
#line = f.readline()
#w.write(line)
#f.close()
# writing header to file
f = open(infile, 'r')
w.write(f.readline())
f.close()

if len(genes) == 0:
genes = get_full_gene_list(ind_sep, infile)

for gene in genes: # Can make this much more efficient
# Get values for your gene from the input file
values, missings = get_values(gene, ind_sep, infile)
values, missings, all_values = get_values(gene, ind_sep, infile)
# Figure out the outlier threshold for each phosphosite
o_thresh = outlier_thresholds(values, thresh)
# Do the actual simulation
ol_dist = simulate_kde(values, missings, o_thresh, reps)
# Grab out the significance threshold for outliers
out_line = alpha_thresh(ol_dist)

out_line = alpha_thresh(ol_dist, pval)
# Write to the output file
w.write(gene)
if int(out_line) == len(values):
pass
else:
for i in range(int(out_line), len(values)):
#for i in range(int(out_line)+1,len(values)+1): This is how it used to be. I think I was wrong? If something downstream is broken, though, look here for a possible cause.
w.write('\t'+str(i+1)+'\t'+str(round(((100-percentileofscore(ol_dist,i,kind='weak'))/100.0),3)))
w.write('\n')
if len(all_values) > 0: # won't write it out if the gene has no phosphosites with more than one value
w.write(gene+'\t')
to_write = generate_output_line(o_thresh, all_values, ol_dist, pval)
w.write('\t'.join(to_write)+'\n')

w.close()

if __name__=='__main__':

args = sys.argv[1:]
args = _make_parser().parse_args(args)

run_simulations(
args.values,
args.ind_sep,
args.iqrs,
int(args.reps),
args.output_prefix,
args.molecules,
args.pval,
)

0 comments on commit cd09e57

Please sign in to comment.