Merge branch 'dev'

adding simulations working and reformatted
ruggleslab · Oct 24, 2019 · cd09e57 · cd09e57
2 parents 85a8a45 + 4827f08
commit cd09e57
Show file tree

Hide file tree

Showing 2 changed files with 132 additions and 27 deletions.
diff --git a/blacksheep/cli.py b/blacksheep/cli.py
@@ -482,7 +482,7 @@ def _make_parser():
         "--output_prefix",
         type=_check_output_prefix,
         default="simulated_pvals",
-        help="Output prefix for writing files. Default simulated_pvals. ",
+        help="Output prefix for writing files. Default is 'simulated_pvals'.",
     )
     simulations.add_argument(
         "--molecules",
@@ -491,6 +491,13 @@ def _make_parser():
         help="List of parent molecules of interest. Empty list or absence of "
              "argument defaults to all parent molecules in input file.",
     )
+    simulations.add_argument(
+        "--pval",
+        type=_bn0and1,
+        default=0.05,
+        help="p-value threshold for significant results. Must be between 0 and 1."
+        "Default is 0.05.",
+    )
 
     return parser
 
@@ -642,6 +649,7 @@ def _main(args: Optional[List[str]] = None):
             int(args.reps),
             args.output_prefix,
             args.molecules,
+            args.pval,
         )
 
     with open(parameters_file_name % args.output_prefix, "w") as fh:

diff --git a/blacksheep/simulate.py b/blacksheep/simulate.py
@@ -11,11 +11,68 @@
 '''
 
 import random
+import sys
+import argparse
 import numpy as np
 from scipy.stats import ttest_1samp
 from scipy.stats import percentileofscore
 from scipy.stats import gaussian_kde
 
+# Argparser, when testing on its own
+def _make_parser():
+    parser = argparse.ArgumentParser(prog="blacksheep", description="")
+    parser.add_argument("--version", "-v", action="version", version="%(prog)s 0.0.1")
+
+    parser.add_argument(
+        "values",
+        type=str,
+        help="File path to input values. Samples are columns and genes/sites are rows. Only .tsv "
+             "and .csv accepted.",
+    )
+    parser.add_argument(
+        "--ind_sep",
+        type=str,
+        default="-",
+        help="Delimiter between the parent molecule (e.g. a gene name such "
+             "as ATM) and a site identifier (e.g. S365). Default is -",
+    )
+    parser.add_argument(
+        "--iqrs",
+        type=float,
+        default=1.5,
+        help="Number of inter-quartile ranges (IQRs) above or below the "
+             "median to consider a value an outlier. Default is 1.5.",
+    )
+    parser.add_argument(
+        "--reps",
+        type=int,
+        default=1000000,
+        help="Number of repetitions for the simulation to perform. "
+             "Default is 1,000,000.",
+    )
+    parser.add_argument(
+        "--output_prefix",
+        type=str,
+        default="simulated_pvals",
+        help="Output prefix for writing files. Default is 'simulated_pvals'.",
+    )
+    parser.add_argument(
+        "--molecules",
+        nargs='+',
+        default=[],
+        help="List of parent molecules of interest. Empty list or absence of "
+             "argument defaults to all parent molecules in input file.",
+    )
+    parser.add_argument(
+        "--pval",
+        type=float,
+        default=0.05,
+        help="p-value threshold for significant results. Must be between 0 and 1."
+        "Default is 0.05.",
+    )
+
+    return parser
+
 def get_full_gene_list(ind_sep, infile):
 	# Gets a list of all molecules in the file
 	f = open(infile, 'r')
@@ -27,20 +84,26 @@ def get_full_gene_list(ind_sep, infile):
 		line = f.readline()
 	return genes
 
+
 def get_values(gene, ind_sep, infile):
 	# Digs around in the input file for the specified gene
 	f = open(infile,'r')
-	values = {}
-	missings = {}
+	values = {} # only non-missing values
+	missings = {} # missing values
+	all_values = {} # all values, including missing and present
 	line = f.readline()
 	total = len(line.split())-1
 	while line:
 		if line.split(ind_sep)[0] == gene and len(line.split()) > 1:
-			values[line.split()[0]] = [float(num) for num in line.split()[1:]]
-			missings[line.split()[0]] = 1-(len(values[line.split()[0]])/float(total))
+			temp_vals = [float(num) for num in line.split()[1:]]
+			if len(temp_vals) > 1:
+				values[line.split()[0]] = [float(num) for num in line.split()[1:]]
+				missings[line.split()[0]] = 1-(len(values[line.split()[0]])/float(total))
+				all_values[line.split()[0]] = line.rstrip('\n').split('\t')[1:]
 		line = f.readline()
 
-	return values, missings
+	return values, missings, all_values
+
 
 def outlier_thresholds(values, thresh):
 	# Fixes the value that is (threshold) IQR above the median for each phospho
@@ -52,6 +115,7 @@ def outlier_thresholds(values, thresh):
 		o_thresh[val] = outlier_threshold
 	return o_thresh
 
+
 def simulate(values, missings, o_thresh, reps):
 	# Generates a random value for each p-site and counts outliers
 	# Does this (reps) number of times
@@ -70,6 +134,7 @@ def simulate(values, missings, o_thresh, reps):
 		ol_list.append(outliers)
 	return ol_list
 
+
 def simulate_kde(values, missings, o_thresh, reps):
 	# Generates a KDE for each p-site from the existing values,
 	# generates a random value for each p-site from that KDE, and counts outliers
@@ -95,41 +160,73 @@ def simulate_kde(values, missings, o_thresh, reps):
 		ol_list.append(outliers)
 	return ol_list
 
-def alpha_thresh(ol_dist):
-	# Spits out outlier (p<0.05) number of outliers
-	return np.percentile(ol_dist,95)
+
+def alpha_thresh(ol_dist, pval):
+	# Spits out outlier number of outliers
+	pv = 100-(100*pval)
+	return np.percentile(ol_dist,pv)
 	#return np.mean(ol_dist)+1.64*np.std(ol_dist) # I think this is the 0.05, if you have a normal distribution, which you probably won't. 
-
-def run_simulations(infile, ind_sep, thresh, reps, outfile, genes):
 
+def generate_output_line(o_thresh, values, ol_dist, pval):
+	# Determines the p-value for each sample being significantly hyperphosphorylated
+	# for the given gene.
+
+	output_list = []
+
+	for s in range(len(list(values.values())[0])): # for each sample
+		tot_outliers = 0
+		for o in o_thresh.keys():
+			if values[o][s]:
+				if float(values[o][s]) > o_thresh[o]:
+					tot_outliers += 1
+		pv = round(((100-percentileofscore(ol_dist,tot_outliers-1,kind='weak'))/100.0),3) # pval for i+1 outliers
+		if pv <= pval:
+			output_list.append(str(pv))
+		else:
+			output_list.append('NS')
+
+	return output_list
+
+def run_simulations(infile, ind_sep, thresh, reps, outfile, genes, pval):
 	w = open(outfile+"_pvals.tsv", 'w')
 
-	# Write header line - not actually a thing
-	#f = open(infile, 'r')
-	#line = f.readline()
-	#w.write(line)
-	#f.close()
+	# writing header to file
+	f = open(infile, 'r')
+	w.write(f.readline())
+	f.close()
 
 	if len(genes) == 0:
 		genes = get_full_gene_list(ind_sep, infile)
 
 	for gene in genes: # Can make this much more efficient
 		# Get values for your gene from the input file
-		values, missings = get_values(gene, ind_sep, infile)
+		values, missings, all_values = get_values(gene, ind_sep, infile)
 		# Figure out the outlier threshold for each phosphosite
 		o_thresh = outlier_thresholds(values, thresh)
 		# Do the actual simulation
 		ol_dist = simulate_kde(values, missings, o_thresh, reps)
 		# Grab out the significance threshold for outliers
-		out_line = alpha_thresh(ol_dist)
-
+		out_line = alpha_thresh(ol_dist, pval)
+		
 		# Write to the output file
-		w.write(gene)
-		if int(out_line) == len(values):
-			pass
-		else:
-			for i in range(int(out_line), len(values)):
-			#for i in range(int(out_line)+1,len(values)+1): This is how it used to be. I think I was wrong? If something downstream is broken, though, look here for a possible cause.
-				w.write('\t'+str(i+1)+'\t'+str(round(((100-percentileofscore(ol_dist,i,kind='weak'))/100.0),3)))
-		w.write('\n')
+		if len(all_values) > 0: # won't write it out if the gene has no phosphosites with more than one value
+			w.write(gene+'\t')
+			to_write = generate_output_line(o_thresh, all_values, ol_dist, pval)
+			w.write('\t'.join(to_write)+'\n')
+
 	w.close()
+
+if __name__=='__main__':
+
+	args = sys.argv[1:]
+	args = _make_parser().parse_args(args) 
+
+	run_simulations(
+            args.values,
+            args.ind_sep,
+            args.iqrs,
+            int(args.reps),
+            args.output_prefix,
+            args.molecules,
+            args.pval,
+        )