turn on default annotation clustering

psathyrella · Oct 12, 2020 · 8c7c9e0 · 8c7c9e0
1 parent a4196ea
commit 8c7c9e0
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 34 deletions.
diff --git a/bin/partis b/bin/partis
@@ -522,9 +522,8 @@ parent_parser.add_argument('--seed', type=int, default=int(time.time()), help='R
 parent_parser.add_argument('--min-observations-per-gene', type=int, default=20, help='If a gene is observed fewer times than this, we average over other alleles/primary versions/etc. (e.g. in recombinator and hmmwriter). Also used as a more general "this isn\'t very many observations" value.')
 parent_parser.add_argument('--no-per-base-mfreqs', action='store_true', help='When making the HMM model files, instead of the default of accounting for different rates to different bases (i.e. A->T vs A->G), do *not* account for the different rates to different bases. This is only really useful for testing the new simulation option --per-base-mutation.')
 parent_parser.add_argument('--region-end-exclusion-length', type=int, default=0, help='when counting/writing parameters, ignore this many bases abutting non-templated insertions for calculating mutation frequencies (note: doesn\'t make a difference (hence set to 0 by default) probably because we\'re setting a strongish prior on these bases when writing hmms anyway')
-# parent_parser.add_argument('--correct-multi-hmm-boundaries', action='store_true', help='Turn of multi-hmm boundary correction (see https://github.com/psathyrella/partis/issues/308). ')
 parent_parser.add_argument('--allow-conserved-codon-deletion', action='store_true', help='When building hmm yaml model files during parameter caching, allow deletions that extend through the conserved codons (cyst in V and tryp/phen in J) (by default such deletions are forbidden; see https://github.com/psathyrella/partis/issues/308). NOTE that this has *no* effect if you\'ve already cached parameters.')
-parent_parser.add_argument('--subcluster-annotation-size', type=int, help='when running the bcrham viterbi algorithm, instead of running the multi-hmm on the entire family, instead split the family into subclusters of (about) this size, then replace each subcluster with its naive sequence and iterate until running on once cluster of (about) this size consisting entirely of inferred naive/ancestor sequences.')
+parent_parser.add_argument('--subcluster-annotation-size', type=int, default=3, help='when running the bcrham viterbi algorithm, instead of running the multi-hmm on the entire family, instead split the family into subclusters of (about) this size, then replace each subcluster with its naive sequence and iterate until running on once cluster of (about) this size consisting entirely of inferred naive/ancestor sequences.')
 
 parent_parser.add_argument('--only-genes', help='Colon-separated list of genes to which to restrict the analysis. If any regions (V/D/J) are not represented among these genes, these regions are left unrestricted. If running \'simulate\', you probably also want to set --force-dont-generate-germline-set.')
 parent_parser.add_argument('--n-max-per-region', default='3:5:2', help='Number of best smith-waterman matches (per region, in the format v:d:j) to pass on to the hmm')

diff --git a/bin/run-fix-super-long-insertions.py b/bin/run-fix-super-long-insertions.py
@@ -6,49 +6,58 @@
 
 import utils
 
-label = 'subcluster-annotation-v1'  # NOTE already made -v2
+label = 'subcluster-annotation-v5'
 bdir = '%s/partis/fix-super-long-insertions/%s' % ('/fh/local/dralph', label) # os.getenv('fs')
-n_trees = 500 #300 #10 #50
+n_trees = 300 #500 #10 #50
 n_procs = 25 #10
-ntseq = 10000 #3000 #
-mmstr = '--mutation-multiplier 2' #7' #1' # 10
+ntseq = 3000 #10000 #
+mmstr = '--mutation-multiplier 7' #7' #1' # 10
 
 dryrun = False #True
 
-# for npg, npgvar in [(50, 98), ]: #[(50, 98), (300, 598)]:
-#     outdir = '%s/npg-%d-width-%d' % (bdir, npg, npgvar)
-#     common = '--is-simu --infname %s/simu.yaml --parameter-dir %s/parameters --only-csv-plots --n-procs %d' % (outdir, outdir, n_procs)
-#     cmd = './bin/bcr-phylo-run.py --base-outdir %s/bcr-phylo --actions simu --n-sim-seqs-per-generation %d --carry-cap 1000 --obs-times 100 --n-sim-events %d --n-procs %d --only-csv-plots' % (outdir, npg, n_trees, n_procs)
-#     cmd += ' --parameter-variances n-sim-seqs-per-generation,%d' % npgvar
-#     # utils.simplerun(cmd, logfname='%s/bcr-phylo.log'%outdir, dryrun=dryrun)
-#     # cmd = 'cat %s/bcr-phylo/selection/simu/event-*/simu.nwk |sed \'s/;/;\\n/g\' >%s/all-trees.nwk' % (outdir, outdir)
-#     # utils.simplerun(cmd, shell=True, dryrun=dryrun) #, logfname='%s/tree-cat.log'%outdir)
-#     # cmd = './bin/partis simulate --n-sim-events %d --outfname %s/simu.yaml --simulate-from-scratch --mutate-conserved-codons %s --input-simulation-treefname %s/all-trees.nwk --n-procs %d' % (ntseq / npg, outdir, mmstr, outdir, n_procs)
-#     # utils.simplerun(cmd, logfname='%s/simulate.log'%outdir, dryrun=dryrun)
-#     # cmd = './bin/partis cache-parameters %s --plotdir %s/parameter-plots' % (common, outdir)
-#     # utils.simplerun(cmd, logfname='%s/cache-parameters.log'%(outdir), dryrun=dryrun)
-#     for use_kmean in [False, True]:
-#         for acsize in [None, 3, 5, 10]: #[None, 2, 3, 5, 10, 15]:
-#             bstr = 'subc-%s_kmean-%s' % (acsize, use_kmean)
+# # for npg, npgvar in [(5, 2), ]: #[(50, 98), (300, 598)]:
+# npgvar = None
+# for use_bcr_phylo in [False]: #[True, False]:
+#     for npg in [50]:
+#         # outdir = '%s/npg-%d-width-%d' % (bdir, npg, npgvar)
+#         outdir = '%s/npg-%d-bcrp-%s' % (bdir, npg, use_bcr_phylo)
+#         common = '--is-simu --infname %s/simu.yaml --parameter-dir %s/parameters --only-csv-plots --n-procs %d' % (outdir, outdir, n_procs)
+#         if use_bcr_phylo:
+#             cmd = './bin/bcr-phylo-run.py --base-outdir %s/bcr-phylo --actions simu --n-sim-seqs-per-generation %d --carry-cap 1000 --obs-times 100 --n-sim-events %d --n-procs %d --only-csv-plots' % (outdir, npg, n_trees, n_procs)
+#             # cmd += ' --parameter-variances n-sim-seqs-per-generation,%d' % npgvar
+#             utils.simplerun(cmd, logfname='%s/bcr-phylo.log'%outdir, dryrun=dryrun)
+#             cmd = 'cat %s/bcr-phylo/selection/simu/event-*/simu.nwk |sed \'s/;/;\\n/g\' >%s/all-trees.nwk' % (outdir, outdir)
+#             utils.simplerun(cmd, shell=True, dryrun=dryrun) #, logfname='%s/tree-cat.log'%outdir)
+#         cmd = './bin/partis simulate --n-sim-events %d --outfname %s/simu.yaml --simulate-from-scratch --mutate-conserved-codons %s --n-procs %d' % (ntseq / npg, outdir, mmstr, n_procs)
+#         if use_bcr_phylo:
+#             cmd += ' --input-simulation-treefname %s/all-trees.nwk' % outdir
+#         else:
+#             assert npgvar is None
+#             cmd += ' --constant-number-of-leaves'
+#         utils.simplerun(cmd, logfname='%s/simulate.log'%outdir, dryrun=dryrun)
+#         cmd = './bin/partis cache-parameters %s --plotdir %s/parameter-plots' % (common, outdir)
+#         utils.simplerun(cmd, logfname='%s/cache-parameters.log'%(outdir), dryrun=dryrun)
+#         for acsize in [3, 999]: #[None, 3, 5, 10]:
+#             bstr = 'subc-%s' % (acsize)
 #             cmd = './bin/partis annotate --simultaneous-true-clonal-seqs --plot-annotation-performance %s --outfname %s/%s/annotations.yaml --plotdir %s/%s/plots' % (common, outdir, bstr, outdir, bstr)
 #             if acsize is not None:
 #                 cmd += ' --subcluster-annotation-size %d' % acsize
-#             if use_kmean:
-#                 cmd += ' --kmeans-subclusters'
 #             utils.simplerun(cmd, logfname='%s/annotate-%s.log'%(outdir, bstr), dryrun=dryrun)
 # sys.exit()
 
-npg = 50; npgvar = 98
-# npg = 300; npgvar = 598
-p_strs = ['subc-%s'%sc for sc in ['None', 10, 5, 3]]
-b_strs = ['kmean-False'] #, 'kmean-True']
-xtrastr = '-kmean-False'
+# npg = 5; npgvar = 2
+npg = 50
+p_strs = ['subc-%s'%sc for sc in [999, 3]] #['None', 10, 5, 3]]
+b_strs = ['bcrp-False'] #, 'bcrp-True']
+xtrastr = '' #'-ceil' #'-' + str(b_strs[0])
 # pt=mute-freqs/overall; subd=parameter-plots/true
 subd = 'plots/hmm'
 for pt in ['gene-call', 'boundaries', 'mutation']:
-    cmd = './bin/compare-plotdirs.py --outdir %s/cf-plots/npg-%d-width-%d%s/%s --normalize --performance-plots --extra-stats absmean --translegend=-0.6:-0.1 ' % (bdir.replace('/fh/local', '/fh/fast/matsen_e'), npg, npgvar, xtrastr, pt)
-    names = ['%s%s%s'%(p, '' if b=='' else '_', b) for p in p_strs for b in b_strs]
-    plotdirs = ['%s/npg-%d-width-%d/%s%s%s/%s/%s' % (bdir, npg, npgvar, p, '' if b=='' else '_', b, subd, pt) for p in p_strs for b in b_strs]
+    npgstr = 'npg-%d%s' % (npg, xtrastr)
+    cmd = './bin/compare-plotdirs.py --outdir %s/cf-plots/%s/%s --normalize --performance-plots --extra-stats absmean --translegend=-0.6:-0.1 ' % (bdir.replace('/fh/local', '/fh/fast/matsen_e'), npgstr, pt)
+    names = ['%s%s%s'%(p, '' if b=='' else '-', b) for p in p_strs for b in b_strs]
+    # plotdirs = ['%s/%s/%s%s%s/%s/%s' % (bdir, npgstr, b, p, '' if b=='' else '-', b, subd, pt) for p in p_strs for b in b_strs]
+    plotdirs = ['%s/%s-%s/%s/%s/%s' % (bdir, npgstr, b, p, subd, pt) for p in p_strs for b in b_strs]
     cmd += ' --names %s --plotdirs %s' % (':'.join(names), ':'.join(plotdirs))
     utils.simplerun(cmd)
 

diff --git a/python/partitiondriver.py b/python/partitiondriver.py
@@ -1095,7 +1095,8 @@ def get_cmd_str(iproc):  # all this does at this point is replace workdir with s
 
     # ----------------------------------------------------------------------------------------
     def subcl_split(self, n_seqs):  # return true if we want to split cluster of size <n_seqs> into subclusters for purposes of annotation accuracy
-        return n_seqs >= 2 * self.args.subcluster_annotation_size  # don't subclusterify unless there's really enough for two clusters
+        return n_seqs > self.args.subcluster_annotation_size
+        # return n_seqs >= 2 * self.args.subcluster_annotation_size  # used to do it this way, and there's pluses and minuses to both, but it turns out it's better to split smaller clusters
 
     # ----------------------------------------------------------------------------------------
     def run_subcluster_annotate(self, init_partition, parameter_in_dir, n_procs, count_parameters=False, parameter_out_dir='', dont_print_annotations=False, debug=True):  # NOTE nothing to do with subcluster naive seqs above
@@ -1112,12 +1113,13 @@ def getsubclusters(superclust, shuffle=False, sdbg=False):  # NOTE similar to co
             superclust = copy.deepcopy(superclust)  # should only need this if we shuffle, but you *really* don't want to modify it since if you change the clusters in <init_partition> you'll screw up the actual partition, and if you change the ones in <clusters_still_to_do> you'll end up losing them cause the uidstrs won't be right
             if shuffle:
                 random.shuffle(superclust)
-            n_clusters = len(superclust) / self.args.subcluster_annotation_size  # integer division truncates the decimal
+            # n_clusters = len(superclust) / self.args.subcluster_annotation_size  # old way: truncates the decimal (see note in subcl_split())
+            n_clusters = int(math.ceil(len(superclust) / float(self.args.subcluster_annotation_size)))  # taking the ceiling keeps the max un-split cluster size equal to <self.args.subcluster_annotation_size> (rather than one less than twice that)
             if n_clusters < 2:  # initially we don't call this function unless superclust is big enough for two clusters of size self.args.subcluster_annotation_size, but on subsequent rounds the clusters fom naive_ancestor_hashes can be smaller than that
                 return [superclust]
 
             if False: # self.args.kmeans_subclusters:  # this gives you clusters that are "tighter" -- i.e. clusters similar sequences together
-                import mds  # this works fine, but it's not really different (sometimes a bit better, sometimes a bit worse) than the simple way. Which is weird, I would think it would help? but otoh it gives you non-equal-sized clusters, which sometimes i think is worse, although sometimes i also think is better
+                import mds  # this works fine, but it's not really different (on balance with kmeans is probably a bit worse) than the simple way. Which is weird, I would think it would help? but otoh it gives you non-equal-sized clusters, which sometimes i think is worse, although sometimes i also think is better
                 seqfos = [{'name' : u, 'seq' : self.sw_info[u]['seqs'][0]} for u in superclust]
                 return_clusts = mds.run_sklearn_mds(None, n_clusters, seqfos, self.args.seed, aligned=True)
             else: