Skip to content

Commit

Permalink
turn on default annotation clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
psathyrella committed Oct 12, 2020
1 parent a4196ea commit 8c7c9e0
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 34 deletions.
3 changes: 1 addition & 2 deletions bin/partis
Original file line number Diff line number Diff line change
Expand Up @@ -522,9 +522,8 @@ parent_parser.add_argument('--seed', type=int, default=int(time.time()), help='R
parent_parser.add_argument('--min-observations-per-gene', type=int, default=20, help='If a gene is observed fewer times than this, we average over other alleles/primary versions/etc. (e.g. in recombinator and hmmwriter). Also used as a more general "this isn\'t very many observations" value.')
parent_parser.add_argument('--no-per-base-mfreqs', action='store_true', help='When making the HMM model files, instead of the default of accounting for different rates to different bases (i.e. A->T vs A->G), do *not* account for the different rates to different bases. This is only really useful for testing the new simulation option --per-base-mutation.')
parent_parser.add_argument('--region-end-exclusion-length', type=int, default=0, help='when counting/writing parameters, ignore this many bases abutting non-templated insertions for calculating mutation frequencies (note: doesn\'t make a difference (hence set to 0 by default) probably because we\'re setting a strongish prior on these bases when writing hmms anyway')
# parent_parser.add_argument('--correct-multi-hmm-boundaries', action='store_true', help='Turn of multi-hmm boundary correction (see https://github.com/psathyrella/partis/issues/308). ')
parent_parser.add_argument('--allow-conserved-codon-deletion', action='store_true', help='When building hmm yaml model files during parameter caching, allow deletions that extend through the conserved codons (cyst in V and tryp/phen in J) (by default such deletions are forbidden; see https://github.com/psathyrella/partis/issues/308). NOTE that this has *no* effect if you\'ve already cached parameters.')
parent_parser.add_argument('--subcluster-annotation-size', type=int, help='when running the bcrham viterbi algorithm, instead of running the multi-hmm on the entire family, instead split the family into subclusters of (about) this size, then replace each subcluster with its naive sequence and iterate until running on once cluster of (about) this size consisting entirely of inferred naive/ancestor sequences.')
parent_parser.add_argument('--subcluster-annotation-size', type=int, default=3, help='when running the bcrham viterbi algorithm, instead of running the multi-hmm on the entire family, instead split the family into subclusters of (about) this size, then replace each subcluster with its naive sequence and iterate until running on once cluster of (about) this size consisting entirely of inferred naive/ancestor sequences.')

parent_parser.add_argument('--only-genes', help='Colon-separated list of genes to which to restrict the analysis. If any regions (V/D/J) are not represented among these genes, these regions are left unrestricted. If running \'simulate\', you probably also want to set --force-dont-generate-germline-set.')
parent_parser.add_argument('--n-max-per-region', default='3:5:2', help='Number of best smith-waterman matches (per region, in the format v:d:j) to pass on to the hmm')
Expand Down
67 changes: 38 additions & 29 deletions bin/run-fix-super-long-insertions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,49 +6,58 @@

import utils

label = 'subcluster-annotation-v1' # NOTE already made -v2
label = 'subcluster-annotation-v5'
bdir = '%s/partis/fix-super-long-insertions/%s' % ('/fh/local/dralph', label) # os.getenv('fs')
n_trees = 500 #300 #10 #50
n_trees = 300 #500 #10 #50
n_procs = 25 #10
ntseq = 10000 #3000 #
mmstr = '--mutation-multiplier 2' #7' #1' # 10
ntseq = 3000 #10000 #
mmstr = '--mutation-multiplier 7' #7' #1' # 10

dryrun = False #True

# for npg, npgvar in [(50, 98), ]: #[(50, 98), (300, 598)]:
# outdir = '%s/npg-%d-width-%d' % (bdir, npg, npgvar)
# common = '--is-simu --infname %s/simu.yaml --parameter-dir %s/parameters --only-csv-plots --n-procs %d' % (outdir, outdir, n_procs)
# cmd = './bin/bcr-phylo-run.py --base-outdir %s/bcr-phylo --actions simu --n-sim-seqs-per-generation %d --carry-cap 1000 --obs-times 100 --n-sim-events %d --n-procs %d --only-csv-plots' % (outdir, npg, n_trees, n_procs)
# cmd += ' --parameter-variances n-sim-seqs-per-generation,%d' % npgvar
# # utils.simplerun(cmd, logfname='%s/bcr-phylo.log'%outdir, dryrun=dryrun)
# # cmd = 'cat %s/bcr-phylo/selection/simu/event-*/simu.nwk |sed \'s/;/;\\n/g\' >%s/all-trees.nwk' % (outdir, outdir)
# # utils.simplerun(cmd, shell=True, dryrun=dryrun) #, logfname='%s/tree-cat.log'%outdir)
# # cmd = './bin/partis simulate --n-sim-events %d --outfname %s/simu.yaml --simulate-from-scratch --mutate-conserved-codons %s --input-simulation-treefname %s/all-trees.nwk --n-procs %d' % (ntseq / npg, outdir, mmstr, outdir, n_procs)
# # utils.simplerun(cmd, logfname='%s/simulate.log'%outdir, dryrun=dryrun)
# # cmd = './bin/partis cache-parameters %s --plotdir %s/parameter-plots' % (common, outdir)
# # utils.simplerun(cmd, logfname='%s/cache-parameters.log'%(outdir), dryrun=dryrun)
# for use_kmean in [False, True]:
# for acsize in [None, 3, 5, 10]: #[None, 2, 3, 5, 10, 15]:
# bstr = 'subc-%s_kmean-%s' % (acsize, use_kmean)
# # for npg, npgvar in [(5, 2), ]: #[(50, 98), (300, 598)]:
# npgvar = None
# for use_bcr_phylo in [False]: #[True, False]:
# for npg in [50]:
# # outdir = '%s/npg-%d-width-%d' % (bdir, npg, npgvar)
# outdir = '%s/npg-%d-bcrp-%s' % (bdir, npg, use_bcr_phylo)
# common = '--is-simu --infname %s/simu.yaml --parameter-dir %s/parameters --only-csv-plots --n-procs %d' % (outdir, outdir, n_procs)
# if use_bcr_phylo:
# cmd = './bin/bcr-phylo-run.py --base-outdir %s/bcr-phylo --actions simu --n-sim-seqs-per-generation %d --carry-cap 1000 --obs-times 100 --n-sim-events %d --n-procs %d --only-csv-plots' % (outdir, npg, n_trees, n_procs)
# # cmd += ' --parameter-variances n-sim-seqs-per-generation,%d' % npgvar
# utils.simplerun(cmd, logfname='%s/bcr-phylo.log'%outdir, dryrun=dryrun)
# cmd = 'cat %s/bcr-phylo/selection/simu/event-*/simu.nwk |sed \'s/;/;\\n/g\' >%s/all-trees.nwk' % (outdir, outdir)
# utils.simplerun(cmd, shell=True, dryrun=dryrun) #, logfname='%s/tree-cat.log'%outdir)
# cmd = './bin/partis simulate --n-sim-events %d --outfname %s/simu.yaml --simulate-from-scratch --mutate-conserved-codons %s --n-procs %d' % (ntseq / npg, outdir, mmstr, n_procs)
# if use_bcr_phylo:
# cmd += ' --input-simulation-treefname %s/all-trees.nwk' % outdir
# else:
# assert npgvar is None
# cmd += ' --constant-number-of-leaves'
# utils.simplerun(cmd, logfname='%s/simulate.log'%outdir, dryrun=dryrun)
# cmd = './bin/partis cache-parameters %s --plotdir %s/parameter-plots' % (common, outdir)
# utils.simplerun(cmd, logfname='%s/cache-parameters.log'%(outdir), dryrun=dryrun)
# for acsize in [3, 999]: #[None, 3, 5, 10]:
# bstr = 'subc-%s' % (acsize)
# cmd = './bin/partis annotate --simultaneous-true-clonal-seqs --plot-annotation-performance %s --outfname %s/%s/annotations.yaml --plotdir %s/%s/plots' % (common, outdir, bstr, outdir, bstr)
# if acsize is not None:
# cmd += ' --subcluster-annotation-size %d' % acsize
# if use_kmean:
# cmd += ' --kmeans-subclusters'
# utils.simplerun(cmd, logfname='%s/annotate-%s.log'%(outdir, bstr), dryrun=dryrun)
# sys.exit()

npg = 50; npgvar = 98
# npg = 300; npgvar = 598
p_strs = ['subc-%s'%sc for sc in ['None', 10, 5, 3]]
b_strs = ['kmean-False'] #, 'kmean-True']
xtrastr = '-kmean-False'
# npg = 5; npgvar = 2
npg = 50
p_strs = ['subc-%s'%sc for sc in [999, 3]] #['None', 10, 5, 3]]
b_strs = ['bcrp-False'] #, 'bcrp-True']
xtrastr = '' #'-ceil' #'-' + str(b_strs[0])
# pt=mute-freqs/overall; subd=parameter-plots/true
subd = 'plots/hmm'
for pt in ['gene-call', 'boundaries', 'mutation']:
cmd = './bin/compare-plotdirs.py --outdir %s/cf-plots/npg-%d-width-%d%s/%s --normalize --performance-plots --extra-stats absmean --translegend=-0.6:-0.1 ' % (bdir.replace('/fh/local', '/fh/fast/matsen_e'), npg, npgvar, xtrastr, pt)
names = ['%s%s%s'%(p, '' if b=='' else '_', b) for p in p_strs for b in b_strs]
plotdirs = ['%s/npg-%d-width-%d/%s%s%s/%s/%s' % (bdir, npg, npgvar, p, '' if b=='' else '_', b, subd, pt) for p in p_strs for b in b_strs]
npgstr = 'npg-%d%s' % (npg, xtrastr)
cmd = './bin/compare-plotdirs.py --outdir %s/cf-plots/%s/%s --normalize --performance-plots --extra-stats absmean --translegend=-0.6:-0.1 ' % (bdir.replace('/fh/local', '/fh/fast/matsen_e'), npgstr, pt)
names = ['%s%s%s'%(p, '' if b=='' else '-', b) for p in p_strs for b in b_strs]
# plotdirs = ['%s/%s/%s%s%s/%s/%s' % (bdir, npgstr, b, p, '' if b=='' else '-', b, subd, pt) for p in p_strs for b in b_strs]
plotdirs = ['%s/%s-%s/%s/%s/%s' % (bdir, npgstr, b, p, subd, pt) for p in p_strs for b in b_strs]
cmd += ' --names %s --plotdirs %s' % (':'.join(names), ':'.join(plotdirs))
utils.simplerun(cmd)

Expand Down
8 changes: 5 additions & 3 deletions python/partitiondriver.py
Original file line number Diff line number Diff line change
Expand Up @@ -1095,7 +1095,8 @@ def get_cmd_str(iproc): # all this does at this point is replace workdir with s

# ----------------------------------------------------------------------------------------
def subcl_split(self, n_seqs): # return true if we want to split cluster of size <n_seqs> into subclusters for purposes of annotation accuracy
return n_seqs >= 2 * self.args.subcluster_annotation_size # don't subclusterify unless there's really enough for two clusters
return n_seqs > self.args.subcluster_annotation_size
# return n_seqs >= 2 * self.args.subcluster_annotation_size # used to do it this way, and there's pluses and minuses to both, but it turns out it's better to split smaller clusters

# ----------------------------------------------------------------------------------------
def run_subcluster_annotate(self, init_partition, parameter_in_dir, n_procs, count_parameters=False, parameter_out_dir='', dont_print_annotations=False, debug=True): # NOTE nothing to do with subcluster naive seqs above
Expand All @@ -1112,12 +1113,13 @@ def getsubclusters(superclust, shuffle=False, sdbg=False): # NOTE similar to co
superclust = copy.deepcopy(superclust) # should only need this if we shuffle, but you *really* don't want to modify it since if you change the clusters in <init_partition> you'll screw up the actual partition, and if you change the ones in <clusters_still_to_do> you'll end up losing them cause the uidstrs won't be right
if shuffle:
random.shuffle(superclust)
n_clusters = len(superclust) / self.args.subcluster_annotation_size # integer division truncates the decimal
# n_clusters = len(superclust) / self.args.subcluster_annotation_size # old way: truncates the decimal (see note in subcl_split())
n_clusters = int(math.ceil(len(superclust) / float(self.args.subcluster_annotation_size))) # taking the ceiling keeps the max un-split cluster size equal to <self.args.subcluster_annotation_size> (rather than one less than twice that)
if n_clusters < 2: # initially we don't call this function unless superclust is big enough for two clusters of size self.args.subcluster_annotation_size, but on subsequent rounds the clusters fom naive_ancestor_hashes can be smaller than that
return [superclust]

if False: # self.args.kmeans_subclusters: # this gives you clusters that are "tighter" -- i.e. clusters similar sequences together
import mds # this works fine, but it's not really different (sometimes a bit better, sometimes a bit worse) than the simple way. Which is weird, I would think it would help? but otoh it gives you non-equal-sized clusters, which sometimes i think is worse, although sometimes i also think is better
import mds # this works fine, but it's not really different (on balance with kmeans is probably a bit worse) than the simple way. Which is weird, I would think it would help? but otoh it gives you non-equal-sized clusters, which sometimes i think is worse, although sometimes i also think is better
seqfos = [{'name' : u, 'seq' : self.sw_info[u]['seqs'][0]} for u in superclust]
return_clusts = mds.run_sklearn_mds(None, n_clusters, seqfos, self.args.seed, aligned=True)
else:
Expand Down

0 comments on commit 8c7c9e0

Please sign in to comment.