Merge pull request #150 from robinandeer/nice-enduser-fixes

Nice enduser fixes
Clinical-Genomics · Sep 8, 2015 · db33d65 · db33d65
2 parents d478cbb + 04c7632
commit db33d65
Show file tree

Hide file tree

Showing 22 changed files with 404 additions and 332 deletions.
diff --git a/README.md b/README.md
@@ -10,8 +10,13 @@
 
 Chanjo is coverage analysis for clinical sequencing. It's implemented in Python with a command line interface that adheres to [UNIX pipeline philisophy][unix].
 
+## What's new in Chanjo 3.0?
+Hey - exciting things are coming to the new version of Chanjo :smile:
+
+The primary change is [Sambamba][sambamba] integration. Just run `sambamba depth region` and load the output into Chanjo for further data exploration. Chanjo is now more flexible, accurate, and much easier to install. We have also built in some basic commands to quickly extract statistics from the database right from the command line.
+
 ## Installation
-Chanjo is distruibuted through "pip". Install the latest release by running:
+Chanjo is distributed through "pip". Install the latest stable release by running:
 
 ```bash
 $ pip install chanjo
@@ -33,8 +38,7 @@ Chanjo exposes a composable command line interface with a nifty config file impl
 $ chanjo init --setup
 $ chanjo load /path/to/sambamba.output.bed
 $ chanjo calculate mean sample1
-#sampleId	mean-coverage
-sample10	176.513223249
+{"metrics": {"completeness_10": 90.92, "mean_coverage": 193.85}, "sample_id": "sample1"}
 ```
 
 ## Documentation
@@ -56,6 +60,7 @@ Chanjo is not the right choice if you care about coverage for every base across
 - Robin Andeer
 - Luca Beltrame ([lbeltrame](https://github.com/lbeltrame))
 - John Kern ([kern3020](https://github.com/kern3020))
+- Måns Magnusson ([moonso](https://github.com/moonso))
 
 ## License
 MIT. See the [LICENSE](LICENSE) file for more details.
@@ -69,6 +74,7 @@ Anyone can help make this project better - read [CONTRIBUTION](CONTRIBUTION.md)
 [bedtools]: http://bedtools.readthedocs.org/en/latest/
 [thesis]: https://s3.amazonaws.com/tudo/chanjo/RobinAndeerMastersThesisFinal_2013.pdf
 [report]: https://github.com/robinandeer/chanjo-report
+[sambamba]: http://lomereiter.github.io/sambamba/
 
 [coveralls-url]: https://coveralls.io/r/robinandeer/chanjo
 [coveralls-image]: https://img.shields.io/coveralls/robinandeer/chanjo.svg?style=flat

diff --git a/chanjo/__init__.py b/chanjo/__init__.py
@@ -32,4 +32,4 @@
 __email__ = 'robin.andeer@gmail.com'
 
 __license__ = 'MIT'
-__copyright__ = 'Copyright 2014 Robin Andeer'
+__copyright__ = 'Copyright 2015 Robin Andeer'
diff --git a/chanjo/annotate/sambamba/run.py b/chanjo/annotate/sambamba/run.py
@@ -7,10 +7,7 @@
 
 
 def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()):
-    """
-    Run sambamba from chanjo.
-    
-    """
+    """Run sambamba from chanjo."""
     logger = logging.getLogger(__name__)
     logger = logging.getLogger("chanjo.sambamba")
     log_stream = get_log_stream(logger)
@@ -22,15 +19,14 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()):
         region_file,
         bam_file
     ]
-    
+
     if outfile:
         sambamba_call += ["-o", outfile]
-    
+
     for coverage_treshold in cov_treshold:
         sambamba_call += ['-T', str(coverage_treshold)]
-
-    logger.info("Running sambamba with call: {0}".format(' '.join(sambamba_call)))
-
+
+    logger.info("Running sambamba with call: %s", ' '.join(sambamba_call))
     try:
         subprocess.check_call(
             sambamba_call,
@@ -40,10 +36,9 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()):
         logger.critical("sambamba seems to not exist on your system.")
         raise e
     except CalledProcessError as e:
-        logger.critical("Something went wrong when running sambamba. "\
-        "Please see sambamba error output.")
+        logger.critical("Something went wrong when running sambamba. "
+                        "Please see sambamba error output.")
         raise e
-    
+
     logger.debug("sambamba run successfull")
-
     return
diff --git a/chanjo/cli/calculate.py b/chanjo/cli/calculate.py
@@ -4,6 +4,7 @@
 import click
 
 from chanjo.store import ChanjoAPI
+from chanjo.store.utils import filter_samples
 from .utils import dump_json
 
 logger = logging.getLogger(__name__)
@@ -21,7 +22,10 @@ def calculate(context):
 @click.pass_context
 def mean(context, samples):
     """Report mean coverage for a list of samples."""
-    results = context.parent.api.mean(*samples)
+    api = context.parent.api
+    query = filter_samples(api.query(), sample_ids=samples)
+    results = ({'sample_id': sample_id, 'metrics': data}
+               for sample_id, data in api.means(query))
     dump_json(*results)
 
 
@@ -49,7 +53,9 @@ def region(context, sample, per, chromosome, start, end):
         logger.debug('region id detected, parse string')
         results = api.region_alt(chromosome, sample_id=sample, per=per)
     else:
-        results = api.region(chromosome, start, end, sample_id=sample, per=per)
+        query = api.region(chromosome, start, end, sample_id=sample, per=per)
+        results = ({'exon_id': exon_id, 'metrics': data}
+                   for exon_id, data in query)
     if per == 'exon':
         dump_json(*results)
     else:

diff --git a/chanjo/cli/init.py b/chanjo/cli/init.py
@@ -18,8 +18,9 @@ def init(context, setup, reset, automate):
     click.echo(chanjo.__banner__)
 
     if not automate:
-        questions = [('annotate.cutoff', 'sufficient coverage',
-                      context.obj.get('annotate', {}).get('cutoff', 10)),
+        questions = [('sambamba.cov_treshold', 'sufficient coverage',
+                      context.obj.get('sambamba', {}).get('cov_treshold',
+                                                          [10, 20])),
                      ('database', 'central database path/URI',
                       context.obj['database'])]
         # launch init pipeline
@@ -47,5 +48,5 @@ def init_pipeline(program, config, questions):
     for dot_key, value in user_defaults.items():
         config.set(dot_key, value, scope=config.user_data)
 
-    # Write to the config file
+    # write to the config file
     config.save(default_flow_style=False)
diff --git a/chanjo/cli/load.py b/chanjo/cli/load.py
@@ -4,34 +4,36 @@
 import click
 from sqlalchemy.exc import IntegrityError
 
-from chanjo.load import sambamba
-from chanjo.parse import bed
+from chanjo.load.sambamba import rows as sambamba_rows
+from chanjo.parse import sambamba
 from chanjo.store import Store
 from chanjo.utils import validate_stdin
 
 logger = logging.getLogger(__name__)
 
 
 @click.command()
+@click.option('-s', '--sample', help='override sample id from file')
 @click.option('-g', '--group', help='id to group related samples')
 @click.argument('bed_stream', callback=validate_stdin,
                 type=click.File(encoding='utf-8'), default='-', required=False)
 @click.pass_context
-def load(context, group, bed_stream):
+def load(context, sample, group, bed_stream):
     """Load Sambamba output into the database for a sample."""
     chanjo_db = Store(uri=context.obj['database'])
     try:
-        load_sambamba(chanjo_db, bed_stream, group_id=group)
+        load_sambamba(chanjo_db, bed_stream, sample_id=sample, group_id=group)
     except IntegrityError:
         logger.error('sample already loaded, rolling back')
         chanjo_db.session.rollback()
         context.abort()
 
 
-def load_sambamba(chanjo_db, bed_iterable, group_id=None):
+def load_sambamba(chanjo_db, bed_iterable, sample_id=None, group_id=None):
     """Load Sambamba BED output from a stream."""
-    rows = bed.chanjo(bed_iterable)
-    stats = sambamba.rows(chanjo_db.session, rows, group_id=group_id)
+    rows = sambamba.depth_output(bed_iterable)
+    stats = sambamba_rows(chanjo_db.session, rows, sample_id=sample_id,
+                          group_id=group_id)
     for index, stat in enumerate(stats):
         chanjo_db.add(stat)
         if index % 10000 == 0:

diff --git a/chanjo/cli/root.py b/chanjo/cli/root.py
@@ -9,15 +9,14 @@
 """
 import click
 
-import chanjo
-
 from chanjo.compat import text_type
 from chanjo.config import Config, CONFIG_FILE_NAME, markup
 from chanjo.log import init_log, LEVELS
 from chanjo.utils import EntryPointsCLI
 
 from chanjo import __version__, logger
 
+
 def print_version(ctx, param, value):
     """Callback function for printing version and exiting
     Args:
@@ -33,21 +32,21 @@ def print_version(ctx, param, value):
     ctx.exit()
 
 @click.group(cls=EntryPointsCLI)
-@click.option('-c', '--config', 
-                default=CONFIG_FILE_NAME, 
-                type=click.Path(), 
+@click.option('-c', '--config',
+                default=CONFIG_FILE_NAME,
+                type=click.Path(),
                 help='path to config file'
 )
-@click.option('-d', '--database', 
+@click.option('-d', '--database',
                 type=text_type,
                 help='path/URI of the SQL database'
 )
-@click.option('-v', '--verbose', 
+@click.option('-v', '--verbose',
                 count=True,
                 default=0,
                 help="Increase output verbosity. Can be used multiple times, eg. -vv"
 )
-@click.option('--log_file', 
+@click.option('--log_file',
                 type=click.Path()
 )
 @click.option('--version',
@@ -60,10 +59,10 @@ def print_version(ctx, param, value):
 def root(context, config, database, verbose, log_file):
     """Clinical sequencing coverage analysis tool."""
     # setup logging
-    
-    loglevel = LEVELS.get(min(verbose,2), "WARNING")
+
+    loglevel = LEVELS.get(min(verbose, 2), "WARNING")
     init_log(logger, loglevel=loglevel, filename=log_file)
-    logger.info("version {0}".format( __version__))
+    logger.info("version {0}".format(__version__))
 
     # avoid setting global defaults in Click options, do it below when
     # updating the config object

diff --git a/chanjo/cli/sambamba.py b/chanjo/cli/sambamba.py
@@ -5,6 +5,7 @@
 
 from chanjo.annotate.sambamba import run_sambamba
 
+
 @click.command()
 @click.argument('bam_file',
                     type=click.Path(exists=True),
@@ -25,32 +26,31 @@
                      "the percentage of bases in the region"\
                      "where coverage is more than this value"
 )
-@click.option('-o', '--outfile', 
+@click.option('-o', '--outfile',
                     type=click.Path(exists=False),
                     help='Specify the path to a file where results should be stored.'
 )
 @click.pass_context
 def sambamba(context, bam_file, exon_bed, gene_bed, cov_treshold, outfile):
     """Run Sambamba from chanjo."""
     logger = logging.getLogger(__name__)
-    #For testing only:
+    # For testing only:
     logger = logging.getLogger("chanjo.cli.sambamba")
     logger.info("Running chanjo sambamba")
-    
+
     if not (exon_bed or gene_bed):
         logger.warning("Please provide a region file in BED format")
         sys.exit()
     if exon_bed and gene_bed:
         logger.warning("Only one region file at a time")
         sys.exit()
-    
+
     region_file = exon_bed
     if gene_bed:
         region_file = gene_bed
-    
+
     try:
         run_sambamba(bam_file, region_file, outfile, cov_treshold)
     except Exception as e:
         logger.debug(e)
         click.Abort()
-
diff --git a/chanjo/config/questions.py b/chanjo/config/questions.py
@@ -96,6 +96,9 @@ def ask(prompt, default=None, color='cyan'):
     # write default option in parentheses, use it as response if nothing
     # was submitted by user.
     response = input(build_prompt(prompt, default_string)) or default
+    if isinstance(default, list) and isinstance(response, str):
+        sep = ',' if ',' in response else None
+        response = [int(item) for item in response.split(sep)]
 
     # print the updated confirmation line by replacing the previous
     echo(MOVE_CURSOR_UP + ERASE_LINE

diff --git a/chanjo/load/sambamba.py b/chanjo/load/sambamba.py
@@ -7,24 +7,28 @@
 from .utils import get_or_build_exon
 
 
-def rows(session, row_data, group_id=None):
+def rows(session, row_data, sample_id=None, group_id=None):
     """Handle rows of sambamba output.
 
     N.B. only handles single sample annotations.
 
     Args:
         session (Session): database session object
         row_data (dict): parsed sambamba output rows
+        sample_id (Optional[str]): id to reference sample
         group_id (Optional[str]): id to group samples together
 
     Yields:
         ExonStatistic: stats model linked to exon and sample
     """
-    # use first row to get information on sample
-    first_row = next(iter(row_data))
-    sample_obj = sample(first_row, group_id=group_id)
-    # place the first row back in the stream
-    all_data = cons(first_row, row_data)
+    if sample_id is None:
+        # use first row to get information on sample
+        first_row = next(iter(row_data))
+        sample_id = first_row['sampleName']
+        # place the first row back in the stream
+        all_data = cons(first_row, row_data)
+
+    sample_obj = Sample(sample_id=sample_id, group_id=group_id)
     nested_stats = (row(session, data, sample_obj) for data in all_data)
     # flatten 2D nested list
     return (stat for stats in nested_stats for stat in stats)
@@ -46,20 +50,6 @@ def row(session, data, sample_obj):
     return stats
 
 
-def sample(data, group_id=None):
-    """Create sample model.
-
-    Args:
-        data (dict): parsed sambamba output row
-        group_id (Optional[str]): id to group samples together
-
-    Returns:
-        Sample: sample database model
-    """
-    sample_obj = Sample(sample_id=data['sampleName'], group=group_id)
-    return sample_obj
-
-
 def statistics(data, sample_obj, exon_obj):
     """Create models from a sambamba output row.
 

diff --git a/chanjo/load/utils.py b/chanjo/load/utils.py
@@ -11,7 +11,8 @@ def _exon_kwargs(data):
     Returns:
         dict: kwargs prepared for Exon model
     """
-    return {'exon_id': data['name'], 'chromosome': data['chrom'],
+    exon_id = data.get('name') or data['extraFields'][0]
+    return {'exon_id': exon_id, 'chromosome': data['chrom'],
             'start': data['chromStart'], 'end': data['chromEnd']}