Skip to content

Commit

Permalink
Merge pull request #150 from robinandeer/nice-enduser-fixes
Browse files Browse the repository at this point in the history
Nice enduser fixes
  • Loading branch information
robinandeer committed Sep 8, 2015
2 parents d478cbb + 04c7632 commit db33d65
Show file tree
Hide file tree
Showing 22 changed files with 404 additions and 332 deletions.
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@

Chanjo is coverage analysis for clinical sequencing. It's implemented in Python with a command line interface that adheres to [UNIX pipeline philosophy][unix].

## What's new in Chanjo 3.0?
Hey - exciting things are coming to the new version of Chanjo :smile:

The primary change is [Sambamba][sambamba] integration. Just run `sambamba depth region` and load the output into Chanjo for further data exploration. Chanjo is now more flexible, accurate, and much easier to install. We have also built in some basic commands to quickly extract statistics from the database right from the command line.

## Installation
Chanjo is distributed through "pip". Install the latest release by running:
Chanjo is distributed through "pip". Install the latest stable release by running:

```bash
$ pip install chanjo
Expand All @@ -33,8 +38,7 @@ Chanjo exposes a composable command line interface with a nifty config file impl
$ chanjo init --setup
$ chanjo load /path/to/sambamba.output.bed
$ chanjo calculate mean sample1
#sampleId mean-coverage
sample10 176.513223249
{"metrics": {"completeness_10": 90.92, "mean_coverage": 193.85}, "sample_id": "sample1"}
```

## Documentation
Expand All @@ -56,6 +60,7 @@ Chanjo is not the right choice if you care about coverage for every base across
- Robin Andeer
- Luca Beltrame ([lbeltrame](https://github.com/lbeltrame))
- John Kern ([kern3020](https://github.com/kern3020))
- Måns Magnusson ([moonso](https://github.com/moonso))

## License
MIT. See the [LICENSE](LICENSE) file for more details.
Expand All @@ -69,6 +74,7 @@ Anyone can help make this project better - read [CONTRIBUTION](CONTRIBUTION.md)
[bedtools]: http://bedtools.readthedocs.org/en/latest/
[thesis]: https://s3.amazonaws.com/tudo/chanjo/RobinAndeerMastersThesisFinal_2013.pdf
[report]: https://github.com/robinandeer/chanjo-report
[sambamba]: http://lomereiter.github.io/sambamba/

[coveralls-url]: https://coveralls.io/r/robinandeer/chanjo
[coveralls-image]: https://img.shields.io/coveralls/robinandeer/chanjo.svg?style=flat
Expand Down
2 changes: 1 addition & 1 deletion chanjo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@
__email__ = 'robin.andeer@gmail.com'

__license__ = 'MIT'
__copyright__ = 'Copyright 2014 Robin Andeer'
__copyright__ = 'Copyright 2015 Robin Andeer'
21 changes: 8 additions & 13 deletions chanjo/annotate/sambamba/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@


def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()):
"""
Run sambamba from chanjo.
"""
"""Run sambamba from chanjo."""
logger = logging.getLogger(__name__)
logger = logging.getLogger("chanjo.sambamba")
log_stream = get_log_stream(logger)
Expand All @@ -22,15 +19,14 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()):
region_file,
bam_file
]

if outfile:
sambamba_call += ["-o", outfile]

for coverage_treshold in cov_treshold:
sambamba_call += ['-T', str(coverage_treshold)]

logger.info("Running sambamba with call: {0}".format(' '.join(sambamba_call)))


logger.info("Running sambamba with call: %s", ' '.join(sambamba_call))
try:
subprocess.check_call(
sambamba_call,
Expand All @@ -40,10 +36,9 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()):
logger.critical("sambamba seems to not exist on your system.")
raise e
except CalledProcessError as e:
logger.critical("Something went wrong when running sambamba. "\
"Please see sambamba error output.")
logger.critical("Something went wrong when running sambamba. "
"Please see sambamba error output.")
raise e

logger.debug("sambamba run successfull")

return
10 changes: 8 additions & 2 deletions chanjo/cli/calculate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import click

from chanjo.store import ChanjoAPI
from chanjo.store.utils import filter_samples
from .utils import dump_json

logger = logging.getLogger(__name__)
Expand All @@ -21,7 +22,10 @@ def calculate(context):
@click.pass_context
def mean(context, samples):
"""Report mean coverage for a list of samples."""
results = context.parent.api.mean(*samples)
api = context.parent.api
query = filter_samples(api.query(), sample_ids=samples)
results = ({'sample_id': sample_id, 'metrics': data}
for sample_id, data in api.means(query))
dump_json(*results)


Expand Down Expand Up @@ -49,7 +53,9 @@ def region(context, sample, per, chromosome, start, end):
logger.debug('region id detected, parse string')
results = api.region_alt(chromosome, sample_id=sample, per=per)
else:
results = api.region(chromosome, start, end, sample_id=sample, per=per)
query = api.region(chromosome, start, end, sample_id=sample, per=per)
results = ({'exon_id': exon_id, 'metrics': data}
for exon_id, data in query)
if per == 'exon':
dump_json(*results)
else:
Expand Down
7 changes: 4 additions & 3 deletions chanjo/cli/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ def init(context, setup, reset, automate):
click.echo(chanjo.__banner__)

if not automate:
questions = [('annotate.cutoff', 'sufficient coverage',
context.obj.get('annotate', {}).get('cutoff', 10)),
questions = [('sambamba.cov_treshold', 'sufficient coverage',
context.obj.get('sambamba', {}).get('cov_treshold',
[10, 20])),
('database', 'central database path/URI',
context.obj['database'])]
# launch init pipeline
Expand Down Expand Up @@ -47,5 +48,5 @@ def init_pipeline(program, config, questions):
for dot_key, value in user_defaults.items():
config.set(dot_key, value, scope=config.user_data)

# Write to the config file
# write to the config file
config.save(default_flow_style=False)
16 changes: 9 additions & 7 deletions chanjo/cli/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,36 @@
import click
from sqlalchemy.exc import IntegrityError

from chanjo.load import sambamba
from chanjo.parse import bed
from chanjo.load.sambamba import rows as sambamba_rows
from chanjo.parse import sambamba
from chanjo.store import Store
from chanjo.utils import validate_stdin

logger = logging.getLogger(__name__)


@click.command()
@click.option('-s', '--sample', help='override sample id from file')
@click.option('-g', '--group', help='id to group related samples')
@click.argument('bed_stream', callback=validate_stdin,
type=click.File(encoding='utf-8'), default='-', required=False)
@click.pass_context
def load(context, group, bed_stream):
def load(context, sample, group, bed_stream):
"""Load Sambamba output into the database for a sample."""
chanjo_db = Store(uri=context.obj['database'])
try:
load_sambamba(chanjo_db, bed_stream, group_id=group)
load_sambamba(chanjo_db, bed_stream, sample_id=sample, group_id=group)
except IntegrityError:
logger.error('sample already loaded, rolling back')
chanjo_db.session.rollback()
context.abort()


def load_sambamba(chanjo_db, bed_iterable, group_id=None):
def load_sambamba(chanjo_db, bed_iterable, sample_id=None, group_id=None):
"""Load Sambamba BED output from a stream."""
rows = bed.chanjo(bed_iterable)
stats = sambamba.rows(chanjo_db.session, rows, group_id=group_id)
rows = sambamba.depth_output(bed_iterable)
stats = sambamba_rows(chanjo_db.session, rows, sample_id=sample_id,
group_id=group_id)
for index, stat in enumerate(stats):
chanjo_db.add(stat)
if index % 10000 == 0:
Expand Down
21 changes: 10 additions & 11 deletions chanjo/cli/root.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@
"""
import click

import chanjo

from chanjo.compat import text_type
from chanjo.config import Config, CONFIG_FILE_NAME, markup
from chanjo.log import init_log, LEVELS
from chanjo.utils import EntryPointsCLI

from chanjo import __version__, logger


def print_version(ctx, param, value):
"""Callback function for printing version and exiting
Args:
Expand All @@ -33,21 +32,21 @@ def print_version(ctx, param, value):
ctx.exit()

@click.group(cls=EntryPointsCLI)
@click.option('-c', '--config',
default=CONFIG_FILE_NAME,
type=click.Path(),
@click.option('-c', '--config',
default=CONFIG_FILE_NAME,
type=click.Path(),
help='path to config file'
)
@click.option('-d', '--database',
@click.option('-d', '--database',
type=text_type,
help='path/URI of the SQL database'
)
@click.option('-v', '--verbose',
@click.option('-v', '--verbose',
count=True,
default=0,
help="Increase output verbosity. Can be used multiple times, eg. -vv"
)
@click.option('--log_file',
@click.option('--log_file',
type=click.Path()
)
@click.option('--version',
Expand All @@ -60,10 +59,10 @@ def print_version(ctx, param, value):
def root(context, config, database, verbose, log_file):
"""Clinical sequencing coverage analysis tool."""
# setup logging
loglevel = LEVELS.get(min(verbose,2), "WARNING")

loglevel = LEVELS.get(min(verbose, 2), "WARNING")
init_log(logger, loglevel=loglevel, filename=log_file)
logger.info("version {0}".format( __version__))
logger.info("version {0}".format(__version__))

# avoid setting global defaults in Click options, do it below when
# updating the config object
Expand Down
12 changes: 6 additions & 6 deletions chanjo/cli/sambamba.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from chanjo.annotate.sambamba import run_sambamba


@click.command()
@click.argument('bam_file',
type=click.Path(exists=True),
Expand All @@ -25,32 +26,31 @@
"the percentage of bases in the region"\
"where coverage is more than this value"
)
@click.option('-o', '--outfile',
@click.option('-o', '--outfile',
type=click.Path(exists=False),
help='Specify the path to a file where results should be stored.'
)
@click.pass_context
def sambamba(context, bam_file, exon_bed, gene_bed, cov_treshold, outfile):
"""Run Sambamba from chanjo."""
logger = logging.getLogger(__name__)
#For testing only:
# For testing only:
logger = logging.getLogger("chanjo.cli.sambamba")
logger.info("Running chanjo sambamba")

if not (exon_bed or gene_bed):
logger.warning("Please provide a region file in BED format")
sys.exit()
if exon_bed and gene_bed:
logger.warning("Only one region file at a time")
sys.exit()

region_file = exon_bed
if gene_bed:
region_file = gene_bed

try:
run_sambamba(bam_file, region_file, outfile, cov_treshold)
except Exception as e:
logger.debug(e)
click.Abort()

3 changes: 3 additions & 0 deletions chanjo/config/questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ def ask(prompt, default=None, color='cyan'):
# write default option in parentheses, use it as response if nothing
# was submitted by user.
response = input(build_prompt(prompt, default_string)) or default
if isinstance(default, list) and isinstance(response, str):
sep = ',' if ',' in response else None
response = [int(item) for item in response.split(sep)]

# print the updated confirmation line by replacing the previous
echo(MOVE_CURSOR_UP + ERASE_LINE
Expand Down
30 changes: 10 additions & 20 deletions chanjo/load/sambamba.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,28 @@
from .utils import get_or_build_exon


def rows(session, row_data, group_id=None):
def rows(session, row_data, sample_id=None, group_id=None):
"""Handle rows of sambamba output.
N.B. only handles single sample annotations.
Args:
session (Session): database session object
row_data (dict): parsed sambamba output rows
sample_id (Optional[str]): id to reference sample
group_id (Optional[str]): id to group samples together
Yields:
ExonStatistic: stats model linked to exon and sample
"""
# use first row to get information on sample
first_row = next(iter(row_data))
sample_obj = sample(first_row, group_id=group_id)
# place the first row back in the stream
all_data = cons(first_row, row_data)
if sample_id is None:
# use first row to get information on sample
first_row = next(iter(row_data))
sample_id = first_row['sampleName']
# place the first row back in the stream
all_data = cons(first_row, row_data)

sample_obj = Sample(sample_id=sample_id, group_id=group_id)
nested_stats = (row(session, data, sample_obj) for data in all_data)
# flatten 2D nested list
return (stat for stats in nested_stats for stat in stats)
Expand All @@ -46,20 +50,6 @@ def row(session, data, sample_obj):
return stats


def sample(data, group_id=None):
"""Create sample model.
Args:
data (dict): parsed sambamba output row
group_id (Optional[str]): id to group samples together
Returns:
Sample: sample database model
"""
sample_obj = Sample(sample_id=data['sampleName'], group=group_id)
return sample_obj


def statistics(data, sample_obj, exon_obj):
"""Create models from a sambamba output row.
Expand Down
3 changes: 2 additions & 1 deletion chanjo/load/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ def _exon_kwargs(data):
Returns:
dict: kwargs prepared for Exon model
"""
return {'exon_id': data['name'], 'chromosome': data['chrom'],
exon_id = data.get('name') or data['extraFields'][0]
return {'exon_id': exon_id, 'chromosome': data['chrom'],
'start': data['chromStart'], 'end': data['chromEnd']}


Expand Down
Loading

0 comments on commit db33d65

Please sign in to comment.