Skip to content

Commit

Permalink
Merge pull request #13 from pombo-lab/v1.1.0-dev
Browse files Browse the repository at this point in the history
V1.1.0 changes
  • Loading branch information
rbeagrie committed May 6, 2017
2 parents 53d6e41 + 80ef0c5 commit f9e0748
Show file tree
Hide file tree
Showing 15 changed files with 290 additions and 27 deletions.
3 changes: 3 additions & 0 deletions lib/gamtools/__init__.py
Expand Up @@ -12,3 +12,6 @@
- gamtools.segregation: functions for dealing with segregation tables (GAM datasets)
"""

#from . import call_windows, compaction, cosegregation, enrichment, matrix, \
# permutation, plotting, radial_position, segregation, utils
46 changes: 40 additions & 6 deletions lib/gamtools/call_windows.py
Expand Up @@ -289,7 +289,7 @@ def filter_data(x, percentile, no_zeros=True):
"""

percentile_score = scoreatpercentile(x, percentile)
less_than_percentile = x < percentile_score
less_than_percentile = list(x < percentile_score)

if no_zeros:
not_a_zero = x > 0
Expand All @@ -302,17 +302,15 @@ def filter_data(x, percentile, no_zeros=True):

out_data = x[points_to_keep]

if len(out_data):
if out_data.size:

return out_data

elif no_zeros:
if no_zeros:

return x[not_a_zero]

else:

return x
return x


def threshold_n_binom(params, p_value, thresh_range=None):
Expand Down Expand Up @@ -580,6 +578,42 @@ def plot_signal_and_noise_fitting(sample_name, fitting_results):
prettify_plot(sample_name, fig)


def fixed_threshold_fitting_func(read_threshold):
"""
Factory function for creating fitting functions that return a pre-specified
threshold for every sample.
:param int read_threshold: Read threshold to use for every sample.
:returns: Fitting function.
"""

read_threshold = int(read_threshold)

def fixed_threshold_fitting(sample_coverage_data):
"""
Return a pre-specified read coverage threshold.
:param sample_coverage_data: Array of read counts per \
genomic window.
:type sample_coverage_data: :class:`~numpy.ndarray`
:returns: Dictionary of fitting results.
"""

counts, breaks = np.histogram(
np.log10(
filter_data(
sample_coverage_data, 99.99)), bins=50)

params = None

return {'read_threshold': read_threshold,
'counts': counts,
'breaks': breaks,
'params': params}

return fixed_threshold_fitting


def signal_and_noise_fitting(sample_coverage_data):
"""
Fit a composite negative binomial and lognormal
Expand Down
4 changes: 2 additions & 2 deletions lib/gamtools/cosegregation.py
Expand Up @@ -64,8 +64,8 @@ def regions_are_valid(regions):

if any(invalid_regions):
return False
else:
return True

return True


def prepare_regions(regions):
Expand Down
9 changes: 6 additions & 3 deletions lib/gamtools/main.py
Expand Up @@ -54,14 +54,17 @@
help='Output segregation file to create '
'(or "-" to write to stdout)')

# If --macs is specified, we don't need to plot the fits
# If a fixed threshold is specified, we don't need to plot the fits
seg_method = call_windows_parser.add_mutually_exclusive_group()
seg_method.add_argument(
'-f', '--fitting-folder', metavar='FITTING_FOLDER',
help='If specified, save the individual curve fittings to this folder')
seg_method.add_argument(
'-m', '--macs', action='store_true',
help='If specified, use MACS to call positive windows')
'-x', '--fixed-threshold', metavar='NUM_READS',
dest='fitting_function',
type=call_windows.fixed_threshold_fitting_func,
help='Instead of fitting each sample individually, use a fixed threshold '
'to determine which windows are positive.')

call_windows_parser.set_defaults(
func=call_windows.threshold_from_args,
Expand Down
66 changes: 59 additions & 7 deletions lib/gamtools/matrix.py
Expand Up @@ -83,7 +83,7 @@
import numpy as np
import pandas as pd

from .segregation import parse_location_string
from . import segregation
from .utils import DelayedImportError

try:
Expand Down Expand Up @@ -112,7 +112,7 @@ def windows_from_name_strings(name_strings):
:returns: List of window tuples
"""

return [parse_location_string(name) for name in name_strings]
return [segregation.parse_location_string(name) for name in name_strings]


def read_npz(filepath):
Expand Down Expand Up @@ -422,7 +422,7 @@ def detect_file_type(file_name):
raise TypeError('Extension "{}" not recognized'.format(file_ext))


def read_file(file_name):
def read_file(file_name, file_type=None):
"""Open a matrix file, guessing the format based on file extension.
:param str file_name: Path to matrix file.
Expand All @@ -431,7 +431,8 @@ def read_file(file_name):
:class:`numpy array <numpy.ndarray>` proximity matrix.
"""

file_type = detect_file_type(file_name)
if file_type is None:
file_type = detect_file_type(file_name)
read_func = INPUT_FORMATS[file_type]
return read_func(file_name)

Expand All @@ -456,6 +457,56 @@ def check_windows(proximity_matrix, windows):
' x '.join([str(s) for s in windows_sizes])))


def region_from_locations(matrix_tuple, location_string1, location_string2=None):
"""Retrieve a sub-region of an interaction matrix based on a location string.
:param matrix_tuple: Tuple of the form (x_windows_list, y_windows_list), matrix
:param str location_string1: UCSC format :ref:`location string <location_string>`
:param str location_string2: UCSC format :ref:`location string <location_string>`, \
if None, location_string2 is set equal to location_string1 (i.e. the function \
returns a symmetrical sub-matrix)
:returns: Tuple of the form (x_windows_list, y_windows_list), matrix
"""

(windows1, windows2), chr_mat = matrix_tuple

windows1_df = pd.DataFrame(
windows1, columns=['chrom', 'start', 'stop']).set_index(['chrom', 'start', 'stop'])
windows2_df = pd.DataFrame(
windows2, columns=['chrom', 'start', 'stop']).set_index(['chrom', 'start', 'stop'])

if location_string2 is None:
location_string2 = location_string1

i1_start, i1_stop = segregation.index_from_location_string(windows1_df, location_string1)
i2_start, i2_stop = segregation.index_from_location_string(windows2_df, location_string2)

subregion_w1 = windows1_df.iloc[i1_start:i1_stop]
subregion_w2 = windows2_df.iloc[i2_start:i2_stop]

submatrix = chr_mat[i1_start:i1_stop, i2_start:i2_stop]

return (subregion_w1, subregion_w2), submatrix


def open_region_from_locations(matrix_path,
location_string1, location_string2=None,
file_type=None):
"""Open an interaction matrix and retrieve a sub-region based on a location string.
:param matrix_path: Path to an interaction matrix
:param str location_string1: UCSC format :ref:`location string <location_string>`
:param str location_string2: UCSC format :ref:`location string <location_string>`, \
if None, location_string2 is set equal to location_string1 (i.e. the function \
returns a symmetrical sub-matrix)
:returns: Tuple of the form (x_windows_list, y_windows_list), matrix
"""

matrix_tuple = read_file(matrix_path, file_type)

return region_from_locations(matrix_tuple, location_string1, location_string2)


def read_thresholds(thresholds_file):
"""Read a file containing interaction thresholds"""

Expand Down Expand Up @@ -492,10 +543,11 @@ def kth_diag_indices(array, diag_k):
rows, cols = np.diag_indices_from(array)
if diag_k < 0:
return rows[:diag_k], cols[-diag_k:]
elif diag_k > 0:

if diag_k > 0:
return rows[diag_k:], cols[:-diag_k]
else:
return rows, cols

return rows, cols


def apply_threshold(proximity_matrix, thresholds):
Expand Down
2 changes: 1 addition & 1 deletion lib/gamtools/pipeline.py
Expand Up @@ -584,7 +584,7 @@ def process_nps_from_args(args):

check_resolution_consistency(args)

if len(args.matrix_sizes):
if args.matrix_sizes:
args.to_run.append('Calculating linkage matrix')

task_dict = {
Expand Down
2 changes: 1 addition & 1 deletion lib/gamtools/plotting.py
Expand Up @@ -35,7 +35,7 @@
genomic_signal = failed_import
failed_packages.append('metaseq')

if len(failed_packages) > 0:
if failed_packages:

if len(failed_packages) > 1:
package_list = ', '.join(failed_packages[:-1])
Expand Down
2 changes: 1 addition & 1 deletion lib/gamtools/qc/pass_qc.py
Expand Up @@ -140,7 +140,7 @@ def parse_conditions_file(conditions_file, stats_df):

fields = line.split()

if (line[0] == '#') or len(fields) == 0:
if (line[0] == '#') or (not fields):
continue

left_str, operator, right_str = fields
Expand Down
2 changes: 1 addition & 1 deletion lib/gamtools/segregation.py
Expand Up @@ -108,7 +108,7 @@ def index_from_interval(segregation_table, interval):

covered_windows = np.nonzero(window_in_region)[0]

if not len(covered_windows):
if not covered_windows.size:
if not chrom in segregation_table.index.levels[0]:
raise InvalidChromError(
'{0} not found in the list of windows'.format(chrom))
Expand Down
42 changes: 42 additions & 0 deletions lib/gamtools/tests/test_call_windows.py
@@ -0,0 +1,42 @@
import io
import pytest
from numpy.testing import assert_array_equal
import numpy as np
from gamtools import call_windows, segregation

fixture_two_samples = io.StringIO(
u"""chrom start stop Sample_A Sample_B
chr1 0 50000 4 0
chr1 50000 100000 0 5
chr1 100000 150000 3 0
chr1 150000 200000 0 0
chr1 200000 250000 0 6
""")

data_two_samples = segregation.open_segregation(fixture_two_samples)

def test_fixed_threshold_4():

threshold_function = call_windows.fixed_threshold_fitting_func(4)

fitting_result = threshold_function(data_two_samples.Sample_A)

assert fitting_result['read_threshold'] == 4
assert 'counts' in fitting_result
assert 'breaks' in fitting_result
assert fitting_result['params'] is None


def test_fixed_coverage_thresholding():

threshold_function = call_windows.fixed_threshold_fitting_func(4)

segregation_table, fitting_result = call_windows.do_coverage_thresholding(
data_two_samples, None, threshold_function)

assert_array_equal(segregation_table.Sample_A,
np.array([0,0,0,0,0]))

assert_array_equal(segregation_table.Sample_B,
np.array([0,1,0,0,1]))

7 changes: 7 additions & 0 deletions lib/gamtools/tests/test_gamtools.py
@@ -0,0 +1,7 @@
import gamtools

def test_members():

for module in ('call_windows', 'compaction', 'cosegregation', 'enrichment', 'matrix',
'permutation', 'plotting', 'radial_position', 'segregation' ,'utils'):
print(dir(getattr(gamtools, module)))

0 comments on commit f9e0748

Please sign in to comment.