Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 11 #20

Merged
merged 7 commits into from
Aug 11, 2017
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions q2_intervention/_intervention.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,19 @@
_between_subject_distance_distribution, _visualize,
_get_paired_differences, _stats_and_visuals,
_add_metric_to_metadata, _linear_effects,
_regplot_subplots_from_dataframe, _load_metadata)
_regplot_subplots_from_dataframe, _load_metadata,
_check_inputs)


def paired_differences(output_dir: str, metadata: qiime2.Metadata,
group_column: str, metric: str, state_column: str,
state_1: str, state_2: str, individual_id_column: str,
parametric: bool=False, palette: str='Set1',
drop_duplicates: bool=True, table: pd.DataFrame=None
drop_replicates: str='error', table: pd.DataFrame=None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this should be called drop_replicates - maybe replicate_handling?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

) -> None:

_check_inputs(state_1, state_2)

# find metric in metadata or derive from table and merge into metadata
metadata = _add_metric_to_metadata(table, metadata, metric)

Expand All @@ -35,24 +38,26 @@ def paired_differences(output_dir: str, metadata: qiime2.Metadata,
individual_id_column=individual_id_column,
group_column=group_column, state_column=state_column,
state_values=[state_1, state_2],
drop_duplicates=drop_duplicates)
drop_replicates=drop_replicates)
pairs[group] = _get_paired_differences(metadata, group_pairs, metric)

# Calculate test statistics and generate boxplots
_stats_and_visuals(
output_dir, pairs, metric, group_column, state_column, state_1,
state_2, individual_id_column, parametric, palette,
drop_duplicates, multiple_group_test=True, pairwise_tests=True,
drop_replicates, multiple_group_test=True, pairwise_tests=True,
paired_difference_tests=True, boxplot=True)


def pairwise_distance(output_dir: str, distance_matrix: DistanceMatrix,
metadata: qiime2.Metadata, group_column: str,
state_column: str, state_1: str, state_2: str,
individual_id_column: str, parametric: bool=False,
palette: str='Set1', drop_duplicates: bool=True,
palette: str='Set1', drop_replicates: str='error',
between_group_distance: bool=False) -> None:

_check_inputs(state_1, state_2)

metadata = _load_metadata(metadata)

# calculate pairwise distance distributions
Expand All @@ -64,7 +69,7 @@ def pairwise_distance(output_dir: str, distance_matrix: DistanceMatrix,
individual_id_column=individual_id_column,
group_column=group_column, state_column=state_column,
state_values=[state_1, state_2],
drop_duplicates=drop_duplicates)
drop_replicates=drop_replicates)
pairs[group] = _extract_distance_distribution(
distance_matrix, group_pairs)
if between_group_distance:
Expand All @@ -76,7 +81,7 @@ def pairwise_distance(output_dir: str, distance_matrix: DistanceMatrix,
_stats_and_visuals(
output_dir, pairs, 'distance', group_column,
state_column, state_1, state_2, individual_id_column,
parametric, palette, drop_duplicates, multiple_group_test=True,
parametric, palette, drop_replicates, multiple_group_test=True,
pairwise_tests=True, paired_difference_tests=False, boxplot=True)


Expand Down
28 changes: 20 additions & 8 deletions q2_intervention/_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,40 @@
TEMPLATES = pkg_resources.resource_filename('q2_intervention', 'assets')


def _check_inputs(state_1, state_2):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you rename this, maybe _validate_state_values, to be a little more self-documenting?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I intend to validate more inputs in a future PR, so I will use _validate_input_values instead of _validate_state_values.

if state_1 == state_2:
raise ValueError((
'You have chosen the same value for state_1 and state_2. These '
'parameters must be given different values.'))


def _get_group_pairs(df, group_value, individual_id_column='SubjectID',
group_column='Group', state_column='time_point',
state_values=['1', '2'], drop_duplicates=True):
state_values=['1', '2'], drop_replicates='error'):
results = []
group_members = df[group_column] == group_value
group_md = df[group_members]
for individual_id in set(group_md[individual_id_column]):
result = []
for state_value in state_values:
state_value = df[state_column].dtype.type(state_value)
individual_id = \
df[individual_id_column].dtype.type(individual_id)
individual_id = df[individual_id_column].dtype.type(individual_id)
_state = df[state_column] == state_value
_ind = df[individual_id_column] == individual_id
individual_at_state_idx = group_md[_state & _ind].index
if len(individual_at_state_idx) > 1:
print("Multiple values for {0} {1} at {2} {3} ({4})".format(
individual_id_column, individual_id, state_column,
state_value, ' '.join(map(str, individual_at_state_idx))))
if drop_duplicates:
if drop_replicates == 'error':
raise ValueError((
'Replicate values for individual {0} at state {1}. '
'Remove replicate values from input files or set '
'drop_replicates parameter to select how replicates '
'are handled.'))
elif drop_replicates == 'drop':
break
else:
elif drop_replicates == 'random':
individual_at_state_idx = [choice(individual_at_state_idx)]
elif len(individual_at_state_idx) == 0:
print("No values for {0} {1} at {2} {3}".format(
Expand Down Expand Up @@ -355,7 +367,7 @@ def _visualize(output_dir, multiple_group_test=False, pairwise_tests=False,
def _stats_and_visuals(output_dir, pairs, metric, group_column,
state_column, state_1, state_2,
individual_id_column, parametric, palette,
drop_duplicates,
drop_replicates,
multiple_group_test=True, pairwise_tests=True,
paired_difference_tests=True, boxplot=True):
# kruskal test or ANOVA between groups
Expand All @@ -381,10 +393,10 @@ def _stats_and_visuals(output_dir, pairs, metric, group_column,

summary = pd.Series(
[metric, group_column, state_column, state_1, state_2,
individual_id_column, parametric, drop_duplicates],
individual_id_column, parametric, drop_replicates],
index=['Metric', 'Group column', 'State column', 'State 1',
'State 2', 'Individual ID column', 'Parametric',
'Drop duplicates'],
'Drop replicates'],
name='Paired difference tests')

_visualize(output_dir, multiple_group_test, pairwise_tests,
Expand Down
18 changes: 11 additions & 7 deletions q2_intervention/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@
'state_1': Str,
'state_2': Str,
'parametric': Bool,
'drop_duplicates': Bool,
'drop_replicates': Str % Choices(
['error', 'random', 'drop', 'mean', 'median']),
}

base_parameter_descriptions = {
Expand All @@ -55,9 +56,9 @@
'across which samples are paired.'),
'individual_id_column': (
'Metadata column containing subject IDs to use for pairing '
'samples. WARNING: if duplicates exist for an individual ID at '
'samples. WARNING: if replicates exist for an individual ID at '
'either state_1 or state_2, that subject will be dropped and '
'reported in standard output by default. Set duplicates="ignore" '
'reported in standard output by default. Set replicates="ignore" '
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be "random" instead of "ignore"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

'to instead randomly select one member, and use --verbose to list '
'conflicts.'),
'palette': 'Color palette to use for generating boxplots.',
Expand All @@ -72,10 +73,13 @@
'parametric': ('Perform parametric (ANOVA and t-tests) or non-'
'parametric (Kruskal-Wallis, Wilcoxon, and Mann-'
'Whitney U tests) statistical tests.'),
'drop_duplicates': (
'If True, will discard all subject IDs with duplicate samples '
'at either state_1 or state_2. If False, will instead '
'choose one representative at random from among duplicates.')
'drop_replicates': (
'Choose how replicate samples are handled. If replicates are '
'detected, "error" causes method to fail; "drop" will discard all '
'subject IDs with replicate samples at either state_1 or state_2; '
'"random" chooses one representative at random from among '
'replicates; "mean" and "median" compute average values across '
'replicates.')
}


Expand Down
Binary file removed q2_intervention/test_data/ecam_shannon.qza
Binary file not shown.
13 changes: 7 additions & 6 deletions q2_intervention/tests/test_intervention.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ class UtilitiesTests(InterventionTestPluginBase):
def test_get_group_pairs(self):
res = _get_group_pairs(
md, 'a', individual_id_column='ind', group_column='Group',
state_column='Time', state_values=[1, 2])
state_column='Time', state_values=[1, 2], drop_replicates='drop')
self.assertEqual(res, [('0', '3'), ('1', '4'), ('2', '5')])
res = _get_group_pairs(
md_dup, 'a', individual_id_column='ind', group_column='Group',
state_column='Time', state_values=[1, 2])
state_column='Time', state_values=[1, 2], drop_replicates='drop')
self.assertEqual(res, [('0', '3')])
res = _get_group_pairs(
md_dup, 'a', individual_id_column='ind', group_column='Group',
state_column='Time', state_values=[1, 2], drop_duplicates=False)
state_column='Time', state_values=[1, 2], drop_replicates='random')
self.assertEqual(res[0], ('0', '3'))
self.assertIn(res[1], [('1', '4'), ('2', '4')])

Expand Down Expand Up @@ -146,22 +146,23 @@ def test_paired_differences(self):
output_dir=self.temp_dir.name, table=None,
metadata=self.md_ecam_fp, group_column='delivery',
state_column='month', state_1=0, state_2=3,
individual_id_column='studyid', metric='observed_otus')
individual_id_column='studyid', metric='observed_otus',
drop_replicates='drop')

def test_paired_differences_taxa(self):
paired_differences(
output_dir=self.temp_dir.name, table=self.table_ecam_fp,
metadata=self.md_ecam_fp, group_column='delivery',
state_column='month', state_1=0, state_2=3,
individual_id_column='studyid',
metric='e2c3ff4f647112723741aa72087f1bfa')
metric='e2c3ff4f647112723741aa72087f1bfa', drop_replicates='drop')

def test_pairwise_distance(self):
pairwise_distance(
output_dir=self.temp_dir.name, distance_matrix=self.md_ecam_dm,
metadata=self.md_ecam_fp, group_column='delivery',
state_column='month', state_1=0, state_2=3,
individual_id_column='studyid')
individual_id_column='studyid', drop_replicates='drop')

def test_linear_mixed_effects(self):
linear_mixed_effects(
Expand Down