From b990ca22274ac2b783de3845edc3adda2506ec3e Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 2 Jan 2023 10:25:19 +0100 Subject: [PATCH 01/25] feat: allow a list of factors in build_design_matrix --- pydeseq2/utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 98013d7b..863a0fc6 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -151,9 +151,10 @@ def build_design_matrix( DataFrame containing clinical information. Must be indexed by sample barcodes, and contain a "high_grade" column. - design : str + design : str TODO : or list Name of the column of clinical_df to be used as a design_matrix variable. (default: "high_grade"). + TODO : last is the "reference" ref : str The factor to use as a reference. Must be one of the values taken by the design. @@ -176,6 +177,8 @@ def build_design_matrix( design_matrix = pd.get_dummies(clinical_df[design]) + # TODO : check that each factor takes only two values + if design_matrix.shape[-1] == 1: raise ValueError( f"The design factor takes only one value: " @@ -200,8 +203,12 @@ def build_design_matrix( ) raise e design_matrix.insert(1, ref, ref_level) - if not expanded: # drop last factor - design_matrix.drop(columns=design_matrix.columns[-1], axis=1, inplace=True) + if not expanded: # drop duplicate factors + # TODO : remove commented line + # design_matrix.drop(columns=design_matrix.columns[-1], axis=1, inplace=True) + design_matrix.drop( + columns=[col for col in clinical_df.columns[::-2]], axis=1, inplace=True + ) # Drops every other column, starting from the last if intercept: design_matrix.insert(0, "intercept", 1.0) return design_matrix From 3d889bf58026238743f79bac95dbbaf8dd0ae0fc Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 2 Jan 2023 15:12:09 +0100 Subject: [PATCH 02/25] feat: improve multifactor design function and limit number of IRLS iterations --- pydeseq2/utils.py | 66 ++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 863a0fc6..becec9b0 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -144,6 +144,7 @@ def build_design_matrix( Only single factor, 2-level designs are currently supported. Unless specified, the reference factor is chosen alphabetically. + NOTE: For now, each factor should have exactly two levels. Parameters ---------- @@ -151,10 +152,9 @@ def build_design_matrix( DataFrame containing clinical information. Must be indexed by sample barcodes, and contain a "high_grade" column. - design : str TODO : or list - Name of the column of clinical_df to be used as a design_matrix variable. + design : str or list + Name of the columns of clinical_df to be used as design_matrix variables. (default: "high_grade"). - TODO : last is the "reference" ref : str The factor to use as a reference. Must be one of the values taken by the design. @@ -175,39 +175,36 @@ def build_design_matrix( Indexed by sample barcodes. """ - design_matrix = pd.get_dummies(clinical_df[design]) + if isinstance(design, str): # if there is a single factor, convert to singleton list + design = [design] - # TODO : check that each factor takes only two values + design_matrix = pd.get_dummies(clinical_df[design]) - if design_matrix.shape[-1] == 1: - raise ValueError( - f"The design factor takes only one value: " - f"{design_matrix.columns.values.tolist()}, but two are necessary." - ) - elif design_matrix.shape[-1] > 2: - raise ValueError( - f"The design factor takes more than two values: " - f"{design_matrix.columns.values.tolist()}, but should have exactly two." - ) - # Sort columns alphabetically - design_matrix = design_matrix.reindex( - sorted(design_matrix.columns, reverse=True), axis=1 - ) - if ref is not None: # Put reference level last + for factor in design: + # Check that each factor has exactly 2 levels + if len(np.unique(clinical_df[factor])) != 2: + print( + f"Factors should take exactly two values, but {factor} " + f"takes values {np.unique(clinical_df[factor])}." + ) + factor = design[-1] + if ref is None: + ref = "_".join([factor, np.sort(np.unique(clinical_df[factor]).astype(str))[0]]) + ref_level = design_matrix.pop(ref) + else: # Put reference level last + ref = "_".join([factor, ref]) try: - ref_level = design_matrix.pop(ref) + ref_level = design_matrix.pop(f"{factor}_{ref}") except KeyError as e: print( "The reference level must correspond to" " one of the design factor values." ) raise e - design_matrix.insert(1, ref, ref_level) + design_matrix.insert(design_matrix.shape[-1], ref, ref_level) if not expanded: # drop duplicate factors - # TODO : remove commented line - # design_matrix.drop(columns=design_matrix.columns[-1], axis=1, inplace=True) design_matrix.drop( - columns=[col for col in clinical_df.columns[::-2]], axis=1, inplace=True + columns=[col for col in design_matrix.columns[::-2]], axis=1, inplace=True ) # Drops every other column, starting from the last if intercept: design_matrix.insert(0, "intercept", 1.0) @@ -318,11 +315,11 @@ def irls_solver( min_beta=-30, max_beta=30, optimizer="L-BFGS-B", + maxiter=100, ): r"""Fit a NB GLM wit log-link to predict counts from the design matrix. See equations (1-2) in the DESeq2 paper. - Uses the L-BFGS-B, more stable than Fisher scoring. Parameters ---------- @@ -355,7 +352,11 @@ def irls_solver( Optimizing method to use in case IRLS starts diverging. Accepted values: 'BFGS' or 'L-BFGS-B'. NB: only 'L-BFGS-B' ensures that LFCS will - lay in the [min_beta, max_beta] range. (default: 'BFGS'). + lay in the [min_beta, max_beta] range. (default: 'L-BFGS-B'). + + maxiter : int + Maximum number of IRLS iterations to perform before switching to L-BFGS-B. + (default: 100) Returns ------- @@ -398,13 +399,16 @@ def irls_solver( mu = np.maximum(size_factors * np.exp(X @ beta), min_mu) converged = True + i = 0 while dev_ratio > beta_tol: W = mu / (1.0 + mu * disp) z = np.log(mu / size_factors) + (counts - mu) / mu H = (X.T * W) @ X + D beta_hat = solve(H, X.T @ (W * z), assume_a="pos") - if sum(np.abs(beta_hat) > max_beta) > 0: + i += 1 + + if sum(np.abs(beta_hat) > max_beta) > 0 or i >= maxiter: # If IRLS starts diverging, use L-BFGS-B def f(beta): # closure to minimize @@ -424,16 +428,14 @@ def df(beta): beta_init, jac=df, method=optimizer, - bounds=[(min_beta, max_beta), (min_beta, max_beta)] - if optimizer == "L-BFGS-B" - else None, + bounds=[(min_beta, max_beta)] * p if optimizer == "L-BFGS-B" else None, ) beta = res.x mu = np.maximum(size_factors * np.exp(X @ beta), min_mu) converged = res.success - if not res.success: + if not res.success and p <= 2: beta = grid_fit_beta( counts, size_factors, From ad43332c08aa43ad41306d38a98abcc9d847625e Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 2 Jan 2023 15:20:35 +0100 Subject: [PATCH 03/25] docs(DeseqDataSet): update to reflect that the design may take several factors --- pydeseq2/DeseqDataSet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index 17bf63fc..21b5a268 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -43,8 +43,8 @@ class DeseqDataSet: DataFrame containing clinical information. Must be indexed by sample barcodes. - design_factor : str - Name of the column of clinical to be used as a design variable. + design_factor : str or list[str] + Name of the columns of clinical to be used as design variables. (default: 'high_grade'). reference_level : str @@ -182,7 +182,7 @@ def __init__( "The count matrix and clinical data should have the same index." ) self.design_factor = design_factor - if self.clinical[self.design_factor].isna().any(): + if self.clinical[self.design_factor].isna().any().any(): raise ValueError("NaNs are not allowed in the design factor.") self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str) From 7f26c275bd07b4fd09f3a104824ad2f21acc4a8e Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 2 Jan 2023 15:23:54 +0100 Subject: [PATCH 04/25] docs(DeseqDataSet): update to reflect that the design may take several factors --- pydeseq2/DeseqDataSet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index 21b5a268..7184824b 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -44,7 +44,8 @@ class DeseqDataSet: Must be indexed by sample barcodes. design_factor : str or list[str] - Name of the columns of clinical to be used as design variables. + Name of the columns of clinical to be used as design variables. If a list, + the last factor will be considered the variable of interest by default. (default: 'high_grade'). reference_level : str From d6382b7b6375f1ca6d8574705437e0034558b8c8 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 2 Jan 2023 16:49:01 +0100 Subject: [PATCH 05/25] feat: support pairwise multifactor tests --- pydeseq2/DeseqStats.py | 196 +++++++++++++++++++++++++---------------- 1 file changed, 118 insertions(+), 78 deletions(-) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index 1281f9df..b3f0fc9f 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd import statsmodels.api as sm +from IPython.display import display from joblib import Parallel from joblib import delayed from joblib import parallel_backend @@ -28,8 +29,15 @@ class DeseqStats: dds : DeseqDataSet DeseqDataSet for which dispersion and LFCs were already estimated. + contrast : list[str] or None + A list of three strings, in the following format: + ['variable_of_interest', 'tested_level', 'reference_level']. + Names must correspond to the clinical data passed to the DeseqDataSet. + (default: None) + alpha : float - P-value and adjusted p-value significance threshold (usually 0.05). (default: 0.05). + P-value and adjusted p-value significance threshold (usually 0.05). + (default: 0.05). cooks_filter : bool Whether to filter p-values based on cooks outliers. (default: True). @@ -56,6 +64,9 @@ class DeseqStats: base_mean : pandas.Series Genewise means of normalized counts. + contrast_idx : int + Index of the LFC column corresponding to the variable being tested. + LFCs : pandas.DataFrame Estimated log-fold change between conditions and intercept, in natural log scale. @@ -95,6 +106,7 @@ class DeseqStats: def __init__( self, dds, + contrast=None, alpha=0.05, cooks_filter=True, independent_filter=True, @@ -108,6 +120,19 @@ def __init__( ), "Please provide a fitted DeseqDataSet by first running the `deseq2` method." self.dds = dds + + # Build contrast if None + if contrast is not None: + # TODO : tests on the constrast + self.contrast = contrast + else: + factor = self.dds.design_factor[-1] + levels = np.unique(self.dds.clinical[factor]).astype(str) + if "_".join([factor, levels[0]]) == self.dds.design_matrix.columns[-1]: + self.contrast = [factor, levels[0], levels[1]] + else: + self.contrast = [factor, levels[1], levels[0]] + self.alpha = alpha self.cooks_filter = cooks_filter self.independent_filter = independent_filter @@ -121,14 +146,9 @@ def __init__( self.joblib_verbosity = joblib_verbosity def summary(self): - """Run the statistical analysis and return the results in a DataFrame. - - The results are also stored in the `results_df` attribute. + """Run the statistical analysis. - Returns - ------- - pandas.DataFrame - Estimated log fold changes, p-values and adjusted p-values for each gene. + The results are stored in the `results_df` attribute. """ if not hasattr(self, "p_values"): @@ -151,15 +171,99 @@ def summary(self): # Store the results in a DataFrame, in log2 scale for LFCs. self.results_df = pd.DataFrame(index=self.dds.dispersions.index) self.results_df["baseMean"] = self.base_mean - self.results_df["log2FoldChange"] = self.LFCs[ - self.dds.design_matrix.columns[-1] + self.results_df["log2FoldChange"] = self.LFCs.iloc[ + :, self.contrast_idx ] / np.log(2) self.results_df["lfcSE"] = self.SE / np.log(2) self.results_df["stat"] = self.statistics self.results_df["pvalue"] = self.p_values self.results_df["padj"] = self.padj - return self.results_df + print( + f"Log2 fold change & Wald test p-value: " + f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}" + ) + display(self.results_df) + + def run_wald_test(self): + """Perform a Wald test. + + Get gene-wise p-values for gene over/under-expression.` + """ + + n = len(self.LFCs) + p = self.dds.design_matrix.shape[1] + + # Raise a warning if LFCs are shrunk. + if self.shrunk_LFCs: + print( + "Note: running Wald test on shrunk LFCs. " + "Some sequencing datasets show better performance with the testing " + "separated from the use of the LFC prior." + ) + + # Get the LFC column corresponding to the design variable. + if self.contrast is None: + self.contrast_idx = self.LFCs.columns.get_loc( + self.dds.design_matrix.columns[-1] + ) + else: + # If the design matrix of dds is unexpanded, only one of the two levels + # will have a corresponding column + alternative_level = "_".join([self.contrast[0], self.contrast[1]]) + try: + self.contrast_idx = self.LFCs.columns.get_loc(alternative_level) + except KeyError: + alternative_level = "_".join([self.contrast[0], self.contrast[2]]) + self.contrast_idx = self.LFCs.columns.get_loc(alternative_level) + self.LFCs.iloc[:, self.contrast_idx] *= -1 + pass + + # Compute means according to the LFC model. + mu = ( + np.exp(self.dds.design_matrix @ self.LFCs.T) + .multiply(self.dds.size_factors, 0) + .values + ) + + # Set regularization factors. + if self.prior_disp_var is not None: + D = np.diag(1 / self.prior_disp_var**2) + else: + D = np.diag(np.repeat(1e-6, p)) + + X = self.dds.design_matrix.values + disps = self.dds.dispersions.values + LFCs = self.LFCs.values + + print("Running Wald tests...") + start = time.time() + with parallel_backend("loky", inner_max_num_threads=1): + res = Parallel( + n_jobs=self.n_processes, + verbose=self.joblib_verbosity, + batch_size=self.batch_size, + )( + delayed(wald_test)(X, disps[i], LFCs[i], mu[:, i], D, self.contrast_idx) + for i in range(n) + ) + end = time.time() + print(f"... done in {end-start:.2f} seconds.\n") + + pvals, stats, se = zip(*res) + + self.p_values = pd.Series(pvals, index=self.LFCs.index) + self.statistics = pd.Series(stats, index=self.LFCs.index) + self.SE = pd.Series(se, index=self.LFCs.index) + + # Account for possible all_zeroes due to outlier refitting in DESeqDataSet + if self.dds.refit_cooks and self.dds.replaced.sum() > 0: + + self.SE.loc[self.dds.new_all_zeroes[self.dds.new_all_zeroes].index] = 0 + self.statistics.loc[ + self.dds.new_all_zeroes[self.dds.new_all_zeroes].index + ] = 0 + self.p_values.loc[self.dds.new_all_zeroes[self.dds.new_all_zeroes].index] = 1 def lfc_shrink(self): """LFC shrinkage with an apeGLM prior [2]_. @@ -232,72 +336,6 @@ def lfc_shrink(self): self.results_df["lfcSE"] = self.SE / np.log(2) return self.results_df - def run_wald_test(self): - """Perform a Wald test. - - Get gene-wise p-values for gene over/under-expression.` - """ - - n = len(self.LFCs) - p = self.dds.design_matrix.shape[1] - - # Raise a warning if LFCs are shrunk. - if self.shrunk_LFCs: - print( - "Note: running Wald test on shrunk LFCs. " - "Some sequencing datasets show better performance with the testing " - "separated from the use of the LFC prior." - ) - - # Get the LFC column corresponding to the design variable. - idx = self.LFCs.columns.get_loc(self.dds.design_matrix.columns[-1]) - - # Compute means according to the LFC model. - mu = ( - np.exp(self.dds.design_matrix @ self.LFCs.T) - .multiply(self.dds.size_factors, 0) - .values - ) - - # Set regularization factors. - if self.prior_disp_var is not None: - D = np.diag(1 / self.prior_disp_var**2) - else: - D = np.diag(np.repeat(1e-6, p)) - - X = self.dds.design_matrix.values - disps = self.dds.dispersions.values - LFCs = self.LFCs.values - - print("Running Wald tests...") - start = time.time() - with parallel_backend("loky", inner_max_num_threads=1): - res = Parallel( - n_jobs=self.n_processes, - verbose=self.joblib_verbosity, - batch_size=self.batch_size, - )( - delayed(wald_test)(X, disps[i], LFCs[i], mu[:, i], D, idx) - for i in range(n) - ) - end = time.time() - print(f"... done in {end-start:.2f} seconds.\n") - - pvals, stats, se = zip(*res) - - self.p_values = pd.Series(pvals, index=self.LFCs.index) - self.statistics = pd.Series(stats, index=self.LFCs.index) - self.SE = pd.Series(se, index=self.LFCs.index) - - # Account for possible all_zeroes due to outlier refitting in DESeqDataSet - if self.dds.refit_cooks and self.dds.replaced.sum() > 0: - - self.SE.loc[self.dds.new_all_zeroes[self.dds.new_all_zeroes].index] = 0 - self.statistics.loc[ - self.dds.new_all_zeroes[self.dds.new_all_zeroes].index - ] = 0 - self.p_values.loc[self.dds.new_all_zeroes[self.dds.new_all_zeroes].index] = 1 - def _independent_filtering(self): """Compute adjusted p-values using independent filtering. @@ -368,7 +406,7 @@ def _cooks_filtering(self): self.run_wald_test() m, p = self.dds.design_matrix.shape - if p == 2: + if p == 2: # TODO : adapt to multifactor designs # If for a gene there are 3 samples or more that have more counts than the # maximum cooks sample, don't count this gene as an outlier. # Do this only if there are 2 cohorts. @@ -426,6 +464,8 @@ def _fit_prior_var(self, min_var=1e-6, max_var=400): float Estimated prior variance. """ + + # TODO : adapt to multifactor keep = ~self.LFCs.iloc[:, 1].isna() S = self.LFCs[keep].iloc[:, 1] ** 2 D = self.SE[keep] ** 2 From 6b2ebecf9c4e2669981eed38ba03f6e9f52aa88c Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 2 Jan 2023 17:24:38 +0100 Subject: [PATCH 06/25] fix(DeseqStats): handle multi-factor designs in _cooks_filtering --- pydeseq2/DeseqStats.py | 57 ++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index b3f0fc9f..19f5ab6b 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -406,45 +406,48 @@ def _cooks_filtering(self): self.run_wald_test() m, p = self.dds.design_matrix.shape - if p == 2: # TODO : adapt to multifactor designs - # If for a gene there are 3 samples or more that have more counts than the - # maximum cooks sample, don't count this gene as an outlier. - # Do this only if there are 2 cohorts. - cooks_cutoff = f.ppf(0.99, p, m - p) + cooks_cutoff = f.ppf(0.99, p, m - p) + # If for a gene there are 3 samples or more that have more counts than the + # maximum cooks sample, don't count this gene as an outlier. + # Do this only if there are 2 cohorts. + if p == 2: # Check whether cohorts have enough samples to allow refitting # Only consider conditions with 3 or more samples (same as in R) n_or_more = ( - self.dds.design_matrix[self.dds.design_matrix.columns[-1]].value_counts() - >= 3 + self.dds.design_matrix.iloc[:, self.contrast_idx].value_counts() >= 3 ) use_for_max = pd.Series( - n_or_more[self.dds.design_matrix[self.dds.design_matrix.columns[-1]]] + n_or_more[self.dds.design_matrix.iloc[:, self.contrast_idx]] ) use_for_max.index = self.dds.design_matrix.index - # Take into account whether we already replaced outliers - if self.dds.refit_cooks and self.dds.replaced.sum() > 0: - cooks_outlier = ( - self.dds.replace_cooks.loc[:, use_for_max] > cooks_cutoff - ).any(axis=1) + else: + use_for_max = pd.Series(True, index=self.dds.design_matrix.index) - else: - cooks_outlier = (self.dds.cooks.loc[:, use_for_max] > cooks_cutoff).any( - axis=1 - ) + # Take into account whether we already replaced outliers + if self.dds.refit_cooks and self.dds.replaced.sum() > 0: + cooks_outlier = ( + self.dds.replace_cooks.loc[:, use_for_max] > cooks_cutoff + ).any(axis=1) - pos = self.dds.cooks[cooks_outlier].to_numpy().argmax(1) - cooks_outlier.update( - ( - self.dds.counts.loc[:, cooks_outlier] - > self.dds.counts.loc[:, cooks_outlier].to_numpy()[ - pos, np.arange(len(pos)) - ] - ).sum(0) - < 3 + else: + cooks_outlier = (self.dds.cooks.loc[:, use_for_max] > cooks_cutoff).any( + axis=1 ) - self.p_values[cooks_outlier] = np.nan + + pos = self.dds.cooks[cooks_outlier].to_numpy().argmax(1) + cooks_outlier.update( + ( + self.dds.counts.loc[:, cooks_outlier] + > self.dds.counts.loc[:, cooks_outlier].to_numpy()[ + pos, np.arange(len(pos)) + ] + ).sum(0) + < 3 + ) + + self.p_values[cooks_outlier] = np.nan def _fit_prior_var(self, min_var=1e-6, max_var=400): """Estimate the prior variance of the apeGLM model. From 442bac30df56513e9c05ba1d60d493cea4e32f15 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Tue, 3 Jan 2023 12:17:15 +0100 Subject: [PATCH 07/25] feat: add multifactor support for apeglm lfc shrinkage --- pydeseq2/DeseqStats.py | 37 +++++++++++++++++++++++++-------- pydeseq2/utils.py | 47 ++++++++++++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index 19f5ab6b..53cdbb26 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -217,6 +217,7 @@ def run_wald_test(self): alternative_level = "_".join([self.contrast[0], self.contrast[2]]) self.contrast_idx = self.LFCs.columns.get_loc(alternative_level) self.LFCs.iloc[:, self.contrast_idx] *= -1 + self._change_lfc_sign = True pass # Compute means according to the LFC model. @@ -269,6 +270,7 @@ def lfc_shrink(self): """LFC shrinkage with an apeGLM prior [2]_. Shrinks LFCs using a heavy-tailed Cauchy prior, leaving p-values unchanged. + TODO : for now, shrinks the LFCs of the variable given by self.contrast Returns ------- @@ -306,6 +308,8 @@ def lfc_shrink(self): offset, prior_no_shrink_scale, prior_scale, + "L-BFGS-B", + self.contrast_idx, ) for i in range(m) ) @@ -314,27 +318,43 @@ def lfc_shrink(self): lfcs, inv_hessians, l_bfgs_b_converged_ = zip(*res) - self.LFCs.iloc[:, 1] = pd.Series( - np.array(lfcs)[:, 1], index=self.dds.dispersions[nonzero].index + self.LFCs.iloc[:, self.contrast_idx] = pd.Series( + np.array(lfcs)[:, self.contrast_idx], + index=self.dds.dispersions[nonzero].index, ) self.SE = pd.Series( - np.array([np.sqrt(np.abs(inv_hess[1, 1])) for inv_hess in inv_hessians]), + np.array( + [ + np.sqrt(np.abs(inv_hess[self.contrast_idx, self.contrast_idx])) + for inv_hess in inv_hessians + ] + ), index=self.dds.dispersions[nonzero].index, ) self._lcf_shrink_converged = pd.Series( np.array(l_bfgs_b_converged_), index=self.dds.dispersions[nonzero].index ) + # Change sign to comply with contrast, if needed + if self._change_lfc_sign: + self.LFCs.iloc[:, self.contrast_idx] *= -1 + # Set a flag to indicate that LFCs were shrunk self.shrunk_LFCs = True # Replace in results dataframe, if it exists if hasattr(self, "results_df"): - self.results_df["log2FoldChange"] = self.LFCs[ - self.dds.design_matrix.columns[-1] + self.results_df["log2FoldChange"] = self.LFCs.iloc[ + :, self.contrast_idx ] / np.log(2) self.results_df["lfcSE"] = self.SE / np.log(2) - return self.results_df + + print( + f"Shrunk Log2 fold change & Wald test p-value: " + f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}" + ) + + display(self.results_df) def _independent_filtering(self): """Compute adjusted p-values using independent filtering. @@ -468,9 +488,8 @@ def _fit_prior_var(self, min_var=1e-6, max_var=400): Estimated prior variance. """ - # TODO : adapt to multifactor - keep = ~self.LFCs.iloc[:, 1].isna() - S = self.LFCs[keep].iloc[:, 1] ** 2 + keep = ~self.LFCs.iloc[:, self.contrast_idx].isna() + S = self.LFCs[keep].iloc[:, self.contrast_idx] ** 2 D = self.SE[keep] ** 2 def objective(a): diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index becec9b0..611c4403 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -950,18 +950,25 @@ def nbinomGLM( converged: bool Whether L-BFGS-B converged. """ - beta_init = np.array([0.1, -0.1]) - no_shrink_index = 1 - shrink_index + + p = design_matrix.shape[-1] + + shrink_mask = np.zeros(p) + shrink_mask[shrink_index] = 1 + no_shrink_mask = np.ones(p) - shrink_mask + + beta_init = np.ones(p) * 0.1 * (-1) ** (np.arange(p)) # Set optimization scale scale_cnst = nbinomFn( - np.array([0, 0]), + np.zeros(p), design_matrix, counts, size, offset, prior_no_shrink_scale, prior_scale, + shrink_index, ) scale_cnst = np.maximum(scale_cnst, 1) @@ -976,6 +983,7 @@ def f(beta, cnst=scale_cnst): offset, prior_no_shrink_scale, prior_scale, + shrink_index, ) / cnst ) @@ -983,11 +991,9 @@ def f(beta, cnst=scale_cnst): def df(beta, cnst=scale_cnst): # Gradient of the function to optimize xbeta = design_matrix @ beta - d_neg_prior = np.array( - [ - beta[no_shrink_index] / prior_no_shrink_scale**2, - 2 * beta[shrink_index] / (prior_scale**2 + beta[shrink_index] ** 2), - ] + d_neg_prior = ( + beta * no_shrink_mask / prior_no_shrink_scale**2 + + 2 * beta * shrink_mask / (prior_scale**2 + beta[shrink_index] ** 2), ) d_nll = ( @@ -998,18 +1004,21 @@ def df(beta, cnst=scale_cnst): def ddf(beta, cnst=scale_cnst): # Hessian of the function to optimize + # Note: will only work if there is a single shrink index xbeta = design_matrix @ beta exp_xbeta_off = np.exp(xbeta + offset) frac = (counts + size) * size * exp_xbeta_off / (size + exp_xbeta_off) ** 2 + # Build diagonal h11 = 1 / prior_no_shrink_scale**2 h22 = ( 2 * (prior_scale**2 - beta[shrink_index] ** 2) / (prior_scale**2 + beta[shrink_index] ** 2) ** 2 ) - return ( - 1 / cnst * ((design_matrix.T * frac) @ design_matrix + np.diag([h11, h22])) - ) + + h = np.diag(no_shrink_mask * h11 + shrink_mask * h22) + + return 1 / cnst * ((design_matrix.T * frac) @ design_matrix + np.diag(h)) res = minimize( f, @@ -1022,8 +1031,9 @@ def ddf(beta, cnst=scale_cnst): beta = res.x converged = res.success - if not converged: + if not converged and p == 2: # If the solver failed, fit using grid search (slow) + # Only for single-factor analysis beta = grid_fit_shrink_beta( counts, offset, @@ -1088,11 +1098,16 @@ def nbinomFn( Sum of the NB negative likelihood and apeGLM prior. """ - no_shrink_index = 1 - shrink_index + p = design_matrix.shape[-1] + + shrink_mask = np.zeros(p) + shrink_mask[shrink_index] = 1 + no_shrink_mask = np.ones(p) - shrink_mask + xbeta = design_matrix @ beta - prior = beta[no_shrink_index] ** 2 / (2 * prior_no_shrink_scale**2) + np.log1p( - (beta[shrink_index] / prior_scale) ** 2 - ) + prior = ( + (beta * no_shrink_mask) ** 2 / (2 * prior_no_shrink_scale**2) + ).sum() + np.log1p((beta[shrink_index] / prior_scale) ** 2) nll = ( counts * xbeta - (counts + size) * np.logaddexp(xbeta + offset, np.log(size)) From 3f688cec8bdce6e0a039ff48f3a7df0bf1e4afe1 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Wed, 4 Jan 2023 14:46:06 +0100 Subject: [PATCH 08/25] fix: convert design_factor to list if a string is provided --- pydeseq2/DeseqDataSet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index 7184824b..27f2d5f2 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -182,7 +182,9 @@ def __init__( raise ValueError( "The count matrix and clinical data should have the same index." ) - self.design_factor = design_factor + self.design_factor = ( + [design_factor] if isinstance(design_factor, str) else design_factor + ) if self.clinical[self.design_factor].isna().any().any(): raise ValueError("NaNs are not allowed in the design factor.") self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str) From 5c952f39f0bc822359dd08993b9a5fe92b631c07 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 5 Jan 2023 11:36:44 +0100 Subject: [PATCH 09/25] fix: correct mu initialisation in multi-factor case --- pydeseq2/DeseqDataSet.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index 27f2d5f2..c6ec59b0 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -251,6 +251,8 @@ def fit_genewise_dispersions(self): Fits a negative binomial per gene, independently. """ + p = self.design_matrix.shape[-1] + # Check that size factors are available. If not, compute them. if not hasattr(self, "size_factors"): self.fit_size_factors() @@ -269,22 +271,43 @@ def fit_genewise_dispersions(self): rough_disps = self._rough_dispersions.loc[self.non_zero_genes].values size_factors = self.size_factors.values - with parallel_backend("loky", inner_max_num_threads=1): - mu_hat_ = np.array( - Parallel( + if len(self.design_matrix.value_counts()) == p: + with parallel_backend("loky", inner_max_num_threads=1): + mu_hat_ = np.array( + Parallel( + n_jobs=self.n_processes, + verbose=self.joblib_verbosity, + batch_size=self.batch_size, + )( + delayed(fit_lin_mu)( + counts_nonzero[:, i], + size_factors, + X, + self.min_mu, + ) + for i in range(m) + ) + ) + else: + with parallel_backend("loky", inner_max_num_threads=1): + res = Parallel( n_jobs=self.n_processes, verbose=self.joblib_verbosity, batch_size=self.batch_size, )( - delayed(fit_lin_mu)( + delayed(irls_solver)( counts_nonzero[:, i], size_factors, X, + rough_disps[i], self.min_mu, + self.beta_tol, ) for i in range(m) ) - ) + + _, mu_hat_, _, _ = zip(*res) + mu_hat_ = np.array(mu_hat_) self._mu_hat = pd.DataFrame( mu_hat_.T, From e31af6f503a0e90578df8bbc367e87b9916efcb3 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 5 Jan 2023 11:40:41 +0100 Subject: [PATCH 10/25] fix: fix design matrix building in multifactor case --- pydeseq2/utils.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 611c4403..d3855a0c 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -73,7 +73,7 @@ def load_data( # Load data if cancer_type == "synthetic": datasets_path = Path(pydeseq2.__file__).parent.parent / "tests" - path_to_data = datasets_path / "data" + path_to_data = datasets_path / "data/single_factor/" if modality == "raw_counts": df = pd.read_csv( @@ -192,7 +192,6 @@ def build_design_matrix( ref = "_".join([factor, np.sort(np.unique(clinical_df[factor]).astype(str))[0]]) ref_level = design_matrix.pop(ref) else: # Put reference level last - ref = "_".join([factor, ref]) try: ref_level = design_matrix.pop(f"{factor}_{ref}") except KeyError as e: @@ -315,7 +314,7 @@ def irls_solver( min_beta=-30, max_beta=30, optimizer="L-BFGS-B", - maxiter=100, + maxiter=250, ): r"""Fit a NB GLM wit log-link to predict counts from the design matrix. @@ -684,6 +683,7 @@ def trimmed_variance(x, trim=0.125, axis=1): return 1.51 * trimmed_mean(sqerror, trim=trim, axis=axis) +# TODO : bug here in multi-factor? def fit_lin_mu(y, size_factors, X, min_mu=0.5): """Estimate mean of negative binomial model using a linear regression. @@ -760,10 +760,8 @@ def wald_test(X, disp, lfc, mu, D, idx=-1): H = np.linalg.inv(M + D) S = H @ M @ H # Evaluate standard error and Wald statistic - c = np.zeros(lfc.shape) - c[idx] = 1 - wald_se = np.sqrt(c.T @ S @ c) - wald_statistic = c.T @ lfc / wald_se + wald_se = np.sqrt(S[idx, idx]) + wald_statistic = lfc[idx] / wald_se wald_p_value = 2 * norm.sf(np.abs(wald_statistic)) return wald_p_value, wald_statistic, wald_se From e850e5e5a98f07de90b2368138f85c54fad3f43f Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 5 Jan 2023 14:32:26 +0100 Subject: [PATCH 11/25] feat: add multifactor for shrinkage --- pydeseq2/DeseqStats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index 53cdbb26..cd017b4c 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -144,6 +144,7 @@ def __init__( self.n_processes = get_num_processes(n_cpus) self.batch_size = batch_size self.joblib_verbosity = joblib_verbosity + self._change_lfc_sign = False def summary(self): """Run the statistical analysis. @@ -270,7 +271,6 @@ def lfc_shrink(self): """LFC shrinkage with an apeGLM prior [2]_. Shrinks LFCs using a heavy-tailed Cauchy prior, leaving p-values unchanged. - TODO : for now, shrinks the LFCs of the variable given by self.contrast Returns ------- From 416c817458a367c6581eb812e3af220ea8eff759 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 5 Jan 2023 14:42:21 +0100 Subject: [PATCH 12/25] docs: fix docstrings --- pydeseq2/DeseqStats.py | 2 +- pydeseq2/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index cd017b4c..f43431aa 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -33,7 +33,7 @@ class DeseqStats: A list of three strings, in the following format: ['variable_of_interest', 'tested_level', 'reference_level']. Names must correspond to the clinical data passed to the DeseqDataSet. - (default: None) + (default: None). alpha : float P-value and adjusted p-value significance threshold (usually 0.05). diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index d3855a0c..ed56c46e 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -183,7 +183,7 @@ def build_design_matrix( for factor in design: # Check that each factor has exactly 2 levels if len(np.unique(clinical_df[factor])) != 2: - print( + raise ValueError( f"Factors should take exactly two values, but {factor} " f"takes values {np.unique(clinical_df[factor])}." ) @@ -355,7 +355,7 @@ def irls_solver( maxiter : int Maximum number of IRLS iterations to perform before switching to L-BFGS-B. - (default: 100) + (default: 250). Returns ------- From 0c95c24bed8147090a847e8f97823eaec49b80c1 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 5 Jan 2023 14:43:55 +0100 Subject: [PATCH 13/25] CI: adapt tests to multi-factor --- .../data/multi_factor/r_test_dispersions.csv | 11 ++ .../multi_factor/r_test_lfc_shrink_res.csv | 11 ++ tests/data/multi_factor/r_test_res.csv | 11 ++ .../data/multi_factor/r_test_size_factors.csv | 101 +++++++++++++++ tests/data/multi_factor/test_clinical.csv | 101 +++++++++++++++ tests/data/multi_factor/test_counts.csv | 11 ++ .../r_test_dispersions.csv | 0 .../r_test_lfc_shrink_res.csv | 0 tests/data/{ => single_factor}/r_test_res.csv | 0 .../r_test_size_factors.csv | 0 .../{ => single_factor}/test_clinical.csv | 0 .../data/{ => single_factor}/test_counts.csv | 0 tests/test_edge_cases.py | 7 +- tests/test_pydeseq2.py | 122 ++++++++++++++++-- 14 files changed, 359 insertions(+), 16 deletions(-) create mode 100644 tests/data/multi_factor/r_test_dispersions.csv create mode 100644 tests/data/multi_factor/r_test_lfc_shrink_res.csv create mode 100644 tests/data/multi_factor/r_test_res.csv create mode 100644 tests/data/multi_factor/r_test_size_factors.csv create mode 100644 tests/data/multi_factor/test_clinical.csv create mode 100644 tests/data/multi_factor/test_counts.csv rename tests/data/{ => single_factor}/r_test_dispersions.csv (100%) rename tests/data/{ => single_factor}/r_test_lfc_shrink_res.csv (100%) rename tests/data/{ => single_factor}/r_test_res.csv (100%) rename tests/data/{ => single_factor}/r_test_size_factors.csv (100%) rename tests/data/{ => single_factor}/test_clinical.csv (100%) rename tests/data/{ => single_factor}/test_counts.csv (100%) diff --git a/tests/data/multi_factor/r_test_dispersions.csv b/tests/data/multi_factor/r_test_dispersions.csv new file mode 100644 index 00000000..605f3f73 --- /dev/null +++ b/tests/data/multi_factor/r_test_dispersions.csv @@ -0,0 +1,11 @@ +"","x" +"gene1",0.571719582382478 +"gene2",0.241375722473924 +"gene3",0.913605156809054 +"gene4",0.127658580621073 +"gene5",0.211606088717701 +"gene6",0.837078542249143 +"gene7",0.274382560982857 +"gene8",0.186512153143256 +"gene9",0.170002121905015 +"gene10",0.342711171818431 diff --git a/tests/data/multi_factor/r_test_lfc_shrink_res.csv b/tests/data/multi_factor/r_test_lfc_shrink_res.csv new file mode 100644 index 00000000..bcd28eda --- /dev/null +++ b/tests/data/multi_factor/r_test_lfc_shrink_res.csv @@ -0,0 +1,11 @@ +"","baseMean","log2FoldChange","lfcSE","pvalue","padj" +"gene1",9.44232157508205,0.389505312989051,0.233070832511343,0.0331949459795556,0.0663898919591112 +"gene2",26.1212486463654,0.516704007917007,0.15435772629484,0.000225787635678376,0.000752625452261255 +"gene3",4.62191544640228,0.401164044699898,0.299289163191974,0.0514994764604093,0.0801582060957779 +"gene4",82.9432080307415,-0.750790557729475,0.110197174492455,1.70031075718514e-12,1.70031075718514e-11 +"gene5",33.4270160403599,0.455162916421329,0.142621472299145,0.000479064424940753,0.00119766106235188 +"gene6",4.62829876327365,0.168302927105627,0.240775492001819,0.334331479132399,0.371479421258221 +"gene7",29.9443838771296,-0.0763398443562887,0.14657708559393,0.558527389642911,0.558527389642911 +"gene8",41.9394265054559,-0.675597885085072,0.13474015798614,9.57931201208725e-08,4.78965600604363e-07 +"gene9",40.5509417857322,-0.217683996486262,0.122943395355584,0.0561107442670445,0.0801582060957779 +"gene10",12.6444689173846,0.224250777853187,0.174611123633433,0.133787774158417,0.167234717698021 diff --git a/tests/data/multi_factor/r_test_res.csv b/tests/data/multi_factor/r_test_res.csv new file mode 100644 index 00000000..ca867cba --- /dev/null +++ b/tests/data/multi_factor/r_test_res.csv @@ -0,0 +1,11 @@ +"","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj" +"gene1",9.44232157508205,0.508071523102971,0.238562875490962,2.12971746780534,0.0331949459795556,0.0663898919591112 +"gene2",26.1212486463654,0.565039762155875,0.153199302487514,3.68826589273752,0.000225787635678376,0.000752625452261255 +"gene3",4.62191544640228,0.602314731765745,0.30930854668438,1.947294176712,0.0514994764604093,0.0801582060957779 +"gene4",82.9432080307415,-0.770740313328172,0.109215102865638,-7.05708544976949,1.70031075718514e-12,1.70031075718514e-11 +"gene5",33.4270160403599,0.49815287339304,0.142647406538608,3.49219719783839,0.000479064424940753,0.00119766106235188 +"gene6",4.62829876327365,0.28690257968296,0.297177180644456,0.965426009698275,0.334331479132399,0.371479421258221 +"gene7",29.9443838771296,-0.0937974387199766,0.160329216758082,-0.585030231024618,0.558527389642911,0.558527389642911 +"gene8",41.9394265054559,-0.711205449520001,0.133321144309024,-5.33452854163556,9.57931201208725e-08,4.78965600604363e-07 +"gene9",40.5509417857322,-0.243736404799899,0.127599022943467,-1.91017453878065,0.0561107442670445,0.0801582060957779 +"gene10",12.6444689173846,0.2817116030926,0.187891530240353,1.49933103813797,0.133787774158417,0.167234717698021 diff --git a/tests/data/multi_factor/r_test_size_factors.csv b/tests/data/multi_factor/r_test_size_factors.csv new file mode 100644 index 00000000..8d60910e --- /dev/null +++ b/tests/data/multi_factor/r_test_size_factors.csv @@ -0,0 +1,101 @@ +"","x" +"sample1",0.94383204955008 +"sample2",1.04976550254112 +"sample3",0.911638462733079 +"sample4",1.29876776473799 +"sample5",0.679595312414821 +"sample6",0.764054516302446 +"sample7",0.801706248933874 +"sample8",1.1696677607208 +"sample9",1.00580106237394 +"sample10",0.999052126721534 +"sample11",0.988776432861989 +"sample12",1.25484456354867 +"sample13",0.857756357861922 +"sample14",0.842698187394378 +"sample15",0.779778507147202 +"sample16",1.28385972471753 +"sample17",0.994514686617904 +"sample18",1.79349056643856 +"sample19",1.32455815041249 +"sample20",0.752607424834414 +"sample21",0.731992662070059 +"sample22",1.87146841715328 +"sample23",1.29876776473799 +"sample24",0.974723133934002 +"sample25",0.788330562401193 +"sample26",1.48157916357968 +"sample27",0.779778507147202 +"sample28",1.22327156234668 +"sample29",1.19608774985009 +"sample30",1.07726160809632 +"sample31",0.89914691404938 +"sample32",1.09168991000608 +"sample33",0.801706248933874 +"sample34",0.605037341533548 +"sample35",1.40360131286496 +"sample36",1.19608774985009 +"sample37",1.16855396610962 +"sample38",1.28385972471753 +"sample39",0.828762238848253 +"sample40",0.898887666238171 +"sample41",1.33200681233305 +"sample42",1.11453631236031 +"sample43",1.22327156234668 +"sample44",0.94383204955008 +"sample45",0.89914691404938 +"sample46",0.994514686617904 +"sample47",0.95143343738075 +"sample48",1.19886255206584 +"sample49",0.801136830886645 +"sample50",1.70788656585253 +"sample51",0.697135868638152 +"sample52",1.02214009457951 +"sample53",1.07866519948581 +"sample54",1.46414662196525 +"sample55",0.764054516302446 +"sample56",2.32053426877511 +"sample57",1.30482299983646 +"sample58",0.89914691404938 +"sample59",0.799241701377227 +"sample60",1.4985781900823 +"sample61",0.853943282926263 +"sample62",1.22327156234668 +"sample63",0.911638462733079 +"sample64",1.39867297741015 +"sample65",1.12360958279771 +"sample66",1.09895733939369 +"sample67",0.857756357861922 +"sample68",1.1417201248569 +"sample69",0.896745283219282 +"sample70",0.974723133934002 +"sample71",1.21349834942153 +"sample72",1.11453631236031 +"sample73",0.733962937408007 +"sample74",0.71826060700182 +"sample75",1.04570380295723 +"sample76",0.731992662070059 +"sample77",0.605037341533548 +"sample78",0.815514374897786 +"sample79",1.65821256229216 +"sample80",0.773511422925036 +"sample81",1.46094382467857 +"sample82",0.988776432861989 +"sample83",0.801706248933874 +"sample84",0.699336488705073 +"sample85",0.94383204955008 +"sample86",0.71826060700182 +"sample87",0.799241701377227 +"sample88",1.04976550254112 +"sample89",0.924249624884157 +"sample90",1.28970135698058 +"sample91",0.911638462733079 +"sample92",0.975990216093412 +"sample93",0.857756357861922 +"sample94",0.699336488705073 +"sample95",0.801136830886645 +"sample96",1.61799779922871 +"sample97",1.20865668607816 +"sample98",0.974723133934002 +"sample99",0.557708694910521 +"sample100",1.25844273273344 diff --git a/tests/data/multi_factor/test_clinical.csv b/tests/data/multi_factor/test_clinical.csv new file mode 100644 index 00000000..71a24451 --- /dev/null +++ b/tests/data/multi_factor/test_clinical.csv @@ -0,0 +1,101 @@ +"","condition","group" +"sample1","A","X" +"sample2","A","Y" +"sample3","A","X" +"sample4","A","Y" +"sample5","A","X" +"sample6","A","Y" +"sample7","A","X" +"sample8","A","Y" +"sample9","A","X" +"sample10","A","Y" +"sample11","A","X" +"sample12","A","Y" +"sample13","A","X" +"sample14","A","Y" +"sample15","A","X" +"sample16","A","Y" +"sample17","A","X" +"sample18","A","Y" +"sample19","A","X" +"sample20","A","Y" +"sample21","A","X" +"sample22","A","Y" +"sample23","A","X" +"sample24","A","Y" +"sample25","A","X" +"sample26","A","Y" +"sample27","A","X" +"sample28","A","Y" +"sample29","A","X" +"sample30","A","Y" +"sample31","A","X" +"sample32","A","Y" +"sample33","A","X" +"sample34","A","Y" +"sample35","A","X" +"sample36","A","Y" +"sample37","A","X" +"sample38","A","Y" +"sample39","A","X" +"sample40","A","Y" +"sample41","A","X" +"sample42","A","Y" +"sample43","A","X" +"sample44","A","Y" +"sample45","A","X" +"sample46","A","Y" +"sample47","A","X" +"sample48","A","Y" +"sample49","A","X" +"sample50","A","Y" +"sample51","B","X" +"sample52","B","Y" +"sample53","B","X" +"sample54","B","Y" +"sample55","B","X" +"sample56","B","Y" +"sample57","B","X" +"sample58","B","Y" +"sample59","B","X" +"sample60","B","Y" +"sample61","B","X" +"sample62","B","Y" +"sample63","B","X" +"sample64","B","Y" +"sample65","B","X" +"sample66","B","Y" +"sample67","B","X" +"sample68","B","Y" +"sample69","B","X" +"sample70","B","Y" +"sample71","B","X" +"sample72","B","Y" +"sample73","B","X" +"sample74","B","Y" +"sample75","B","X" +"sample76","B","Y" +"sample77","B","X" +"sample78","B","Y" +"sample79","B","X" +"sample80","B","Y" +"sample81","B","X" +"sample82","B","Y" +"sample83","B","X" +"sample84","B","Y" +"sample85","B","X" +"sample86","B","Y" +"sample87","B","X" +"sample88","B","Y" +"sample89","B","X" +"sample90","B","Y" +"sample91","B","X" +"sample92","B","Y" +"sample93","B","X" +"sample94","B","Y" +"sample95","B","X" +"sample96","B","Y" +"sample97","B","X" +"sample98","B","Y" +"sample99","B","X" +"sample100","B","Y" diff --git a/tests/data/multi_factor/test_counts.csv b/tests/data/multi_factor/test_counts.csv new file mode 100644 index 00000000..538ecd7c --- /dev/null +++ b/tests/data/multi_factor/test_counts.csv @@ -0,0 +1,11 @@ +"","sample1","sample2","sample3","sample4","sample5","sample6","sample7","sample8","sample9","sample10","sample11","sample12","sample13","sample14","sample15","sample16","sample17","sample18","sample19","sample20","sample21","sample22","sample23","sample24","sample25","sample26","sample27","sample28","sample29","sample30","sample31","sample32","sample33","sample34","sample35","sample36","sample37","sample38","sample39","sample40","sample41","sample42","sample43","sample44","sample45","sample46","sample47","sample48","sample49","sample50","sample51","sample52","sample53","sample54","sample55","sample56","sample57","sample58","sample59","sample60","sample61","sample62","sample63","sample64","sample65","sample66","sample67","sample68","sample69","sample70","sample71","sample72","sample73","sample74","sample75","sample76","sample77","sample78","sample79","sample80","sample81","sample82","sample83","sample84","sample85","sample86","sample87","sample88","sample89","sample90","sample91","sample92","sample93","sample94","sample95","sample96","sample97","sample98","sample99","sample100" +"gene1",12,1,0,3,1,12,0,22,15,8,5,9,18,10,29,2,2,7,0,6,5,12,11,12,4,17,11,2,1,15,13,8,2,4,17,6,9,4,0,18,5,4,7,1,8,3,6,5,9,14,14,13,5,9,2,14,24,11,4,7,6,4,2,3,11,5,12,11,21,5,6,32,15,7,15,23,7,33,24,17,4,5,16,7,9,33,4,5,9,11,9,7,5,10,2,8,6,6,3,2 +"gene2",21,47,9,16,10,17,13,30,37,9,22,16,11,41,13,32,10,21,17,10,14,26,12,10,23,37,26,17,37,15,12,14,8,20,16,21,26,9,18,20,62,16,41,21,27,20,39,28,21,38,36,33,24,39,17,59,37,39,26,29,19,47,31,57,25,18,2,15,26,26,27,11,13,68,33,58,42,30,46,20,12,22,7,6,21,21,25,40,48,38,24,33,17,51,31,36,32,10,42,28 +"gene3",4,2,4,1,11,4,5,1,8,5,7,0,10,10,7,8,3,0,6,1,0,1,3,5,3,2,2,4,1,0,0,4,3,2,2,1,12,4,10,3,1,7,0,1,0,3,3,1,3,0,31,0,1,11,2,7,2,2,10,8,0,8,11,2,4,12,4,11,10,1,4,10,1,0,1,4,9,1,18,3,2,1,7,6,3,12,0,3,2,7,0,2,0,0,5,13,12,2,0,5 +"gene4",130,43,119,42,78,80,97,55,150,45,177,104,70,62,141,87,205,122,96,51,134,57,172,57,160,50,71,52,197,73,265,35,120,41,103,51,219,87,199,60,108,57,135,44,173,67,179,95,141,98,57,22,80,50,84,41,125,33,80,38,108,32,61,30,64,50,45,15,98,8,94,54,65,26,58,29,41,50,110,44,99,40,71,29,112,47,97,48,32,22,119,50,104,38,52,33,77,62,25,43 +"gene5",18,14,34,50,18,16,23,16,20,72,23,36,15,71,16,26,4,74,38,18,21,58,10,27,30,60,21,43,14,50,22,25,23,15,17,30,22,16,39,30,29,42,21,11,23,62,23,12,13,44,20,57,40,21,26,72,13,57,12,79,43,40,20,13,47,65,57,63,10,44,52,67,20,33,30,21,23,58,76,91,44,33,23,58,23,36,21,64,8,37,21,28,27,23,38,48,14,73,16,51 +"gene6",0,10,9,1,0,2,2,5,0,0,7,16,7,0,6,12,1,18,2,12,7,0,9,2,0,2,0,0,0,2,6,3,2,8,3,3,11,0,4,1,1,6,2,2,0,9,6,1,8,2,6,0,6,3,1,4,4,4,4,2,12,1,5,2,10,0,0,4,4,4,3,17,9,5,1,2,5,4,4,10,1,2,7,1,4,4,3,1,6,1,8,8,3,8,2,6,18,9,5,6 +"gene7",16,70,20,46,10,18,9,30,15,24,47,51,22,9,20,26,8,46,26,41,39,48,18,25,15,38,20,18,17,29,25,28,39,83,36,61,44,61,14,32,58,21,27,46,46,18,47,22,12,52,33,11,15,17,16,27,27,9,48,53,15,12,27,53,19,69,22,42,23,25,25,21,55,22,27,11,48,19,24,14,22,13,46,13,25,38,23,11,71,41,17,25,22,42,12,45,31,25,6,37 +"gene8",54,38,33,75,51,58,21,88,62,103,8,41,55,11,55,17,36,72,53,87,16,69,96,95,26,35,14,104,69,102,31,49,14,121,70,83,63,60,30,56,31,51,52,62,28,36,18,62,29,124,10,37,25,53,20,84,46,32,26,55,9,21,33,41,38,15,28,41,35,33,41,55,12,26,44,28,13,45,29,28,41,25,20,54,23,26,19,38,38,36,33,56,15,16,29,44,19,29,14,26 +"gene9",49,65,32,37,25,25,48,38,37,116,36,86,30,31,24,80,46,43,53,69,24,61,66,56,29,57,12,45,44,38,35,99,74,9,60,44,28,52,19,18,49,41,45,94,24,38,35,42,59,35,14,35,41,57,25,50,48,66,26,35,18,45,12,56,55,80,53,42,12,64,52,41,27,10,19,22,18,30,61,12,72,46,23,43,50,24,23,10,34,26,18,69,27,25,40,61,58,52,25,44 +"gene10",3,10,13,13,9,24,19,25,7,10,14,9,19,2,11,15,10,15,7,6,10,21,13,11,6,1,21,21,2,7,9,48,7,4,3,20,10,11,18,1,6,17,6,6,9,15,7,12,2,26,2,38,3,20,23,38,29,9,8,15,29,15,14,14,18,11,9,14,1,7,12,20,21,7,8,8,4,8,19,29,16,10,15,7,3,4,8,16,4,30,31,6,12,7,8,12,17,20,10,15 diff --git a/tests/data/r_test_dispersions.csv b/tests/data/single_factor/r_test_dispersions.csv similarity index 100% rename from tests/data/r_test_dispersions.csv rename to tests/data/single_factor/r_test_dispersions.csv diff --git a/tests/data/r_test_lfc_shrink_res.csv b/tests/data/single_factor/r_test_lfc_shrink_res.csv similarity index 100% rename from tests/data/r_test_lfc_shrink_res.csv rename to tests/data/single_factor/r_test_lfc_shrink_res.csv diff --git a/tests/data/r_test_res.csv b/tests/data/single_factor/r_test_res.csv similarity index 100% rename from tests/data/r_test_res.csv rename to tests/data/single_factor/r_test_res.csv diff --git a/tests/data/r_test_size_factors.csv b/tests/data/single_factor/r_test_size_factors.csv similarity index 100% rename from tests/data/r_test_size_factors.csv rename to tests/data/single_factor/r_test_size_factors.csv diff --git a/tests/data/test_clinical.csv b/tests/data/single_factor/test_clinical.csv similarity index 100% rename from tests/data/test_clinical.csv rename to tests/data/single_factor/test_clinical.csv diff --git a/tests/data/test_counts.csv b/tests/data/single_factor/test_counts.csv similarity index 100% rename from tests/data/test_counts.csv rename to tests/data/single_factor/test_counts.csv diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py index febbb294..4e85b9f3 100644 --- a/tests/test_edge_cases.py +++ b/tests/test_edge_cases.py @@ -18,10 +18,10 @@ def test_zero_genes(): test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve()) counts_df = pd.read_csv( - os.path.join(test_path, "data/test_counts.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/test_counts.csv"), index_col=0 ).T clinical_df = pd.read_csv( - os.path.join(test_path, "data/test_clinical.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/test_clinical.csv"), index_col=0 ) n, m = counts_df.shape @@ -40,7 +40,8 @@ def test_zero_genes(): assert dds.LFCs.loc[zero_genes].isna().all().all() res = DeseqStats(dds) - res_df = res.summary() + res.summary() + res_df = res.results_df # check that the corresponding stats are NaN assert (res_df.loc[zero_genes].baseMean == 0).all() diff --git a/tests/test_pydeseq2.py b/tests/test_pydeseq2.py index 77d0b67b..08a161e1 100644 --- a/tests/test_pydeseq2.py +++ b/tests/test_pydeseq2.py @@ -8,6 +8,8 @@ from pydeseq2.DeseqDataSet import DeseqDataSet from pydeseq2.DeseqStats import DeseqStats +# Single-factor tests + def test_deseq(tol=0.02): """Test that the outputs of the DESeq2 function match those of the original R @@ -17,19 +19,22 @@ def test_deseq(tol=0.02): test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve()) counts_df = pd.read_csv( - os.path.join(test_path, "data/test_counts.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/test_counts.csv"), index_col=0 ).T clinical_df = pd.read_csv( - os.path.join(test_path, "data/test_clinical.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/test_clinical.csv"), index_col=0 ) - r_res = pd.read_csv(os.path.join(test_path, "data/r_test_res.csv"), index_col=0) + r_res = pd.read_csv( + os.path.join(test_path, "data/single_factor/r_test_res.csv"), index_col=0 + ) dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition") dds.deseq2() res = DeseqStats(dds) - res_df = res.summary() + res.summary() + res_df = res.results_df # check that the same p-values are NaN assert (res_df.pvalue.isna() == r_res.pvalue.isna()).all() @@ -49,25 +54,29 @@ def test_lfc_shrinkage(tol=0.02): """ test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve()) - r_res = pd.read_csv(os.path.join(test_path, "data/r_test_res.csv"), index_col=0) + r_res = pd.read_csv( + os.path.join(test_path, "data/single_factor/r_test_res.csv"), index_col=0 + ) r_shrunk_res = pd.read_csv( - os.path.join(test_path, "data/r_test_lfc_shrink_res.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/r_test_lfc_shrink_res.csv"), + index_col=0, ) counts_df = pd.read_csv( - os.path.join(test_path, "data/test_counts.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/test_counts.csv"), index_col=0 ).T clinical_df = pd.read_csv( - os.path.join(test_path, "data/test_clinical.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/test_clinical.csv"), index_col=0 ) r_size_factors = pd.read_csv( - os.path.join(test_path, "data/r_test_size_factors.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/r_test_size_factors.csv"), + index_col=0, ).squeeze() r_dispersions = pd.read_csv( - os.path.join(test_path, "data/r_test_dispersions.csv"), index_col=0 + os.path.join(test_path, "data/single_factor/r_test_dispersions.csv"), index_col=0 ).squeeze() dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition") @@ -79,14 +88,101 @@ def test_lfc_shrinkage(tol=0.02): res = DeseqStats(dds) res.summary() res.SE = r_res.lfcSE * np.log(2) - shrunk_res = res.lfc_shrink() + res.lfc_shrink() + shrunk_res = res.results_df - # Check that the same LFC and lfcSE are found (up to tol) + # Check that the same LFC are found (up to tol) assert ( abs(r_shrunk_res.log2FoldChange - shrunk_res.log2FoldChange) / abs(r_shrunk_res.log2FoldChange) ).max() < tol + +# Multi-factor tests + + +def test_multifactor_deseq(tol=0.02): + """Test that the outputs of the DESeq2 function match those of the original R + package, up to a tolerance in relative error. + """ + + test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve()) + + counts_df = pd.read_csv( + os.path.join(test_path, "data/multi_factor/test_counts.csv"), index_col=0 + ).T + clinical_df = pd.read_csv( + os.path.join(test_path, "data/multi_factor/test_clinical.csv"), index_col=0 + ) + + r_res = pd.read_csv( + os.path.join(test_path, "data/multi_factor/r_test_res.csv"), index_col=0 + ) + + dds = DeseqDataSet(counts_df, clinical_df, design_factor=["group", "condition"]) + dds.deseq2() + + res = DeseqStats(dds) + res.summary() + res_df = res.results_df + + # check that the same p-values are NaN + assert (res_df.pvalue.isna() == r_res.pvalue.isna()).all() + assert (res_df.padj.isna() == r_res.padj.isna()).all() + + # Check that the same LFC, p-values and adjusted p-values are found (up to tol) assert ( - abs(r_shrunk_res.lfcSE - shrunk_res.lfcSE) / abs(r_shrunk_res.lfcSE) + abs(r_res.log2FoldChange - res_df.log2FoldChange) / abs(r_res.log2FoldChange) + ).max() < tol + assert (abs(r_res.pvalue - res_df.pvalue) / r_res.pvalue).max() < tol + assert (abs(r_res.padj - res_df.padj) / r_res.padj).max() < tol + + +def test_multifactor_lfc_shrinkage(tol=0.02): + """Test that the outputs of the lfc_shrink function match those of the original + R package (starting from the same inputs), up to a tolerance in relative error. + """ + + test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve()) + r_res = pd.read_csv( + os.path.join(test_path, "data/multi_factor/r_test_res.csv"), index_col=0 + ) + r_shrunk_res = pd.read_csv( + os.path.join(test_path, "data/multi_factor/r_test_lfc_shrink_res.csv"), + index_col=0, + ) + + counts_df = pd.read_csv( + os.path.join(test_path, "data/multi_factor/test_counts.csv"), index_col=0 + ).T + + clinical_df = pd.read_csv( + os.path.join(test_path, "data/multi_factor/test_clinical.csv"), index_col=0 + ) + + r_size_factors = pd.read_csv( + os.path.join(test_path, "data/multi_factor/r_test_size_factors.csv"), + index_col=0, + ).squeeze() + + r_dispersions = pd.read_csv( + os.path.join(test_path, "data/multi_factor/r_test_dispersions.csv"), index_col=0 + ).squeeze() + + dds = DeseqDataSet(counts_df, clinical_df, design_factor=["group", "condition"]) + dds.deseq2() + dds.size_factors = r_size_factors + dds.dispersions = r_dispersions + dds.LFCs.iloc[:, 1] = r_res.log2FoldChange.values * np.log(2) + + res = DeseqStats(dds) + res.summary() + res.SE = r_res.lfcSE * np.log(2) + res.lfc_shrink() + shrunk_res = res.results_df + + # Check that the same LFC found (up to tol) + assert ( + abs(r_shrunk_res.log2FoldChange - shrunk_res.log2FoldChange) + / abs(r_shrunk_res.log2FoldChange) ).max() < tol From 3b5f59d7824cdba8d3cb93a185174868e18edbf6 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 5 Jan 2023 15:47:53 +0100 Subject: [PATCH 14/25] docs: update docstrings and add line comments --- pydeseq2/DeseqDataSet.py | 11 +++++++++-- pydeseq2/DeseqStats.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index c6ec59b0..a612299a 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -46,7 +46,7 @@ class DeseqDataSet: design_factor : str or list[str] Name of the columns of clinical to be used as design variables. If a list, the last factor will be considered the variable of interest by default. - (default: 'high_grade'). + Only bi-level factors are supported. (default: 'high_grade'). reference_level : str The factor to use as a reference. Must be one of the values taken by the design. @@ -182,14 +182,17 @@ def __init__( raise ValueError( "The count matrix and clinical data should have the same index." ) + + # Convert design factor to list if a single string was provided. self.design_factor = ( [design_factor] if isinstance(design_factor, str) else design_factor ) + if self.clinical[self.design_factor].isna().any().any(): raise ValueError("NaNs are not allowed in the design factor.") self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str) - # Build the design matrix (splits the dataset in two cohorts) + # Build the design matrix (splits the dataset in cohorts) self.design_matrix = build_design_matrix( self.clinical, design_factor, @@ -271,6 +274,10 @@ def fit_genewise_dispersions(self): rough_disps = self._rough_dispersions.loc[self.non_zero_genes].values size_factors = self.size_factors.values + # mu_hat is initialized differently depending on the number of different factor + # groups. If there are as many different factor combinations as design factors + # (intercept included), it is fitted with a linear model, otherwise it is fitted + # with a GLM (using rough dispersion estimates). if len(self.design_matrix.value_counts()) == p: with parallel_backend("loky", inner_max_num_threads=1): mu_hat_ = np.array( diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index f43431aa..0f8d0aba 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -123,7 +123,7 @@ def __init__( # Build contrast if None if contrast is not None: - # TODO : tests on the constrast + # TODO : tests on the contrast self.contrast = contrast else: factor = self.dds.design_factor[-1] From 1963663e8cfe6398539966de3bf5d6cfc2249e2b Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 5 Jan 2023 15:51:26 +0100 Subject: [PATCH 15/25] refactor: change design_factor to design_factors --- notebooks/PyDESeq2_minimal_example.ipynb | 2 +- .../PyDESeq2_step_by_step_pipeline.ipynb | 2 +- pydeseq2/DeseqDataSet.py | 18 +++++++++-------- pydeseq2/DeseqStats.py | 2 +- tests/test_edge_cases.py | 20 +++++++++---------- tests/test_pydeseq2.py | 8 ++++---- 6 files changed, 27 insertions(+), 25 deletions(-) diff --git a/notebooks/PyDESeq2_minimal_example.ipynb b/notebooks/PyDESeq2_minimal_example.ipynb index b3852938..fbb201d4 100644 --- a/notebooks/PyDESeq2_minimal_example.ipynb +++ b/notebooks/PyDESeq2_minimal_example.ipynb @@ -198,7 +198,7 @@ "dds = DeseqDataSet(\n", " counts_df,\n", " clinical_df,\n", - " design_factor=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n", + " design_factors=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n", " refit_cooks=True,\n", " n_cpus=8,\n", ")" diff --git a/notebooks/PyDESeq2_step_by_step_pipeline.ipynb b/notebooks/PyDESeq2_step_by_step_pipeline.ipynb index 41243d1d..b338ab8b 100644 --- a/notebooks/PyDESeq2_step_by_step_pipeline.ipynb +++ b/notebooks/PyDESeq2_step_by_step_pipeline.ipynb @@ -196,7 +196,7 @@ "dds = DeseqDataSet(\n", " counts_df,\n", " clinical_df,\n", - " design_factor=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n", + " design_factors=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n", " refit_cooks=True,\n", " n_cpus=8,\n", ")" diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index a612299a..77b86fe8 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -43,7 +43,7 @@ class DeseqDataSet: DataFrame containing clinical information. Must be indexed by sample barcodes. - design_factor : str or list[str] + design_factors : str or list[str] Name of the columns of clinical to be used as design variables. If a list, the last factor will be considered the variable of interest by default. Only bi-level factors are supported. (default: 'high_grade'). @@ -156,7 +156,7 @@ def __init__( self, counts, clinical, - design_factor="high_grade", + design_factors="high_grade", reference_level=None, min_mu=0.5, min_disp=1e-8, @@ -184,18 +184,20 @@ def __init__( ) # Convert design factor to list if a single string was provided. - self.design_factor = ( - [design_factor] if isinstance(design_factor, str) else design_factor + self.design_factors = ( + [design_factors] if isinstance(design_factors, str) else design_factors ) - if self.clinical[self.design_factor].isna().any().any(): + if self.clinical[self.design_factors].isna().any().any(): raise ValueError("NaNs are not allowed in the design factor.") - self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str) + self.clinical[self.design_factors] = self.clinical[self.design_factors].astype( + str + ) # Build the design matrix (splits the dataset in cohorts) self.design_matrix = build_design_matrix( self.clinical, - design_factor, + design_factors, ref=reference_level, expanded=False, intercept=True, @@ -704,7 +706,7 @@ def _refit_without_outliers( sub_dds = DeseqDataSet( counts=self.counts_to_refit, clinical=self.clinical, - design_factor=self.design_factor, + design_factors=self.design_factors, min_mu=self.min_mu, min_disp=self.min_disp, max_disp=self.max_disp, diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index 0f8d0aba..f0ba54ed 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -126,7 +126,7 @@ def __init__( # TODO : tests on the contrast self.contrast = contrast else: - factor = self.dds.design_factor[-1] + factor = self.dds.design_factors[-1] levels = np.unique(self.dds.clinical[factor]).astype(str) if "_".join([factor, levels[0]]) == self.dds.design_matrix.columns[-1]: self.contrast = [factor, levels[0], levels[1]] diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py index 4e85b9f3..d06bcd60 100644 --- a/tests/test_edge_cases.py +++ b/tests/test_edge_cases.py @@ -32,7 +32,7 @@ def test_zero_genes(): counts_df[zero_genes] = 0 # Run analysis - dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition") + dds = DeseqDataSet(counts_df, clinical_df, design_factors="condition") dds.deseq2() # check that the corresponding parameters are NaN @@ -59,7 +59,7 @@ def test_nan_counts(): clinical_df = pd.DataFrame({"condition": [0, 1]}) with pytest.raises(ValueError): - DeseqDataSet(counts_df, clinical_df, design_factor="condition") + DeseqDataSet(counts_df, clinical_df, design_factors="condition") def test_numeric_counts(): @@ -70,7 +70,7 @@ def test_numeric_counts(): clinical_df = pd.DataFrame({"condition": [0, 1]}) with pytest.raises(ValueError): - DeseqDataSet(counts_df, clinical_df, design_factor="condition") + DeseqDataSet(counts_df, clinical_df, design_factors="condition") def test_integer_counts(): @@ -80,7 +80,7 @@ def test_integer_counts(): clinical_df = pd.DataFrame({"condition": [0, 1]}) with pytest.raises(ValueError): - DeseqDataSet(counts_df, clinical_df, design_factor="condition") + DeseqDataSet(counts_df, clinical_df, design_factors="condition") def test_non_negative_counts(): @@ -90,7 +90,7 @@ def test_non_negative_counts(): clinical_df = pd.DataFrame({"condition": [0, 1]}) with pytest.raises(ValueError): - DeseqDataSet(counts_df, clinical_df, design_factor="condition") + DeseqDataSet(counts_df, clinical_df, design_factors="condition") # Tests on the clinical data (design factors) @@ -100,7 +100,7 @@ def test_nan_factors(): clinical_df = pd.DataFrame({"condition": [0, np.NaN]}) with pytest.raises(ValueError): - DeseqDataSet(counts_df, clinical_df, design_factor="condition") + DeseqDataSet(counts_df, clinical_df, design_factors="condition") def test_one_factors(): @@ -109,7 +109,7 @@ def test_one_factors(): clinical_df = pd.DataFrame({"condition": [0, 0]}) with pytest.raises(ValueError): - DeseqDataSet(counts_df, clinical_df, design_factor="condition") + DeseqDataSet(counts_df, clinical_df, design_factors="condition") def test_too_many_factors(): @@ -119,7 +119,7 @@ def test_too_many_factors(): clinical_df = pd.DataFrame({"condition": [0, 1, 2]}) with pytest.raises(ValueError): - DeseqDataSet(counts_df, clinical_df, design_factor="condition") + DeseqDataSet(counts_df, clinical_df, design_factors="condition") def test_reference_level(): @@ -130,7 +130,7 @@ def test_reference_level(): with pytest.raises(KeyError): DeseqDataSet( - counts_df, clinical_df, design_factor="condition", reference_level="control" + counts_df, clinical_df, design_factors="condition", reference_level="control" ) @@ -146,5 +146,5 @@ def test_indexes(): DeseqDataSet( counts_df, clinical_df, - design_factor="condition", + design_factors="condition", ) diff --git a/tests/test_pydeseq2.py b/tests/test_pydeseq2.py index 08a161e1..c6e4cdef 100644 --- a/tests/test_pydeseq2.py +++ b/tests/test_pydeseq2.py @@ -29,7 +29,7 @@ def test_deseq(tol=0.02): os.path.join(test_path, "data/single_factor/r_test_res.csv"), index_col=0 ) - dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition") + dds = DeseqDataSet(counts_df, clinical_df, design_factors="condition") dds.deseq2() res = DeseqStats(dds) @@ -79,7 +79,7 @@ def test_lfc_shrinkage(tol=0.02): os.path.join(test_path, "data/single_factor/r_test_dispersions.csv"), index_col=0 ).squeeze() - dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition") + dds = DeseqDataSet(counts_df, clinical_df, design_factors="condition") dds.deseq2() dds.size_factors = r_size_factors dds.dispersions = r_dispersions @@ -119,7 +119,7 @@ def test_multifactor_deseq(tol=0.02): os.path.join(test_path, "data/multi_factor/r_test_res.csv"), index_col=0 ) - dds = DeseqDataSet(counts_df, clinical_df, design_factor=["group", "condition"]) + dds = DeseqDataSet(counts_df, clinical_df, design_factors=["group", "condition"]) dds.deseq2() res = DeseqStats(dds) @@ -169,7 +169,7 @@ def test_multifactor_lfc_shrinkage(tol=0.02): os.path.join(test_path, "data/multi_factor/r_test_dispersions.csv"), index_col=0 ).squeeze() - dds = DeseqDataSet(counts_df, clinical_df, design_factor=["group", "condition"]) + dds = DeseqDataSet(counts_df, clinical_df, design_factors=["group", "condition"]) dds.deseq2() dds.size_factors = r_size_factors dds.dispersions = r_dispersions From ae19ca27ca0b589645ef5971028d8d8f461103b8 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Thu, 5 Jan 2023 15:52:43 +0100 Subject: [PATCH 16/25] docs: update README --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b92cb484..c7b27074 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,9 @@ It aims to facilitate DEA experiments for python users. As PyDESeq2 is a re-implementation of [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) from scratch, you may experience some differences in terms of retrieved values or available features. -Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor analysis, -but we plan to implement more in the near future. In case there is a feature you would particularly like to be implemented, feel free to open an issue. +Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor +and paired multi-factor analysis, but we plan to implement more in the near future. +In case there is a feature you would particularly like to be implemented, feel free to open an issue. ## Installation From a7dc5fe063839de0f4efd68cd999ec674517f4ae Mon Sep 17 00:00:00 2001 From: Boris Muzellec Date: Mon, 9 Jan 2023 10:24:13 +0100 Subject: [PATCH 17/25] docs: update docstring (list of strings argument) Co-authored-by: Maria Telenczuk --- pydeseq2/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index ed56c46e..a9f1ab27 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -152,7 +152,7 @@ def build_design_matrix( DataFrame containing clinical information. Must be indexed by sample barcodes, and contain a "high_grade" column. - design : str or list + design : str or list[str] Name of the columns of clinical_df to be used as design_matrix variables. (default: "high_grade"). From e5364518043f4edf9dccaf314ad8020bc7ba2b87 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 9 Jan 2023 10:51:46 +0100 Subject: [PATCH 18/25] refactor: change single-letter variable names to more informative ones --- pydeseq2/DeseqDataSet.py | 34 +++++++++++++-------------- pydeseq2/DeseqStats.py | 24 ++++++++++--------- pydeseq2/utils.py | 51 +++++++++++++++++++++------------------- 3 files changed, 57 insertions(+), 52 deletions(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index 77b86fe8..4e4d0038 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -256,7 +256,7 @@ def fit_genewise_dispersions(self): Fits a negative binomial per gene, independently. """ - p = self.design_matrix.shape[-1] + num_vars = self.design_matrix.shape[-1] # Check that size factors are available. If not, compute them. if not hasattr(self, "size_factors"): @@ -269,7 +269,7 @@ def fit_genewise_dispersions(self): non_zero = ~(self.counts == 0).all() self.non_zero_genes = non_zero[non_zero].index counts_nonzero = self.counts.loc[:, self.non_zero_genes].values - m = counts_nonzero.shape[1] + num_genes = counts_nonzero.shape[1] # Convert design_matrix to numpy for speed X = self.design_matrix.values @@ -280,7 +280,7 @@ def fit_genewise_dispersions(self): # groups. If there are as many different factor combinations as design factors # (intercept included), it is fitted with a linear model, otherwise it is fitted # with a GLM (using rough dispersion estimates). - if len(self.design_matrix.value_counts()) == p: + if len(self.design_matrix.value_counts()) == num_vars: with parallel_backend("loky", inner_max_num_threads=1): mu_hat_ = np.array( Parallel( @@ -294,7 +294,7 @@ def fit_genewise_dispersions(self): X, self.min_mu, ) - for i in range(m) + for i in range(num_genes) ) ) else: @@ -312,7 +312,7 @@ def fit_genewise_dispersions(self): self.min_mu, self.beta_tol, ) - for i in range(m) + for i in range(num_genes) ) _, mu_hat_, _, _ = zip(*res) @@ -336,7 +336,7 @@ def fit_genewise_dispersions(self): self.min_disp, self.max_disp, ) - for i in range(m) + for i in range(num_genes) ) end = time.time() print(f"... done in {end - start:.2f} seconds.\n") @@ -431,8 +431,8 @@ def fit_dispersion_prior(self): self.fit_dispersion_trend() # Exclude genes with all zeroes - m = len(self.non_zero_genes) - p = self.design_matrix.shape[1] + num_genes = len(self.non_zero_genes) + num_vars = self.design_matrix.shape[1] # Fit dispersions to the curve, and compute log residuals disp_residuals = np.log(self.genewise_dispersions) - np.log( @@ -448,7 +448,7 @@ def fit_dispersion_prior(self): disp_residuals.loc[self.non_zero_genes][above_min_disp] ).median() ** 2 / norm.ppf(0.75) self.prior_disp_var = np.maximum( - self._squared_logres - polygamma(1, (m - p) / 2), 0.25 + self._squared_logres - polygamma(1, (num_genes - num_vars) / 2), 0.25 ) def fit_MAP_dispersions(self): @@ -463,7 +463,7 @@ def fit_MAP_dispersions(self): # Exclude genes with all zeroes counts_nonzero = self.counts.loc[:, self.non_zero_genes].values - m = counts_nonzero.shape[1] + num_genes = counts_nonzero.shape[1] X = self.design_matrix.values mu_hat = self._mu_hat.values @@ -488,7 +488,7 @@ def fit_MAP_dispersions(self): True, True, ) - for i in range(m) + for i in range(num_genes) ) end = time.time() print(f"... done in {end-start:.2f} seconds.\n") @@ -524,7 +524,7 @@ def fit_LFC(self): # Exclude genes with all zeroes counts_nonzero = self.counts.loc[:, self.non_zero_genes].values - m = counts_nonzero.shape[1] + num_genes = counts_nonzero.shape[1] X = self.design_matrix.values size_factors = self.size_factors.values @@ -546,7 +546,7 @@ def fit_LFC(self): self.min_mu, self.beta_tol, ) - for i in range(m) + for i in range(num_genes) ) end = time.time() print(f"... done in {end-start:.2f} seconds.\n") @@ -593,7 +593,7 @@ def calculate_cooks(self): if not hasattr(self, "dispersions"): self.fit_MAP_dispersions() - p = self.design_matrix.shape[1] + num_vars = self.design_matrix.shape[1] # Keep only non-zero genes normed_counts = self.counts.loc[:, self.non_zero_genes].div(self.size_factors, 0) dispersions = robust_method_of_moments_disp(normed_counts, self.design_matrix) @@ -603,7 +603,7 @@ def calculate_cooks(self): ) ** 2 / V self.cooks = ( squared_pearson_res - / p + / num_vars * (self._hat_diagonals / (1 - self._hat_diagonals) ** 2) ).T @@ -644,7 +644,7 @@ def _replace_outliers(self): if not hasattr(self, "cooks"): self.calculate_cooks() - m, p = self.design_matrix.shape + num_samples, num_vars = self.design_matrix.shape # Check whether cohorts have enough samples to allow refitting n_or_more = ( @@ -657,7 +657,7 @@ def _replace_outliers(self): self.replaceable.index = self.design_matrix.index # Get positions of counts with cooks above threshold - cooks_cutoff = f.ppf(0.99, p, m - p) + cooks_cutoff = f.ppf(0.99, num_vars, num_samples - num_vars) idx = (self.cooks > cooks_cutoff).T self.replaced = idx.any(axis=0) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index f0ba54ed..34b5392f 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -192,8 +192,8 @@ def run_wald_test(self): Get gene-wise p-values for gene over/under-expression.` """ - n = len(self.LFCs) - p = self.dds.design_matrix.shape[1] + num_genes = len(self.LFCs) + num_vars = self.dds.design_matrix.shape[1] # Raise a warning if LFCs are shrunk. if self.shrunk_LFCs: @@ -230,9 +230,9 @@ def run_wald_test(self): # Set regularization factors. if self.prior_disp_var is not None: - D = np.diag(1 / self.prior_disp_var**2) + ridge_factor = np.diag(1 / self.prior_disp_var**2) else: - D = np.diag(np.repeat(1e-6, p)) + ridge_factor = np.diag(np.repeat(1e-6, num_vars)) X = self.dds.design_matrix.values disps = self.dds.dispersions.values @@ -246,8 +246,10 @@ def run_wald_test(self): verbose=self.joblib_verbosity, batch_size=self.batch_size, )( - delayed(wald_test)(X, disps[i], LFCs[i], mu[:, i], D, self.contrast_idx) - for i in range(n) + delayed(wald_test)( + X, disps[i], LFCs[i], mu[:, i], ridge_factor, self.contrast_idx + ) + for i in range(num_genes) ) end = time.time() print(f"... done in {end-start:.2f} seconds.\n") @@ -281,7 +283,7 @@ def lfc_shrink(self): # Filter genes with all zero counts nonzero = self.dds.counts.sum(0) > 0 counts_nonzero = self.dds.counts.loc[:, nonzero].values - m = counts_nonzero.shape[1] + num_genes = counts_nonzero.shape[1] size = (1.0 / self.dds.dispersions[nonzero]).values offset = np.log(self.dds.size_factors).values @@ -311,7 +313,7 @@ def lfc_shrink(self): "L-BFGS-B", self.contrast_idx, ) - for i in range(m) + for i in range(num_genes) ) end = time.time() print(f"... done in {end-start:.2f} seconds.\n") @@ -425,13 +427,13 @@ def _cooks_filtering(self): if not hasattr(self, "p_values"): self.run_wald_test() - m, p = self.dds.design_matrix.shape - cooks_cutoff = f.ppf(0.99, p, m - p) + num_samples, num_vars = self.dds.design_matrix.shape + cooks_cutoff = f.ppf(0.99, num_vars, num_samples - num_vars) # If for a gene there are 3 samples or more that have more counts than the # maximum cooks sample, don't count this gene as an outlier. # Do this only if there are 2 cohorts. - if p == 2: + if num_vars == 2: # Check whether cohorts have enough samples to allow refitting # Only consider conditions with 3 or more samples (same as in R) n_or_more = ( diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index a9f1ab27..25feee73 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -372,7 +372,7 @@ def irls_solver( assert optimizer in ["BFGS", "L-BFGS-B"] - p = design_matrix.shape[1] + num_vars = design_matrix.shape[1] # converting to numpy for speed if type(counts) == pd.Series: @@ -385,7 +385,7 @@ def irls_solver( X = design_matrix # if full rank, estimate initial betas for IRLS below - if np.linalg.matrix_rank(X) == p: + if np.linalg.matrix_rank(X) == num_vars: Q, R = np.linalg.qr(X) y = np.log(counts / size_factors + 0.1) beta_init = solve(R, Q.T @ y) @@ -394,7 +394,7 @@ def irls_solver( dev = 1000 dev_ratio = 1.0 - D = np.diag(np.repeat(1e-6, p)) + ridge_factor = np.diag(np.repeat(1e-6, num_vars)) mu = np.maximum(size_factors * np.exp(X @ beta), min_mu) converged = True @@ -403,7 +403,7 @@ def irls_solver( W = mu / (1.0 + mu * disp) z = np.log(mu / size_factors) + (counts - mu) / mu - H = (X.T * W) @ X + D + H = (X.T * W) @ X + ridge_factor beta_hat = solve(H, X.T @ (W * z), assume_a="pos") i += 1 @@ -412,14 +412,14 @@ def irls_solver( def f(beta): # closure to minimize mu_ = np.maximum(size_factors * np.exp(X @ beta), min_mu) - return nb_nll(counts, mu_, disp) + 0.5 * (D @ beta**2).sum() + return nb_nll(counts, mu_, disp) + 0.5 * (ridge_factor @ beta**2).sum() def df(beta): mu_ = np.maximum(size_factors * np.exp(X @ beta), min_mu) return ( -X.T @ counts + ((1 / disp + counts) * mu_ / (1 / disp + mu_)) @ X - + D @ beta + + ridge_factor @ beta ) res = minimize( @@ -427,14 +427,16 @@ def df(beta): beta_init, jac=df, method=optimizer, - bounds=[(min_beta, max_beta)] * p if optimizer == "L-BFGS-B" else None, + bounds=[(min_beta, max_beta)] * num_vars + if optimizer == "L-BFGS-B" + else None, ) beta = res.x mu = np.maximum(size_factors * np.exp(X @ beta), min_mu) converged = res.success - if not res.success and p <= 2: + if not res.success and num_vars <= 2: beta = grid_fit_beta( counts, size_factors, @@ -455,7 +457,7 @@ def df(beta): # Compute H diagonal (useful for Cook distance outlier filtering) W = mu / (1.0 + mu * disp) W_sq = np.sqrt(W) - XtWX = (X.T * W) @ X + D + XtWX = (X.T * W) @ X + ridge_factor H = W_sq * np.diag(X @ np.linalg.inv(XtWX) @ X.T) * W_sq # Return an UNthresholded mu (as in the R code) # Previous quantities are estimated with a threshold though @@ -683,7 +685,6 @@ def trimmed_variance(x, trim=0.125, axis=1): return 1.51 * trimmed_mean(sqerror, trim=trim, axis=axis) -# TODO : bug here in multi-factor? def fit_lin_mu(y, size_factors, X, min_mu=0.5): """Estimate mean of negative binomial model using a linear regression. @@ -716,7 +717,7 @@ def fit_lin_mu(y, size_factors, X, min_mu=0.5): return np.maximum(mu_hat, min_mu) -def wald_test(X, disp, lfc, mu, D, idx=-1): +def wald_test(X, disp, lfc, mu, ridge_factor, idx=-1): """Run Wald test for differential expression. Computes Wald statistics, standard error and p-values from @@ -736,7 +737,7 @@ def wald_test(X, disp, lfc, mu, D, idx=-1): mu : float Mean estimation for the NB model. - D : ndarray + ridge_factor : ndarray Regularization factors. idx : int @@ -757,7 +758,7 @@ def wald_test(X, disp, lfc, mu, D, idx=-1): # Build covariance matrix estimator W = np.diag(mu / (1 + mu * disp)) M = X.T @ W @ X - H = np.linalg.inv(M + D) + H = np.linalg.inv(M + ridge_factor) S = H @ M @ H # Evaluate standard error and Wald statistic wald_se = np.sqrt(S[idx, idx]) @@ -789,7 +790,7 @@ def fit_rough_dispersions(counts, size_factors, design_matrix): Estimated dispersion parameter for each gene. """ - m, p = design_matrix.shape + num_samples, num_vars = design_matrix.shape normed_counts = counts.div(size_factors, 0) # Exclude genes with all zeroes normed_counts = normed_counts.loc[:, ~(normed_counts == 0).all()] @@ -797,7 +798,9 @@ def fit_rough_dispersions(counts, size_factors, design_matrix): reg.fit(design_matrix, normed_counts) y_hat = reg.predict(design_matrix) y_hat = np.maximum(y_hat, 1) - alpha_rde = (((normed_counts - y_hat) ** 2 - y_hat) / ((m - p) * y_hat**2)).sum(0) + alpha_rde = ( + ((normed_counts - y_hat) ** 2 - y_hat) / ((num_samples - num_vars) * y_hat**2) + ).sum(0) return np.maximum(alpha_rde, 0) @@ -949,17 +952,17 @@ def nbinomGLM( Whether L-BFGS-B converged. """ - p = design_matrix.shape[-1] + num_vars = design_matrix.shape[-1] - shrink_mask = np.zeros(p) + shrink_mask = np.zeros(num_vars) shrink_mask[shrink_index] = 1 - no_shrink_mask = np.ones(p) - shrink_mask + no_shrink_mask = np.ones(num_vars) - shrink_mask - beta_init = np.ones(p) * 0.1 * (-1) ** (np.arange(p)) + beta_init = np.ones(num_vars) * 0.1 * (-1) ** (np.arange(num_vars)) # Set optimization scale scale_cnst = nbinomFn( - np.zeros(p), + np.zeros(num_vars), design_matrix, counts, size, @@ -1029,7 +1032,7 @@ def ddf(beta, cnst=scale_cnst): beta = res.x converged = res.success - if not converged and p == 2: + if not converged and num_vars == 2: # If the solver failed, fit using grid search (slow) # Only for single-factor analysis beta = grid_fit_shrink_beta( @@ -1096,11 +1099,11 @@ def nbinomFn( Sum of the NB negative likelihood and apeGLM prior. """ - p = design_matrix.shape[-1] + num_vars = design_matrix.shape[-1] - shrink_mask = np.zeros(p) + shrink_mask = np.zeros(num_vars) shrink_mask[shrink_index] = 1 - no_shrink_mask = np.ones(p) - shrink_mask + no_shrink_mask = np.ones(num_vars) - shrink_mask xbeta = design_matrix @ beta prior = ( From cd85df801d768c18abe41198d1ba71d80793c606 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 9 Jan 2023 11:13:44 +0100 Subject: [PATCH 19/25] fix: change attribute name from design_factor to design_factors everywhere --- pydeseq2/DeseqDataSet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index b279ba13..b7fa113f 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -188,9 +188,11 @@ def __init__( self.design_factors = ( [design_factors] if isinstance(design_factors, str) else design_factors ) - if self.clinical[self.design_factor].isna().any(): + if self.clinical[self.design_factors].isna().any(): raise ValueError("NaNs are not allowed in the design factor.") - self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str) + self.clinical[self.design_factors] = self.clinical[self.design_factors].astype( + str + ) # Build the design matrix (splits the dataset in cohorts) self.design_matrix = build_design_matrix( From a2b39cc305cf67c2bea518ccc5e6adcfaf023530 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 9 Jan 2023 11:19:37 +0100 Subject: [PATCH 20/25] fix: fix design factor test --- pydeseq2/DeseqDataSet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index b7fa113f..1d26324e 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -188,7 +188,7 @@ def __init__( self.design_factors = ( [design_factors] if isinstance(design_factors, str) else design_factors ) - if self.clinical[self.design_factors].isna().any(): + if self.clinical[self.design_factors].isna().any().any(): raise ValueError("NaNs are not allowed in the design factor.") self.clinical[self.design_factors] = self.clinical[self.design_factors].astype( str From 8c2197a349d45c5a089b23981c58e0f270c3c66a Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 9 Jan 2023 11:52:32 +0100 Subject: [PATCH 21/25] docs: expand the description of the "contrast" attribute --- pydeseq2/DeseqStats.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index 34b5392f..9e07f010 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -33,6 +33,9 @@ class DeseqStats: A list of three strings, in the following format: ['variable_of_interest', 'tested_level', 'reference_level']. Names must correspond to the clinical data passed to the DeseqDataSet. + E.g., ['condition', 'B', 'A'] will measure the LFC of 'condition B' compared to + 'condition A'. If None, the last variable from the design matrix is chosen + as the variable of interest, and the reference level is picked alphabetically. (default: None). alpha : float From 39f4211014399175ce08cc18e24141ffd5f08936 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 9 Jan 2023 12:00:14 +0100 Subject: [PATCH 22/25] feat: add tests on the contrast variable --- pydeseq2/DeseqStats.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index 9e07f010..67021ed5 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -124,11 +124,16 @@ def __init__( self.dds = dds - # Build contrast if None - if contrast is not None: - # TODO : tests on the contrast + if contrast is not None: # Test contrast if provided + assert ( + contrast[0] in self.dds.design_factors + ), "The contrast variable should be one of the design factors." + assert ( + contrast[1] in self.dds.clinical[contrast[0]].values + and contrast[2] in self.dds.clinical[contrast[0]].values + ), "The contrast levels should correspond to design factors levels." self.contrast = contrast - else: + else: # Build contrast if None factor = self.dds.design_factors[-1] levels = np.unique(self.dds.clinical[factor]).astype(str) if "_".join([factor, levels[0]]) == self.dds.design_matrix.columns[-1]: From b0e35f50c69f4c625a67896f4a9837972fc59fe9 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 9 Jan 2023 13:19:06 +0100 Subject: [PATCH 23/25] feat: add tests on the contrast variable --- pydeseq2/DeseqStats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index 67021ed5..c688f171 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -125,6 +125,7 @@ def __init__( self.dds = dds if contrast is not None: # Test contrast if provided + assert len(contrast) == 3, "The contrast should contain three strings." assert ( contrast[0] in self.dds.design_factors ), "The contrast variable should be one of the design factors." From 3c0ded613d38aacac433983c9871e3f3210c37b5 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 9 Jan 2023 13:46:49 +0100 Subject: [PATCH 24/25] refactor: change variable names for more meaningful ones and give variable names when calling functions --- pydeseq2/DeseqDataSet.py | 62 +++++++++++++++++----------------- pydeseq2/DeseqStats.py | 23 ++++++++----- pydeseq2/grid_search.py | 41 +++++++++++++---------- pydeseq2/utils.py | 72 +++++++++++++++++++++++----------------- 4 files changed, 110 insertions(+), 88 deletions(-) diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index 1d26324e..5e2cfed8 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -289,10 +289,10 @@ def fit_genewise_dispersions(self): batch_size=self.batch_size, )( delayed(fit_lin_mu)( - counts_nonzero[:, i], - size_factors, - X, - self.min_mu, + counts=counts_nonzero[:, i], + size_factors=size_factors, + design_matrix=X, + min_mu=self.min_mu, ) for i in range(num_genes) ) @@ -305,12 +305,12 @@ def fit_genewise_dispersions(self): batch_size=self.batch_size, )( delayed(irls_solver)( - counts_nonzero[:, i], - size_factors, - X, - rough_disps[i], - self.min_mu, - self.beta_tol, + counts=counts_nonzero[:, i], + size_factors=size_factors, + design_matrix=X, + disp=rough_disps[i], + min_mu=self.min_mu, + beta_tol=self.beta_tol, ) for i in range(num_genes) ) @@ -329,12 +329,12 @@ def fit_genewise_dispersions(self): with parallel_backend("loky", inner_max_num_threads=1): res = Parallel(n_jobs=self.n_processes, batch_size=self.batch_size)( delayed(fit_alpha_mle)( - counts_nonzero[:, i], - X, - mu_hat_[i, :], - rough_disps[i], - self.min_disp, - self.max_disp, + counts=counts_nonzero[:, i], + design_matrix=X, + mu=mu_hat_[i, :], + alpha_hat=rough_disps[i], + min_disp=self.min_disp, + max_disp=self.max_disp, ) for i in range(num_genes) ) @@ -478,15 +478,15 @@ def fit_MAP_dispersions(self): batch_size=self.batch_size, )( delayed(fit_alpha_mle)( - counts_nonzero[:, i], - X, - mu_hat[:, i], - fit_disps[i], - self.min_disp, - self.max_disp, - self.prior_disp_var, - True, - True, + counts=counts_nonzero[:, i], + design_matrix=X, + mu=mu_hat[:, i], + alpha_hat=fit_disps[i], + min_disp=self.min_disp, + max_disp=self.max_disp, + prior_disp_var=self.prior_disp_var, + cr_reg=True, + prior_reg=True, ) for i in range(num_genes) ) @@ -539,12 +539,12 @@ def fit_LFC(self): batch_size=self.batch_size, )( delayed(irls_solver)( - counts_nonzero[:, i], - size_factors, - X, - disps[i], - self.min_mu, - self.beta_tol, + counts=counts_nonzero[:, i], + size_factors=size_factors, + design_matrix=X, + disp=disps[i], + min_mu=self.min_mu, + beta_tol=self.beta_tol, ) for i in range(num_genes) ) diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py index c688f171..2100beb8 100644 --- a/pydeseq2/DeseqStats.py +++ b/pydeseq2/DeseqStats.py @@ -256,7 +256,12 @@ def run_wald_test(self): batch_size=self.batch_size, )( delayed(wald_test)( - X, disps[i], LFCs[i], mu[:, i], ridge_factor, self.contrast_idx + design_matrix=X, + disp=disps[i], + lfc=LFCs[i], + mu=mu[:, i], + ridge_factor=ridge_factor, + idx=self.contrast_idx, ) for i in range(num_genes) ) @@ -313,14 +318,14 @@ def lfc_shrink(self): batch_size=self.batch_size, )( delayed(nbinomGLM)( - X, - counts_nonzero[:, i], - size[i], - offset, - prior_no_shrink_scale, - prior_scale, - "L-BFGS-B", - self.contrast_idx, + design_matrix=X, + counts=counts_nonzero[:, i], + size=size[i], + offset=offset, + prior_no_shrink_scale=prior_no_shrink_scale, + prior_scale=prior_scale, + optimizer="L-BFGS-B", + shrink_index=self.contrast_idx, ) for i in range(num_genes) ) diff --git a/pydeseq2/grid_search.py b/pydeseq2/grid_search.py index abb048a6..efac147f 100644 --- a/pydeseq2/grid_search.py +++ b/pydeseq2/grid_search.py @@ -4,14 +4,14 @@ import pydeseq2.utils -def vec_nb_nll(y, mu, alpha): +def vec_nb_nll(counts, mu, alpha): """Return the negative log-likelihood of a negative binomial. Vectorized version. Parameters ---------- - y : ndarray + counts : ndarray Observations. mu : float @@ -24,33 +24,35 @@ def vec_nb_nll(y, mu, alpha): Returns ------- float - Negative log likelihood of the observations y following + Negative log likelihood of the observations counts following :math:`NB(\\mu, \\alpha)`. """ - n = len(y) + n = len(counts) alpha_neg1 = 1 / alpha logbinom = ( - gammaln(y[:, None] + alpha_neg1) - gammaln(y + 1)[:, None] - gammaln(alpha_neg1) + gammaln(counts[:, None] + alpha_neg1) + - gammaln(counts + 1)[:, None] + - gammaln(alpha_neg1) ) if len(mu.shape) == 1: return n * alpha_neg1 * np.log(alpha) + ( -logbinom - + (y[:, None] + alpha_neg1) * np.log(mu[:, None] + alpha_neg1) - - (y * np.log(mu))[:, None] + + (counts[:, None] + alpha_neg1) * np.log(mu[:, None] + alpha_neg1) + - (counts * np.log(mu))[:, None] ).sum(0) else: return n * alpha_neg1 * np.log(alpha) + ( -logbinom - + (y[:, None] + alpha_neg1) * np.log(mu + alpha_neg1) - - (y[:, None] * np.log(mu)) + + (counts[:, None] + alpha_neg1) * np.log(mu + alpha_neg1) + - (counts[:, None] * np.log(mu)) ).sum(0) def grid_fit_alpha( - y, - X, + counts, + design_matrix, mu, alpha_hat, min_disp, @@ -66,10 +68,10 @@ def grid_fit_alpha( Parameters ---------- - y : ndarray + counts : ndarray Raw counts for a given gene. - X : ndarray + design_matrix : ndarray Design matrix. mu : ndarray @@ -113,11 +115,14 @@ def loss(log_alpha): reg = 0 if cr_reg: reg += ( - 0.5 * np.linalg.slogdet((X.T[:, :, None] * W).transpose(2, 0, 1) @ X)[1] + 0.5 + * np.linalg.slogdet( + (design_matrix.T[:, :, None] * W).transpose(2, 0, 1) @ design_matrix + )[1] ) if prior_reg: reg += (np.log(alpha) - np.log(alpha_hat)) ** 2 / (2 * prior_disp_var) - return vec_nb_nll(y, mu, alpha) + reg + return vec_nb_nll(counts, mu, alpha) + reg ll_grid = loss(grid) @@ -135,7 +140,7 @@ def loss(log_alpha): def grid_fit_beta( counts, size_factors, - X, + design_matrix, disp, min_mu=0.5, grid_length=60, @@ -155,7 +160,7 @@ def grid_fit_beta( size_factors : pandas.Series DESeq2 normalization factors. - X : ndarray + design_matrix : ndarray Design matrix. disp : float @@ -184,7 +189,7 @@ def grid_fit_beta( def loss(beta): # closure to minimize - mu = np.maximum(size_factors[:, None] * np.exp(X @ beta.T), min_mu) + mu = np.maximum(size_factors[:, None] * np.exp(design_matrix @ beta.T), min_mu) return vec_nb_nll(counts, mu, disp) + 0.5 * (1e-6 * beta**2).sum(1) for i, x in enumerate(x_grid): diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index 25feee73..cdcd65aa 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -234,14 +234,14 @@ def dispersion_trend(normed_mean, coeffs): return coeffs[0] + coeffs[1] / normed_mean -def nb_nll(y, mu, alpha): +def nb_nll(counts, mu, alpha): """Negative log-likelihood of a negative binomial. Unvectorized version. Parameters ---------- - y : ndarray + counts : ndarray Observations. mu : float @@ -254,27 +254,31 @@ def nb_nll(y, mu, alpha): Returns ------- float - Negative log likelihood of the observations y + Negative log likelihood of the observations counts following :math:`NB(\\mu, \\alpha)`. """ - n = len(y) + n = len(counts) alpha_neg1 = 1 / alpha - logbinom = gammaln(y + alpha_neg1) - gammaln(y + 1) - gammaln(alpha_neg1) + logbinom = gammaln(counts + alpha_neg1) - gammaln(counts + 1) - gammaln(alpha_neg1) return ( n * alpha_neg1 * np.log(alpha) - + (-logbinom + (y + alpha_neg1) * np.log(alpha_neg1 + mu) - y * np.log(mu)).sum() + + ( + -logbinom + + (counts + alpha_neg1) * np.log(alpha_neg1 + mu) + - counts * np.log(mu) + ).sum() ) -def dnb_nll(y, mu, alpha): +def dnb_nll(counts, mu, alpha): """Gradient of the negative log-likelihood of a negative binomial. Unvectorized. Parameters ---------- - y : ndarray + counts : ndarray Observations. mu : float @@ -295,9 +299,9 @@ def dnb_nll(y, mu, alpha): alpha_neg1**2 * ( polygamma(0, alpha_neg1) - - polygamma(0, y + alpha_neg1) + - polygamma(0, counts + alpha_neg1) + np.log(1 + mu * alpha) - + (y - mu) / (mu + alpha_neg1) + + (counts - mu) / (mu + alpha_neg1) ).sum() ) @@ -364,7 +368,7 @@ def irls_solver( mu: ndarray Means estimated from size factors and beta: - :math:`\\mu = s_{ij} \\exp(\\beta^t X)`. + :math:`\\mu = s_{ij} \\exp(\\beta^t design_matrix)`. H: ndarray Diagonal of the :math:`W^{1/2} X (X^t W X)^-1 X^t W^{1/2}` covariance matrix. @@ -466,8 +470,8 @@ def df(beta): def fit_alpha_mle( - y, - X, + counts, + design_matrix, mu, alpha_hat, min_disp, @@ -479,15 +483,15 @@ def fit_alpha_mle( ): """Estimate the dispersion parameter of a negative binomial GLM. - Note: it is possible to pass pandas Series as y, X and mu arguments but using numpy - arrays makes the code significantly faster. + Note: it is possible to pass counts, design_matrix and mu arguments in the form of + pandas Series, but using numpy arrays makes the code significantly faster. Parameters ---------- - y : ndarray + counts : ndarray Raw counts for a given gene. - X : ndarray + design_matrix : ndarray Design matrix. mu : ndarray @@ -537,10 +541,10 @@ def loss(log_alpha): W = mu / (1 + mu * alpha) reg = 0 if cr_reg: - reg += 0.5 * np.linalg.slogdet((X.T * W) @ X)[1] + reg += 0.5 * np.linalg.slogdet((design_matrix.T * W) @ design_matrix)[1] if prior_reg: reg += (np.log(alpha) - np.log(alpha_hat)) ** 2 / (2 * prior_disp_var) - return nb_nll(y, mu, alpha) + reg + return nb_nll(counts, mu, alpha) + reg def dloss(log_alpha): # gradient closure @@ -549,10 +553,16 @@ def dloss(log_alpha): dW = -(W**2) reg_grad = 0 if cr_reg: - reg_grad += 0.5 * (np.linalg.inv((X.T * W) @ X) * ((X.T * dW) @ X)).sum() + reg_grad += ( + 0.5 + * ( + np.linalg.inv((design_matrix.T * W) @ design_matrix) + * ((design_matrix.T * dW) @ design_matrix) + ).sum() + ) if prior_reg: reg_grad += (np.log(alpha) - np.log(alpha_hat)) / (alpha * prior_disp_var) - return dnb_nll(y, mu, alpha) + reg_grad + return dnb_nll(counts, mu, alpha) + reg_grad res = minimize( lambda x: loss(x[0]), @@ -568,7 +578,9 @@ def dloss(log_alpha): return np.exp(res.x[0]), res.success else: return ( - np.exp(grid_fit_alpha(y, X, mu, alpha_hat, min_disp, max_disp)), + np.exp( + grid_fit_alpha(counts, design_matrix, mu, alpha_hat, min_disp, max_disp) + ), res.success, ) @@ -685,20 +697,20 @@ def trimmed_variance(x, trim=0.125, axis=1): return 1.51 * trimmed_mean(sqerror, trim=trim, axis=axis) -def fit_lin_mu(y, size_factors, X, min_mu=0.5): +def fit_lin_mu(counts, size_factors, design_matrix, min_mu=0.5): """Estimate mean of negative binomial model using a linear regression. Used to initialize genewise dispersion models. Parameters ---------- - y : ndarray + counts : ndarray Raw counts for a given gene. size_factors : ndarray Sample-wise scaling factors (obtained from median-of-ratios). - X : ndarray + design_matrix : ndarray Design matrix. min_mu : float @@ -711,13 +723,13 @@ def fit_lin_mu(y, size_factors, X, min_mu=0.5): """ reg = LinearRegression(fit_intercept=False) - reg.fit(X, y / size_factors) - mu_hat = size_factors * reg.predict(X) + reg.fit(design_matrix, counts / size_factors) + mu_hat = size_factors * reg.predict(design_matrix) # Threshold mu_hat as 1/mu_hat will be used later on. return np.maximum(mu_hat, min_mu) -def wald_test(X, disp, lfc, mu, ridge_factor, idx=-1): +def wald_test(design_matrix, disp, lfc, mu, ridge_factor, idx=-1): """Run Wald test for differential expression. Computes Wald statistics, standard error and p-values from @@ -725,7 +737,7 @@ def wald_test(X, disp, lfc, mu, ridge_factor, idx=-1): Parameters ---------- - X : ndarray + design_matrix : ndarray Design matrix. disp : float @@ -757,7 +769,7 @@ def wald_test(X, disp, lfc, mu, ridge_factor, idx=-1): # Build covariance matrix estimator W = np.diag(mu / (1 + mu * disp)) - M = X.T @ W @ X + M = design_matrix.T @ W @ design_matrix H = np.linalg.inv(M + ridge_factor) S = H @ M @ H # Evaluate standard error and Wald statistic From 4be92a92447d54c35406f34fe764fc54db22dc72 Mon Sep 17 00:00:00 2001 From: Boris MUZELLEC Date: Mon, 9 Jan 2023 14:04:01 +0100 Subject: [PATCH 25/25] refactor: change function/variable names to more meanignful ones --- notebooks/PyDESeq2_minimal_example.ipynb | 6 +++--- notebooks/PyDESeq2_step_by_step_pipeline.ipynb | 6 +++--- pydeseq2/DeseqDataSet.py | 8 ++++---- pydeseq2/utils.py | 18 ++++++++++-------- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/notebooks/PyDESeq2_minimal_example.ipynb b/notebooks/PyDESeq2_minimal_example.ipynb index fbb201d4..4079415c 100644 --- a/notebooks/PyDESeq2_minimal_example.ipynb +++ b/notebooks/PyDESeq2_minimal_example.ipynb @@ -36,7 +36,7 @@ "\n", "from pydeseq2.DeseqDataSet import DeseqDataSet\n", "from pydeseq2.DeseqStats import DeseqStats\n", - "from pydeseq2.utils import load_data" + "from pydeseq2.utils import load_example_data" ] }, { @@ -89,7 +89,7 @@ }, "outputs": [], "source": [ - "counts_df = load_data(\n", + "counts_df = load_example_data(\n", " modality=\"raw_counts\",\n", " cancer_type=CANCER,\n", " debug=False,\n", @@ -103,7 +103,7 @@ "metadata": {}, "outputs": [], "source": [ - "clinical_df = load_data(\n", + "clinical_df = load_example_data(\n", " modality=\"clinical\",\n", " cancer_type=CANCER,\n", " debug=False,\n", diff --git a/notebooks/PyDESeq2_step_by_step_pipeline.ipynb b/notebooks/PyDESeq2_step_by_step_pipeline.ipynb index b338ab8b..b9ea014a 100644 --- a/notebooks/PyDESeq2_step_by_step_pipeline.ipynb +++ b/notebooks/PyDESeq2_step_by_step_pipeline.ipynb @@ -36,7 +36,7 @@ "\n", "from pydeseq2.DeseqDataSet import DeseqDataSet\n", "from pydeseq2.DeseqStats import DeseqStats\n", - "from pydeseq2.utils import load_data" + "from pydeseq2.utils import load_example_data" ] }, { @@ -87,7 +87,7 @@ "metadata": {}, "outputs": [], "source": [ - "counts_df = load_data(\n", + "counts_df = load_example_data(\n", " modality=\"raw_counts\",\n", " cancer_type=CANCER,\n", " debug=False,\n", @@ -101,7 +101,7 @@ "metadata": {}, "outputs": [], "source": [ - "clinical_df = load_data(\n", + "clinical_df = load_example_data(\n", " modality=\"clinical\",\n", " cancer_type=CANCER,\n", " debug=False,\n", diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py index 5e2cfed8..db643193 100644 --- a/pydeseq2/DeseqDataSet.py +++ b/pydeseq2/DeseqDataSet.py @@ -184,20 +184,20 @@ def __init__( # Import clinical data and convert design_column to string self.clinical = clinical.loc[self.counts.index] - # Convert design factor to list if a single string was provided. + # Convert design_factors to list if a single string was provided. self.design_factors = ( [design_factors] if isinstance(design_factors, str) else design_factors ) if self.clinical[self.design_factors].isna().any().any(): - raise ValueError("NaNs are not allowed in the design factor.") + raise ValueError("NaNs are not allowed in the design factors.") self.clinical[self.design_factors] = self.clinical[self.design_factors].astype( str ) # Build the design matrix (splits the dataset in cohorts) self.design_matrix = build_design_matrix( - self.clinical, - design_factors, + clinical_df=self.clinical, + design_factors=self.design_factors, ref=reference_level, expanded=False, intercept=True, diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py index cdcd65aa..4e9d61e6 100644 --- a/pydeseq2/utils.py +++ b/pydeseq2/utils.py @@ -17,7 +17,7 @@ from pydeseq2.grid_search import grid_fit_shrink_beta -def load_data( +def load_example_data( modality="raw_counts", cancer_type="synthetic", debug=False, @@ -138,7 +138,7 @@ def test_valid_counts(counts_df): def build_design_matrix( - clinical_df, design="high_grade", ref=None, expanded=False, intercept=True + clinical_df, design_factors="high_grade", ref=None, expanded=False, intercept=True ): """Build design_matrix matrix for DEA. @@ -152,7 +152,7 @@ def build_design_matrix( DataFrame containing clinical information. Must be indexed by sample barcodes, and contain a "high_grade" column. - design : str or list[str] + design_factors : str or list[str] Name of the columns of clinical_df to be used as design_matrix variables. (default: "high_grade"). @@ -175,19 +175,21 @@ def build_design_matrix( Indexed by sample barcodes. """ - if isinstance(design, str): # if there is a single factor, convert to singleton list - design = [design] + if isinstance( + design_factors, str + ): # if there is a single factor, convert to singleton list + design_factors = [design_factors] - design_matrix = pd.get_dummies(clinical_df[design]) + design_matrix = pd.get_dummies(clinical_df[design_factors]) - for factor in design: + for factor in design_factors: # Check that each factor has exactly 2 levels if len(np.unique(clinical_df[factor])) != 2: raise ValueError( f"Factors should take exactly two values, but {factor} " f"takes values {np.unique(clinical_df[factor])}." ) - factor = design[-1] + factor = design_factors[-1] if ref is None: ref = "_".join([factor, np.sort(np.unique(clinical_df[factor]).astype(str))[0]]) ref_level = design_matrix.pop(ref)