From b990ca22274ac2b783de3845edc3adda2506ec3e Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 2 Jan 2023 10:25:19 +0100
Subject: [PATCH 01/25] feat: allow a list of factors in build_design_matrix

---
 pydeseq2/utils.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 98013d7b..863a0fc6 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -151,9 +151,10 @@ def build_design_matrix(
         DataFrame containing clinical information.
         Must be indexed by sample barcodes, and contain a "high_grade" column.
 
-    design : str
+    design : str TODO : or list
         Name of the column of clinical_df to be used as a design_matrix variable.
         (default: "high_grade").
+        TODO : last is the "reference"
 
     ref : str
         The factor to use as a reference. Must be one of the values taken by the design.
@@ -176,6 +177,8 @@ def build_design_matrix(
 
     design_matrix = pd.get_dummies(clinical_df[design])
 
+    # TODO : check that each factor takes only two values
+
     if design_matrix.shape[-1] == 1:
         raise ValueError(
             f"The design factor takes only one value: "
@@ -200,8 +203,12 @@ def build_design_matrix(
             )
             raise e
         design_matrix.insert(1, ref, ref_level)
-    if not expanded:  # drop last factor
-        design_matrix.drop(columns=design_matrix.columns[-1], axis=1, inplace=True)
+    if not expanded:  # drop duplicate factors
+        # TODO : remove commented line
+        # design_matrix.drop(columns=design_matrix.columns[-1], axis=1, inplace=True)
+        design_matrix.drop(
+            columns=[col for col in clinical_df.columns[::-2]], axis=1, inplace=True
+        )  # Drops every other column, starting from the last
     if intercept:
         design_matrix.insert(0, "intercept", 1.0)
     return design_matrix

From 3d889bf58026238743f79bac95dbbaf8dd0ae0fc Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 2 Jan 2023 15:12:09 +0100
Subject: [PATCH 02/25] feat: improve multifactor design function and limit
 number of IRLS iterations

---
 pydeseq2/utils.py | 66 ++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 863a0fc6..becec9b0 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -144,6 +144,7 @@ def build_design_matrix(
 
     Only single factor, 2-level designs are currently supported.
     Unless specified, the reference factor is chosen alphabetically.
+    NOTE: For now, each factor should have exactly two levels.
 
     Parameters
     ----------
@@ -151,10 +152,9 @@ def build_design_matrix(
         DataFrame containing clinical information.
         Must be indexed by sample barcodes, and contain a "high_grade" column.
 
-    design : str TODO : or list
-        Name of the column of clinical_df to be used as a design_matrix variable.
+    design : str or list
+        Name of the columns of clinical_df to be used as design_matrix variables.
         (default: "high_grade").
-        TODO : last is the "reference"
 
     ref : str
         The factor to use as a reference. Must be one of the values taken by the design.
@@ -175,39 +175,36 @@ def build_design_matrix(
         Indexed by sample barcodes.
     """
 
-    design_matrix = pd.get_dummies(clinical_df[design])
+    if isinstance(design, str):  # if there is a single factor, convert to singleton list
+        design = [design]
 
-    # TODO : check that each factor takes only two values
+    design_matrix = pd.get_dummies(clinical_df[design])
 
-    if design_matrix.shape[-1] == 1:
-        raise ValueError(
-            f"The design factor takes only one value: "
-            f"{design_matrix.columns.values.tolist()}, but two are necessary."
-        )
-    elif design_matrix.shape[-1] > 2:
-        raise ValueError(
-            f"The design factor takes more than two values: "
-            f"{design_matrix.columns.values.tolist()}, but should have exactly two."
-        )
-    # Sort columns alphabetically
-    design_matrix = design_matrix.reindex(
-        sorted(design_matrix.columns, reverse=True), axis=1
-    )
-    if ref is not None:  # Put reference level last
+    for factor in design:
+        # Check that each factor has exactly 2 levels
+        if len(np.unique(clinical_df[factor])) != 2:
+            print(
+                f"Factors should take exactly two values, but {factor} "
+                f"takes values {np.unique(clinical_df[factor])}."
+            )
+    factor = design[-1]
+    if ref is None:
+        ref = "_".join([factor, np.sort(np.unique(clinical_df[factor]).astype(str))[0]])
+        ref_level = design_matrix.pop(ref)
+    else:  # Put reference level last
+        ref = "_".join([factor, ref])
         try:
-            ref_level = design_matrix.pop(ref)
+            ref_level = design_matrix.pop(f"{factor}_{ref}")
         except KeyError as e:
             print(
                 "The reference level must correspond to"
                 " one of the design factor values."
             )
             raise e
-        design_matrix.insert(1, ref, ref_level)
+    design_matrix.insert(design_matrix.shape[-1], ref, ref_level)
     if not expanded:  # drop duplicate factors
-        # TODO : remove commented line
-        # design_matrix.drop(columns=design_matrix.columns[-1], axis=1, inplace=True)
         design_matrix.drop(
-            columns=[col for col in clinical_df.columns[::-2]], axis=1, inplace=True
+            columns=[col for col in design_matrix.columns[::-2]], axis=1, inplace=True
         )  # Drops every other column, starting from the last
     if intercept:
         design_matrix.insert(0, "intercept", 1.0)
@@ -318,11 +315,11 @@ def irls_solver(
     min_beta=-30,
     max_beta=30,
     optimizer="L-BFGS-B",
+    maxiter=100,
 ):
     r"""Fit a NB GLM wit log-link to predict counts from the design matrix.
 
     See equations (1-2) in the DESeq2 paper.
-    Uses the L-BFGS-B, more stable than Fisher scoring.
 
     Parameters
     ----------
@@ -355,7 +352,11 @@ def irls_solver(
         Optimizing method to use in case IRLS starts diverging.
         Accepted values: 'BFGS' or 'L-BFGS-B'.
         NB: only 'L-BFGS-B' ensures that LFCS will
-        lay in the [min_beta, max_beta] range. (default: 'BFGS').
+        lay in the [min_beta, max_beta] range. (default: 'L-BFGS-B').
+
+    maxiter : int
+        Maximum number of IRLS iterations to perform before switching to L-BFGS-B.
+        (default: 100)
 
     Returns
     -------
@@ -398,13 +399,16 @@ def irls_solver(
     mu = np.maximum(size_factors * np.exp(X @ beta), min_mu)
 
     converged = True
+    i = 0
     while dev_ratio > beta_tol:
 
         W = mu / (1.0 + mu * disp)
         z = np.log(mu / size_factors) + (counts - mu) / mu
         H = (X.T * W) @ X + D
         beta_hat = solve(H, X.T @ (W * z), assume_a="pos")
-        if sum(np.abs(beta_hat) > max_beta) > 0:
+        i += 1
+
+        if sum(np.abs(beta_hat) > max_beta) > 0 or i >= maxiter:
             # If IRLS starts diverging, use L-BFGS-B
             def f(beta):
                 # closure to minimize
@@ -424,16 +428,14 @@ def df(beta):
                 beta_init,
                 jac=df,
                 method=optimizer,
-                bounds=[(min_beta, max_beta), (min_beta, max_beta)]
-                if optimizer == "L-BFGS-B"
-                else None,
+                bounds=[(min_beta, max_beta)] * p if optimizer == "L-BFGS-B" else None,
             )
 
             beta = res.x
             mu = np.maximum(size_factors * np.exp(X @ beta), min_mu)
             converged = res.success
 
-            if not res.success:
+            if not res.success and p <= 2:
                 beta = grid_fit_beta(
                     counts,
                     size_factors,

From ad43332c08aa43ad41306d38a98abcc9d847625e Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 2 Jan 2023 15:20:35 +0100
Subject: [PATCH 03/25] docs(DeseqDataSet): update to reflect that the design
 may take several factors

---
 pydeseq2/DeseqDataSet.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index 17bf63fc..21b5a268 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -43,8 +43,8 @@ class DeseqDataSet:
         DataFrame containing clinical information.
         Must be indexed by sample barcodes.
 
-    design_factor : str
-        Name of the column of clinical to be used as a design variable.
+    design_factor : str or list[str]
+        Name of the columns of clinical to be used as design variables.
         (default: 'high_grade').
 
     reference_level : str
@@ -182,7 +182,7 @@ def __init__(
                 "The count matrix and clinical data should have the same index."
             )
         self.design_factor = design_factor
-        if self.clinical[self.design_factor].isna().any():
+        if self.clinical[self.design_factor].isna().any().any():
             raise ValueError("NaNs are not allowed in the design factor.")
         self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str)
 

From 7f26c275bd07b4fd09f3a104824ad2f21acc4a8e Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 2 Jan 2023 15:23:54 +0100
Subject: [PATCH 04/25] docs(DeseqDataSet): update to reflect that the design
 may take several factors

---
 pydeseq2/DeseqDataSet.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index 21b5a268..7184824b 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -44,7 +44,8 @@ class DeseqDataSet:
         Must be indexed by sample barcodes.
 
     design_factor : str or list[str]
-        Name of the columns of clinical to be used as design variables.
+        Name of the columns of clinical to be used as design variables. If a list,
+        the last factor will be considered the variable of interest by default.
         (default: 'high_grade').
 
     reference_level : str

From d6382b7b6375f1ca6d8574705437e0034558b8c8 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 2 Jan 2023 16:49:01 +0100
Subject: [PATCH 05/25] feat: support pairwise multifactor tests

---
 pydeseq2/DeseqStats.py | 196 +++++++++++++++++++++++++----------------
 1 file changed, 118 insertions(+), 78 deletions(-)

diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index 1281f9df..b3f0fc9f 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import statsmodels.api as sm
+from IPython.display import display
 from joblib import Parallel
 from joblib import delayed
 from joblib import parallel_backend
@@ -28,8 +29,15 @@ class DeseqStats:
     dds : DeseqDataSet
         DeseqDataSet for which dispersion and LFCs were already estimated.
 
+    contrast : list[str] or None
+        A list of three strings, in the following format:
+        ['variable_of_interest', 'tested_level', 'reference_level'].
+        Names must correspond to the clinical data passed to the DeseqDataSet.
+        (default: None)
+
     alpha : float
-        P-value and adjusted p-value significance threshold (usually 0.05). (default: 0.05).
+        P-value and adjusted p-value significance threshold (usually 0.05).
+        (default: 0.05).
 
     cooks_filter : bool
         Whether to filter p-values based on cooks outliers. (default: True).
@@ -56,6 +64,9 @@ class DeseqStats:
     base_mean : pandas.Series
         Genewise means of normalized counts.
 
+    contrast_idx : int
+        Index of the LFC column corresponding to the variable being tested.
+
     LFCs : pandas.DataFrame
         Estimated log-fold change between conditions and intercept, in natural log scale.
 
@@ -95,6 +106,7 @@ class DeseqStats:
     def __init__(
         self,
         dds,
+        contrast=None,
         alpha=0.05,
         cooks_filter=True,
         independent_filter=True,
@@ -108,6 +120,19 @@ def __init__(
         ), "Please provide a fitted DeseqDataSet by first running the `deseq2` method."
 
         self.dds = dds
+
+        # Build contrast if None
+        if contrast is not None:
+            # TODO : tests on the constrast
+            self.contrast = contrast
+        else:
+            factor = self.dds.design_factor[-1]
+            levels = np.unique(self.dds.clinical[factor]).astype(str)
+            if "_".join([factor, levels[0]]) == self.dds.design_matrix.columns[-1]:
+                self.contrast = [factor, levels[0], levels[1]]
+            else:
+                self.contrast = [factor, levels[1], levels[0]]
+
         self.alpha = alpha
         self.cooks_filter = cooks_filter
         self.independent_filter = independent_filter
@@ -121,14 +146,9 @@ def __init__(
         self.joblib_verbosity = joblib_verbosity
 
     def summary(self):
-        """Run the statistical analysis and return the results in a DataFrame.
-
-        The results are also stored in the `results_df` attribute.
+        """Run the statistical analysis.
 
-        Returns
-        -------
-        pandas.DataFrame
-            Estimated log fold changes, p-values and adjusted p-values for each gene.
+        The results are stored in the `results_df` attribute.
         """
 
         if not hasattr(self, "p_values"):
@@ -151,15 +171,99 @@ def summary(self):
         # Store the results in a DataFrame, in log2 scale for LFCs.
         self.results_df = pd.DataFrame(index=self.dds.dispersions.index)
         self.results_df["baseMean"] = self.base_mean
-        self.results_df["log2FoldChange"] = self.LFCs[
-            self.dds.design_matrix.columns[-1]
+        self.results_df["log2FoldChange"] = self.LFCs.iloc[
+            :, self.contrast_idx
         ] / np.log(2)
         self.results_df["lfcSE"] = self.SE / np.log(2)
         self.results_df["stat"] = self.statistics
         self.results_df["pvalue"] = self.p_values
         self.results_df["padj"] = self.padj
 
-        return self.results_df
+        print(
+            f"Log2 fold change & Wald test p-value: "
+            f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}"
+        )
+        display(self.results_df)
+
+    def run_wald_test(self):
+        """Perform a Wald test.
+
+        Get gene-wise p-values for gene over/under-expression.`
+        """
+
+        n = len(self.LFCs)
+        p = self.dds.design_matrix.shape[1]
+
+        # Raise a warning if LFCs are shrunk.
+        if self.shrunk_LFCs:
+            print(
+                "Note: running Wald test on shrunk LFCs. "
+                "Some sequencing datasets show better performance with the testing "
+                "separated from the use of the LFC prior."
+            )
+
+        # Get the LFC column corresponding to the design variable.
+        if self.contrast is None:
+            self.contrast_idx = self.LFCs.columns.get_loc(
+                self.dds.design_matrix.columns[-1]
+            )
+        else:
+            # If the design matrix of dds is unexpanded, only one of the two levels
+            # will have a corresponding column
+            alternative_level = "_".join([self.contrast[0], self.contrast[1]])
+            try:
+                self.contrast_idx = self.LFCs.columns.get_loc(alternative_level)
+            except KeyError:
+                alternative_level = "_".join([self.contrast[0], self.contrast[2]])
+                self.contrast_idx = self.LFCs.columns.get_loc(alternative_level)
+                self.LFCs.iloc[:, self.contrast_idx] *= -1
+                pass
+
+        # Compute means according to the LFC model.
+        mu = (
+            np.exp(self.dds.design_matrix @ self.LFCs.T)
+            .multiply(self.dds.size_factors, 0)
+            .values
+        )
+
+        # Set regularization factors.
+        if self.prior_disp_var is not None:
+            D = np.diag(1 / self.prior_disp_var**2)
+        else:
+            D = np.diag(np.repeat(1e-6, p))
+
+        X = self.dds.design_matrix.values
+        disps = self.dds.dispersions.values
+        LFCs = self.LFCs.values
+
+        print("Running Wald tests...")
+        start = time.time()
+        with parallel_backend("loky", inner_max_num_threads=1):
+            res = Parallel(
+                n_jobs=self.n_processes,
+                verbose=self.joblib_verbosity,
+                batch_size=self.batch_size,
+            )(
+                delayed(wald_test)(X, disps[i], LFCs[i], mu[:, i], D, self.contrast_idx)
+                for i in range(n)
+            )
+        end = time.time()
+        print(f"... done in {end-start:.2f} seconds.\n")
+
+        pvals, stats, se = zip(*res)
+
+        self.p_values = pd.Series(pvals, index=self.LFCs.index)
+        self.statistics = pd.Series(stats, index=self.LFCs.index)
+        self.SE = pd.Series(se, index=self.LFCs.index)
+
+        # Account for possible all_zeroes due to outlier refitting in DESeqDataSet
+        if self.dds.refit_cooks and self.dds.replaced.sum() > 0:
+
+            self.SE.loc[self.dds.new_all_zeroes[self.dds.new_all_zeroes].index] = 0
+            self.statistics.loc[
+                self.dds.new_all_zeroes[self.dds.new_all_zeroes].index
+            ] = 0
+            self.p_values.loc[self.dds.new_all_zeroes[self.dds.new_all_zeroes].index] = 1
 
     def lfc_shrink(self):
         """LFC shrinkage with an apeGLM prior [2]_.
@@ -232,72 +336,6 @@ def lfc_shrink(self):
             self.results_df["lfcSE"] = self.SE / np.log(2)
             return self.results_df
 
-    def run_wald_test(self):
-        """Perform a Wald test.
-
-        Get gene-wise p-values for gene over/under-expression.`
-        """
-
-        n = len(self.LFCs)
-        p = self.dds.design_matrix.shape[1]
-
-        # Raise a warning if LFCs are shrunk.
-        if self.shrunk_LFCs:
-            print(
-                "Note: running Wald test on shrunk LFCs. "
-                "Some sequencing datasets show better performance with the testing "
-                "separated from the use of the LFC prior."
-            )
-
-        # Get the LFC column corresponding to the design variable.
-        idx = self.LFCs.columns.get_loc(self.dds.design_matrix.columns[-1])
-
-        # Compute means according to the LFC model.
-        mu = (
-            np.exp(self.dds.design_matrix @ self.LFCs.T)
-            .multiply(self.dds.size_factors, 0)
-            .values
-        )
-
-        # Set regularization factors.
-        if self.prior_disp_var is not None:
-            D = np.diag(1 / self.prior_disp_var**2)
-        else:
-            D = np.diag(np.repeat(1e-6, p))
-
-        X = self.dds.design_matrix.values
-        disps = self.dds.dispersions.values
-        LFCs = self.LFCs.values
-
-        print("Running Wald tests...")
-        start = time.time()
-        with parallel_backend("loky", inner_max_num_threads=1):
-            res = Parallel(
-                n_jobs=self.n_processes,
-                verbose=self.joblib_verbosity,
-                batch_size=self.batch_size,
-            )(
-                delayed(wald_test)(X, disps[i], LFCs[i], mu[:, i], D, idx)
-                for i in range(n)
-            )
-        end = time.time()
-        print(f"... done in {end-start:.2f} seconds.\n")
-
-        pvals, stats, se = zip(*res)
-
-        self.p_values = pd.Series(pvals, index=self.LFCs.index)
-        self.statistics = pd.Series(stats, index=self.LFCs.index)
-        self.SE = pd.Series(se, index=self.LFCs.index)
-
-        # Account for possible all_zeroes due to outlier refitting in DESeqDataSet
-        if self.dds.refit_cooks and self.dds.replaced.sum() > 0:
-
-            self.SE.loc[self.dds.new_all_zeroes[self.dds.new_all_zeroes].index] = 0
-            self.statistics.loc[
-                self.dds.new_all_zeroes[self.dds.new_all_zeroes].index
-            ] = 0
-            self.p_values.loc[self.dds.new_all_zeroes[self.dds.new_all_zeroes].index] = 1
-
     def _independent_filtering(self):
         """Compute adjusted p-values using independent filtering.
 
@@ -368,7 +406,7 @@ def _cooks_filtering(self):
             self.run_wald_test()
 
         m, p = self.dds.design_matrix.shape
-        if p == 2:
+        if p == 2:  # TODO : adapt to multifactor designs
             # If for a gene there are 3 samples or more that have more counts than the
             # maximum cooks sample, don't count this gene as an outlier.
             # Do this only if there are 2 cohorts.
@@ -426,6 +464,8 @@ def _fit_prior_var(self, min_var=1e-6, max_var=400):
         float
             Estimated prior variance.
         """
+
+        # TODO : adapt to multifactor
         keep = ~self.LFCs.iloc[:, 1].isna()
         S = self.LFCs[keep].iloc[:, 1] ** 2
         D = self.SE[keep] ** 2

From 6b2ebecf9c4e2669981eed38ba03f6e9f52aa88c Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 2 Jan 2023 17:24:38 +0100
Subject: [PATCH 06/25] fix(DeseqStats): handle multi-factor designs in
 _cooks_filtering

---
 pydeseq2/DeseqStats.py | 57 ++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 27 deletions(-)

diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index b3f0fc9f..19f5ab6b 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -406,45 +406,48 @@ def _cooks_filtering(self):
             self.run_wald_test()
 
         m, p = self.dds.design_matrix.shape
-        if p == 2:  # TODO : adapt to multifactor designs
-            # If for a gene there are 3 samples or more that have more counts than the
-            # maximum cooks sample, don't count this gene as an outlier.
-            # Do this only if there are 2 cohorts.
-            cooks_cutoff = f.ppf(0.99, p, m - p)
+        cooks_cutoff = f.ppf(0.99, p, m - p)
 
+        # If for a gene there are 3 samples or more that have more counts than the
+        # maximum cooks sample, don't count this gene as an outlier.
+        # Do this only if there are 2 cohorts.
+        if p == 2:
             # Check whether cohorts have enough samples to allow refitting
             # Only consider conditions with 3 or more samples (same as in R)
             n_or_more = (
-                self.dds.design_matrix[self.dds.design_matrix.columns[-1]].value_counts()
-                >= 3
+                self.dds.design_matrix.iloc[:, self.contrast_idx].value_counts() >= 3
             )
             use_for_max = pd.Series(
-                n_or_more[self.dds.design_matrix[self.dds.design_matrix.columns[-1]]]
+                n_or_more[self.dds.design_matrix.iloc[:, self.contrast_idx]]
             )
             use_for_max.index = self.dds.design_matrix.index
 
-            # Take into account whether we already replaced outliers
-            if self.dds.refit_cooks and self.dds.replaced.sum() > 0:
-                cooks_outlier = (
-                    self.dds.replace_cooks.loc[:, use_for_max] > cooks_cutoff
-                ).any(axis=1)
+        else:
+            use_for_max = pd.Series(True, index=self.dds.design_matrix.index)
 
-            else:
-                cooks_outlier = (self.dds.cooks.loc[:, use_for_max] > cooks_cutoff).any(
-                    axis=1
-                )
+        # Take into account whether we already replaced outliers
+        if self.dds.refit_cooks and self.dds.replaced.sum() > 0:
+            cooks_outlier = (
+                self.dds.replace_cooks.loc[:, use_for_max] > cooks_cutoff
+            ).any(axis=1)
 
-            pos = self.dds.cooks[cooks_outlier].to_numpy().argmax(1)
-            cooks_outlier.update(
-                (
-                    self.dds.counts.loc[:, cooks_outlier]
-                    > self.dds.counts.loc[:, cooks_outlier].to_numpy()[
-                        pos, np.arange(len(pos))
-                    ]
-                ).sum(0)
-                < 3
+        else:
+            cooks_outlier = (self.dds.cooks.loc[:, use_for_max] > cooks_cutoff).any(
+                axis=1
             )
-            self.p_values[cooks_outlier] = np.nan
+
+        pos = self.dds.cooks[cooks_outlier].to_numpy().argmax(1)
+        cooks_outlier.update(
+            (
+                self.dds.counts.loc[:, cooks_outlier]
+                > self.dds.counts.loc[:, cooks_outlier].to_numpy()[
+                    pos, np.arange(len(pos))
+                ]
+            ).sum(0)
+            < 3
+        )
+
+        self.p_values[cooks_outlier] = np.nan
 
     def _fit_prior_var(self, min_var=1e-6, max_var=400):
         """Estimate the prior variance of the apeGLM model.

From 442bac30df56513e9c05ba1d60d493cea4e32f15 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Tue, 3 Jan 2023 12:17:15 +0100
Subject: [PATCH 07/25] feat: add multifactor support for apeglm lfc shrinkage

---
 pydeseq2/DeseqStats.py | 37 +++++++++++++++++++++++++--------
 pydeseq2/utils.py      | 47 ++++++++++++++++++++++++++++--------------
 2 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index 19f5ab6b..53cdbb26 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -217,6 +217,7 @@ def run_wald_test(self):
                 alternative_level = "_".join([self.contrast[0], self.contrast[2]])
                 self.contrast_idx = self.LFCs.columns.get_loc(alternative_level)
                 self.LFCs.iloc[:, self.contrast_idx] *= -1
+                self._change_lfc_sign = True
                 pass
 
         # Compute means according to the LFC model.
@@ -269,6 +270,7 @@ def lfc_shrink(self):
         """LFC shrinkage with an apeGLM prior [2]_.
 
         Shrinks LFCs using a heavy-tailed Cauchy prior, leaving p-values unchanged.
+        TODO : for now, shrinks the LFCs of the variable given by self.contrast
 
         Returns
         -------
@@ -306,6 +308,8 @@ def lfc_shrink(self):
                     offset,
                     prior_no_shrink_scale,
                     prior_scale,
+                    "L-BFGS-B",
+                    self.contrast_idx,
                 )
                 for i in range(m)
             )
@@ -314,27 +318,43 @@ def lfc_shrink(self):
 
         lfcs, inv_hessians, l_bfgs_b_converged_ = zip(*res)
 
-        self.LFCs.iloc[:, 1] = pd.Series(
-            np.array(lfcs)[:, 1], index=self.dds.dispersions[nonzero].index
+        self.LFCs.iloc[:, self.contrast_idx] = pd.Series(
+            np.array(lfcs)[:, self.contrast_idx],
+            index=self.dds.dispersions[nonzero].index,
         )
         self.SE = pd.Series(
-            np.array([np.sqrt(np.abs(inv_hess[1, 1])) for inv_hess in inv_hessians]),
+            np.array(
+                [
+                    np.sqrt(np.abs(inv_hess[self.contrast_idx, self.contrast_idx]))
+                    for inv_hess in inv_hessians
+                ]
+            ),
             index=self.dds.dispersions[nonzero].index,
         )
         self._lcf_shrink_converged = pd.Series(
             np.array(l_bfgs_b_converged_), index=self.dds.dispersions[nonzero].index
         )
 
+        # Change sign to comply with contrast, if needed
+        if self._change_lfc_sign:
+            self.LFCs.iloc[:, self.contrast_idx] *= -1
+
         # Set a flag to indicate that LFCs were shrunk
         self.shrunk_LFCs = True
 
         # Replace in results dataframe, if it exists
         if hasattr(self, "results_df"):
-            self.results_df["log2FoldChange"] = self.LFCs[
-                self.dds.design_matrix.columns[-1]
+            self.results_df["log2FoldChange"] = self.LFCs.iloc[
+                :, self.contrast_idx
             ] / np.log(2)
             self.results_df["lfcSE"] = self.SE / np.log(2)
-            return self.results_df
+
+            print(
+                f"Shrunk Log2 fold change & Wald test p-value: "
+                f"{self.contrast[0]} {self.contrast[1]} vs {self.contrast[2]}"
+            )
+
+            display(self.results_df)
 
     def _independent_filtering(self):
         """Compute adjusted p-values using independent filtering.
@@ -468,9 +488,8 @@ def _fit_prior_var(self, min_var=1e-6, max_var=400):
             Estimated prior variance.
         """
 
-        # TODO : adapt to multifactor
-        keep = ~self.LFCs.iloc[:, 1].isna()
-        S = self.LFCs[keep].iloc[:, 1] ** 2
+        keep = ~self.LFCs.iloc[:, self.contrast_idx].isna()
+        S = self.LFCs[keep].iloc[:, self.contrast_idx] ** 2
         D = self.SE[keep] ** 2
 
         def objective(a):
diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index becec9b0..611c4403 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -950,18 +950,25 @@ def nbinomGLM(
     converged: bool
         Whether L-BFGS-B converged.
     """
-    beta_init = np.array([0.1, -0.1])
-    no_shrink_index = 1 - shrink_index
+
+    p = design_matrix.shape[-1]
+
+    shrink_mask = np.zeros(p)
+    shrink_mask[shrink_index] = 1
+    no_shrink_mask = np.ones(p) - shrink_mask
+
+    beta_init = np.ones(p) * 0.1 * (-1) ** (np.arange(p))
 
     # Set optimization scale
     scale_cnst = nbinomFn(
-        np.array([0, 0]),
+        np.zeros(p),
         design_matrix,
         counts,
         size,
         offset,
         prior_no_shrink_scale,
         prior_scale,
+        shrink_index,
     )
     scale_cnst = np.maximum(scale_cnst, 1)
 
@@ -976,6 +983,7 @@ def f(beta, cnst=scale_cnst):
                 offset,
                 prior_no_shrink_scale,
                 prior_scale,
+                shrink_index,
             )
             / cnst
         )
@@ -983,11 +991,9 @@ def f(beta, cnst=scale_cnst):
     def df(beta, cnst=scale_cnst):
         # Gradient of the function to optimize
         xbeta = design_matrix @ beta
-        d_neg_prior = np.array(
-            [
-                beta[no_shrink_index] / prior_no_shrink_scale**2,
-                2 * beta[shrink_index] / (prior_scale**2 + beta[shrink_index] ** 2),
-            ]
+        d_neg_prior = (
+            beta * no_shrink_mask / prior_no_shrink_scale**2
+            + 2 * beta * shrink_mask / (prior_scale**2 + beta[shrink_index] ** 2),
         )
 
         d_nll = (
@@ -998,18 +1004,21 @@ def df(beta, cnst=scale_cnst):
 
     def ddf(beta, cnst=scale_cnst):
         # Hessian of the function to optimize
+        # Note: will only work if there is a single shrink index
         xbeta = design_matrix @ beta
         exp_xbeta_off = np.exp(xbeta + offset)
         frac = (counts + size) * size * exp_xbeta_off / (size + exp_xbeta_off) ** 2
+        # Build diagonal
         h11 = 1 / prior_no_shrink_scale**2
         h22 = (
             2
             * (prior_scale**2 - beta[shrink_index] ** 2)
             / (prior_scale**2 + beta[shrink_index] ** 2) ** 2
         )
-        return (
-            1 / cnst * ((design_matrix.T * frac) @ design_matrix + np.diag([h11, h22]))
-        )
+
+        h = np.diag(no_shrink_mask * h11 + shrink_mask * h22)
+
+        return 1 / cnst * ((design_matrix.T * frac) @ design_matrix + np.diag(h))
 
     res = minimize(
         f,
@@ -1022,8 +1031,9 @@ def ddf(beta, cnst=scale_cnst):
     beta = res.x
     converged = res.success
 
-    if not converged:
+    if not converged and p == 2:
         # If the solver failed, fit using grid search (slow)
+        # Only for single-factor analysis
         beta = grid_fit_shrink_beta(
             counts,
             offset,
@@ -1088,11 +1098,16 @@ def nbinomFn(
         Sum of the NB negative likelihood and apeGLM prior.
     """
 
-    no_shrink_index = 1 - shrink_index
+    p = design_matrix.shape[-1]
+
+    shrink_mask = np.zeros(p)
+    shrink_mask[shrink_index] = 1
+    no_shrink_mask = np.ones(p) - shrink_mask
+
     xbeta = design_matrix @ beta
-    prior = beta[no_shrink_index] ** 2 / (2 * prior_no_shrink_scale**2) + np.log1p(
-        (beta[shrink_index] / prior_scale) ** 2
-    )
+    prior = (
+        (beta * no_shrink_mask) ** 2 / (2 * prior_no_shrink_scale**2)
+    ).sum() + np.log1p((beta[shrink_index] / prior_scale) ** 2)
 
     nll = (
         counts * xbeta - (counts + size) * np.logaddexp(xbeta + offset, np.log(size))

From 3f688cec8bdce6e0a039ff48f3a7df0bf1e4afe1 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Wed, 4 Jan 2023 14:46:06 +0100
Subject: [PATCH 08/25] fix: convert design_factor to list if a string is
 provided

---
 pydeseq2/DeseqDataSet.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index 7184824b..27f2d5f2 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -182,7 +182,9 @@ def __init__(
             raise ValueError(
                 "The count matrix and clinical data should have the same index."
             )
-        self.design_factor = design_factor
+        self.design_factor = (
+            [design_factor] if isinstance(design_factor, str) else design_factor
+        )
         if self.clinical[self.design_factor].isna().any().any():
             raise ValueError("NaNs are not allowed in the design factor.")
         self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str)

From 5c952f39f0bc822359dd08993b9a5fe92b631c07 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 5 Jan 2023 11:36:44 +0100
Subject: [PATCH 09/25] fix: correct mu initialisation in multi-factor case

---
 pydeseq2/DeseqDataSet.py | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index 27f2d5f2..c6ec59b0 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -251,6 +251,8 @@ def fit_genewise_dispersions(self):
         Fits a negative binomial per gene, independently.
         """
 
+        p = self.design_matrix.shape[-1]
+
         # Check that size factors are available. If not, compute them.
         if not hasattr(self, "size_factors"):
             self.fit_size_factors()
@@ -269,22 +271,43 @@ def fit_genewise_dispersions(self):
         rough_disps = self._rough_dispersions.loc[self.non_zero_genes].values
         size_factors = self.size_factors.values
 
-        with parallel_backend("loky", inner_max_num_threads=1):
-            mu_hat_ = np.array(
-                Parallel(
+        if len(self.design_matrix.value_counts()) == p:
+            with parallel_backend("loky", inner_max_num_threads=1):
+                mu_hat_ = np.array(
+                    Parallel(
+                        n_jobs=self.n_processes,
+                        verbose=self.joblib_verbosity,
+                        batch_size=self.batch_size,
+                    )(
+                        delayed(fit_lin_mu)(
+                            counts_nonzero[:, i],
+                            size_factors,
+                            X,
+                            self.min_mu,
+                        )
+                        for i in range(m)
+                    )
+                )
+        else:
+            with parallel_backend("loky", inner_max_num_threads=1):
+                res = Parallel(
                     n_jobs=self.n_processes,
                     verbose=self.joblib_verbosity,
                     batch_size=self.batch_size,
                 )(
-                    delayed(fit_lin_mu)(
+                    delayed(irls_solver)(
                         counts_nonzero[:, i],
                         size_factors,
                         X,
+                        rough_disps[i],
                         self.min_mu,
+                        self.beta_tol,
                     )
                     for i in range(m)
                 )
-            )
+
+                _, mu_hat_, _, _ = zip(*res)
+                mu_hat_ = np.array(mu_hat_)
 
         self._mu_hat = pd.DataFrame(
             mu_hat_.T,

From e31af6f503a0e90578df8bbc367e87b9916efcb3 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 5 Jan 2023 11:40:41 +0100
Subject: [PATCH 10/25] fix: fix design matrix building in multifactor case

---
 pydeseq2/utils.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 611c4403..d3855a0c 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -73,7 +73,7 @@ def load_data(
     # Load data
     if cancer_type == "synthetic":
         datasets_path = Path(pydeseq2.__file__).parent.parent / "tests"
-        path_to_data = datasets_path / "data"
+        path_to_data = datasets_path / "data/single_factor/"
 
         if modality == "raw_counts":
             df = pd.read_csv(
@@ -192,7 +192,6 @@ def build_design_matrix(
         ref = "_".join([factor, np.sort(np.unique(clinical_df[factor]).astype(str))[0]])
         ref_level = design_matrix.pop(ref)
     else:  # Put reference level last
-        ref = "_".join([factor, ref])
         try:
             ref_level = design_matrix.pop(f"{factor}_{ref}")
         except KeyError as e:
@@ -315,7 +314,7 @@ def irls_solver(
     min_beta=-30,
     max_beta=30,
     optimizer="L-BFGS-B",
-    maxiter=100,
+    maxiter=250,
 ):
     r"""Fit a NB GLM wit log-link to predict counts from the design matrix.
 
@@ -684,6 +683,7 @@ def trimmed_variance(x, trim=0.125, axis=1):
     return 1.51 * trimmed_mean(sqerror, trim=trim, axis=axis)
 
 
+# TODO : bug here in multi-factor?
 def fit_lin_mu(y, size_factors, X, min_mu=0.5):
     """Estimate mean of negative binomial model using a linear regression.
 
@@ -760,10 +760,8 @@ def wald_test(X, disp, lfc, mu, D, idx=-1):
     H = np.linalg.inv(M + D)
     S = H @ M @ H
     # Evaluate standard error and Wald statistic
-    c = np.zeros(lfc.shape)
-    c[idx] = 1
-    wald_se = np.sqrt(c.T @ S @ c)
-    wald_statistic = c.T @ lfc / wald_se
+    wald_se = np.sqrt(S[idx, idx])
+    wald_statistic = lfc[idx] / wald_se
     wald_p_value = 2 * norm.sf(np.abs(wald_statistic))
     return wald_p_value, wald_statistic, wald_se
 

From e850e5e5a98f07de90b2368138f85c54fad3f43f Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 5 Jan 2023 14:32:26 +0100
Subject: [PATCH 11/25] feat: add multifactor for shrinkage

---
 pydeseq2/DeseqStats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index 53cdbb26..cd017b4c 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -144,6 +144,7 @@ def __init__(
         self.n_processes = get_num_processes(n_cpus)
         self.batch_size = batch_size
         self.joblib_verbosity = joblib_verbosity
+        self._change_lfc_sign = False
 
     def summary(self):
         """Run the statistical analysis.
@@ -270,7 +271,6 @@ def lfc_shrink(self):
         """LFC shrinkage with an apeGLM prior [2]_.
 
         Shrinks LFCs using a heavy-tailed Cauchy prior, leaving p-values unchanged.
-        TODO : for now, shrinks the LFCs of the variable given by self.contrast
 
         Returns
         -------

From 416c817458a367c6581eb812e3af220ea8eff759 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 5 Jan 2023 14:42:21 +0100
Subject: [PATCH 12/25] docs: fix docstrings

---
 pydeseq2/DeseqStats.py | 2 +-
 pydeseq2/utils.py      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index cd017b4c..f43431aa 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -33,7 +33,7 @@ class DeseqStats:
         A list of three strings, in the following format:
         ['variable_of_interest', 'tested_level', 'reference_level'].
         Names must correspond to the clinical data passed to the DeseqDataSet.
-        (default: None)
+        (default: None).
 
     alpha : float
         P-value and adjusted p-value significance threshold (usually 0.05).
diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index d3855a0c..ed56c46e 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -183,7 +183,7 @@ def build_design_matrix(
     for factor in design:
         # Check that each factor has exactly 2 levels
         if len(np.unique(clinical_df[factor])) != 2:
-            print(
+            raise ValueError(
                 f"Factors should take exactly two values, but {factor} "
                 f"takes values {np.unique(clinical_df[factor])}."
             )
@@ -355,7 +355,7 @@ def irls_solver(
 
     maxiter : int
         Maximum number of IRLS iterations to perform before switching to L-BFGS-B.
-        (default: 100)
+        (default: 250).
 
     Returns
     -------

From 0c95c24bed8147090a847e8f97823eaec49b80c1 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 5 Jan 2023 14:43:55 +0100
Subject: [PATCH 13/25] CI: adapt tests to multi-factor

---
 .../data/multi_factor/r_test_dispersions.csv  |  11 ++
 .../multi_factor/r_test_lfc_shrink_res.csv    |  11 ++
 tests/data/multi_factor/r_test_res.csv        |  11 ++
 .../data/multi_factor/r_test_size_factors.csv | 101 +++++++++++++++
 tests/data/multi_factor/test_clinical.csv     | 101 +++++++++++++++
 tests/data/multi_factor/test_counts.csv       |  11 ++
 .../r_test_dispersions.csv                    |   0
 .../r_test_lfc_shrink_res.csv                 |   0
 tests/data/{ => single_factor}/r_test_res.csv |   0
 .../r_test_size_factors.csv                   |   0
 .../{ => single_factor}/test_clinical.csv     |   0
 .../data/{ => single_factor}/test_counts.csv  |   0
 tests/test_edge_cases.py                      |   7 +-
 tests/test_pydeseq2.py                        | 122 ++++++++++++++++--
 14 files changed, 359 insertions(+), 16 deletions(-)
 create mode 100644 tests/data/multi_factor/r_test_dispersions.csv
 create mode 100644 tests/data/multi_factor/r_test_lfc_shrink_res.csv
 create mode 100644 tests/data/multi_factor/r_test_res.csv
 create mode 100644 tests/data/multi_factor/r_test_size_factors.csv
 create mode 100644 tests/data/multi_factor/test_clinical.csv
 create mode 100644 tests/data/multi_factor/test_counts.csv
 rename tests/data/{ => single_factor}/r_test_dispersions.csv (100%)
 rename tests/data/{ => single_factor}/r_test_lfc_shrink_res.csv (100%)
 rename tests/data/{ => single_factor}/r_test_res.csv (100%)
 rename tests/data/{ => single_factor}/r_test_size_factors.csv (100%)
 rename tests/data/{ => single_factor}/test_clinical.csv (100%)
 rename tests/data/{ => single_factor}/test_counts.csv (100%)

diff --git a/tests/data/multi_factor/r_test_dispersions.csv b/tests/data/multi_factor/r_test_dispersions.csv
new file mode 100644
index 00000000..605f3f73
--- /dev/null
+++ b/tests/data/multi_factor/r_test_dispersions.csv
@@ -0,0 +1,11 @@
+"","x"
+"gene1",0.571719582382478
+"gene2",0.241375722473924
+"gene3",0.913605156809054
+"gene4",0.127658580621073
+"gene5",0.211606088717701
+"gene6",0.837078542249143
+"gene7",0.274382560982857
+"gene8",0.186512153143256
+"gene9",0.170002121905015
+"gene10",0.342711171818431
diff --git a/tests/data/multi_factor/r_test_lfc_shrink_res.csv b/tests/data/multi_factor/r_test_lfc_shrink_res.csv
new file mode 100644
index 00000000..bcd28eda
--- /dev/null
+++ b/tests/data/multi_factor/r_test_lfc_shrink_res.csv
@@ -0,0 +1,11 @@
+"","baseMean","log2FoldChange","lfcSE","pvalue","padj"
+"gene1",9.44232157508205,0.389505312989051,0.233070832511343,0.0331949459795556,0.0663898919591112
+"gene2",26.1212486463654,0.516704007917007,0.15435772629484,0.000225787635678376,0.000752625452261255
+"gene3",4.62191544640228,0.401164044699898,0.299289163191974,0.0514994764604093,0.0801582060957779
+"gene4",82.9432080307415,-0.750790557729475,0.110197174492455,1.70031075718514e-12,1.70031075718514e-11
+"gene5",33.4270160403599,0.455162916421329,0.142621472299145,0.000479064424940753,0.00119766106235188
+"gene6",4.62829876327365,0.168302927105627,0.240775492001819,0.334331479132399,0.371479421258221
+"gene7",29.9443838771296,-0.0763398443562887,0.14657708559393,0.558527389642911,0.558527389642911
+"gene8",41.9394265054559,-0.675597885085072,0.13474015798614,9.57931201208725e-08,4.78965600604363e-07
+"gene9",40.5509417857322,-0.217683996486262,0.122943395355584,0.0561107442670445,0.0801582060957779
+"gene10",12.6444689173846,0.224250777853187,0.174611123633433,0.133787774158417,0.167234717698021
diff --git a/tests/data/multi_factor/r_test_res.csv b/tests/data/multi_factor/r_test_res.csv
new file mode 100644
index 00000000..ca867cba
--- /dev/null
+++ b/tests/data/multi_factor/r_test_res.csv
@@ -0,0 +1,11 @@
+"","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj"
+"gene1",9.44232157508205,0.508071523102971,0.238562875490962,2.12971746780534,0.0331949459795556,0.0663898919591112
+"gene2",26.1212486463654,0.565039762155875,0.153199302487514,3.68826589273752,0.000225787635678376,0.000752625452261255
+"gene3",4.62191544640228,0.602314731765745,0.30930854668438,1.947294176712,0.0514994764604093,0.0801582060957779
+"gene4",82.9432080307415,-0.770740313328172,0.109215102865638,-7.05708544976949,1.70031075718514e-12,1.70031075718514e-11
+"gene5",33.4270160403599,0.49815287339304,0.142647406538608,3.49219719783839,0.000479064424940753,0.00119766106235188
+"gene6",4.62829876327365,0.28690257968296,0.297177180644456,0.965426009698275,0.334331479132399,0.371479421258221
+"gene7",29.9443838771296,-0.0937974387199766,0.160329216758082,-0.585030231024618,0.558527389642911,0.558527389642911
+"gene8",41.9394265054559,-0.711205449520001,0.133321144309024,-5.33452854163556,9.57931201208725e-08,4.78965600604363e-07
+"gene9",40.5509417857322,-0.243736404799899,0.127599022943467,-1.91017453878065,0.0561107442670445,0.0801582060957779
+"gene10",12.6444689173846,0.2817116030926,0.187891530240353,1.49933103813797,0.133787774158417,0.167234717698021
diff --git a/tests/data/multi_factor/r_test_size_factors.csv b/tests/data/multi_factor/r_test_size_factors.csv
new file mode 100644
index 00000000..8d60910e
--- /dev/null
+++ b/tests/data/multi_factor/r_test_size_factors.csv
@@ -0,0 +1,101 @@
+"","x"
+"sample1",0.94383204955008
+"sample2",1.04976550254112
+"sample3",0.911638462733079
+"sample4",1.29876776473799
+"sample5",0.679595312414821
+"sample6",0.764054516302446
+"sample7",0.801706248933874
+"sample8",1.1696677607208
+"sample9",1.00580106237394
+"sample10",0.999052126721534
+"sample11",0.988776432861989
+"sample12",1.25484456354867
+"sample13",0.857756357861922
+"sample14",0.842698187394378
+"sample15",0.779778507147202
+"sample16",1.28385972471753
+"sample17",0.994514686617904
+"sample18",1.79349056643856
+"sample19",1.32455815041249
+"sample20",0.752607424834414
+"sample21",0.731992662070059
+"sample22",1.87146841715328
+"sample23",1.29876776473799
+"sample24",0.974723133934002
+"sample25",0.788330562401193
+"sample26",1.48157916357968
+"sample27",0.779778507147202
+"sample28",1.22327156234668
+"sample29",1.19608774985009
+"sample30",1.07726160809632
+"sample31",0.89914691404938
+"sample32",1.09168991000608
+"sample33",0.801706248933874
+"sample34",0.605037341533548
+"sample35",1.40360131286496
+"sample36",1.19608774985009
+"sample37",1.16855396610962
+"sample38",1.28385972471753
+"sample39",0.828762238848253
+"sample40",0.898887666238171
+"sample41",1.33200681233305
+"sample42",1.11453631236031
+"sample43",1.22327156234668
+"sample44",0.94383204955008
+"sample45",0.89914691404938
+"sample46",0.994514686617904
+"sample47",0.95143343738075
+"sample48",1.19886255206584
+"sample49",0.801136830886645
+"sample50",1.70788656585253
+"sample51",0.697135868638152
+"sample52",1.02214009457951
+"sample53",1.07866519948581
+"sample54",1.46414662196525
+"sample55",0.764054516302446
+"sample56",2.32053426877511
+"sample57",1.30482299983646
+"sample58",0.89914691404938
+"sample59",0.799241701377227
+"sample60",1.4985781900823
+"sample61",0.853943282926263
+"sample62",1.22327156234668
+"sample63",0.911638462733079
+"sample64",1.39867297741015
+"sample65",1.12360958279771
+"sample66",1.09895733939369
+"sample67",0.857756357861922
+"sample68",1.1417201248569
+"sample69",0.896745283219282
+"sample70",0.974723133934002
+"sample71",1.21349834942153
+"sample72",1.11453631236031
+"sample73",0.733962937408007
+"sample74",0.71826060700182
+"sample75",1.04570380295723
+"sample76",0.731992662070059
+"sample77",0.605037341533548
+"sample78",0.815514374897786
+"sample79",1.65821256229216
+"sample80",0.773511422925036
+"sample81",1.46094382467857
+"sample82",0.988776432861989
+"sample83",0.801706248933874
+"sample84",0.699336488705073
+"sample85",0.94383204955008
+"sample86",0.71826060700182
+"sample87",0.799241701377227
+"sample88",1.04976550254112
+"sample89",0.924249624884157
+"sample90",1.28970135698058
+"sample91",0.911638462733079
+"sample92",0.975990216093412
+"sample93",0.857756357861922
+"sample94",0.699336488705073
+"sample95",0.801136830886645
+"sample96",1.61799779922871
+"sample97",1.20865668607816
+"sample98",0.974723133934002
+"sample99",0.557708694910521
+"sample100",1.25844273273344
diff --git a/tests/data/multi_factor/test_clinical.csv b/tests/data/multi_factor/test_clinical.csv
new file mode 100644
index 00000000..71a24451
--- /dev/null
+++ b/tests/data/multi_factor/test_clinical.csv
@@ -0,0 +1,101 @@
+"","condition","group"
+"sample1","A","X"
+"sample2","A","Y"
+"sample3","A","X"
+"sample4","A","Y"
+"sample5","A","X"
+"sample6","A","Y"
+"sample7","A","X"
+"sample8","A","Y"
+"sample9","A","X"
+"sample10","A","Y"
+"sample11","A","X"
+"sample12","A","Y"
+"sample13","A","X"
+"sample14","A","Y"
+"sample15","A","X"
+"sample16","A","Y"
+"sample17","A","X"
+"sample18","A","Y"
+"sample19","A","X"
+"sample20","A","Y"
+"sample21","A","X"
+"sample22","A","Y"
+"sample23","A","X"
+"sample24","A","Y"
+"sample25","A","X"
+"sample26","A","Y"
+"sample27","A","X"
+"sample28","A","Y"
+"sample29","A","X"
+"sample30","A","Y"
+"sample31","A","X"
+"sample32","A","Y"
+"sample33","A","X"
+"sample34","A","Y"
+"sample35","A","X"
+"sample36","A","Y"
+"sample37","A","X"
+"sample38","A","Y"
+"sample39","A","X"
+"sample40","A","Y"
+"sample41","A","X"
+"sample42","A","Y"
+"sample43","A","X"
+"sample44","A","Y"
+"sample45","A","X"
+"sample46","A","Y"
+"sample47","A","X"
+"sample48","A","Y"
+"sample49","A","X"
+"sample50","A","Y"
+"sample51","B","X"
+"sample52","B","Y"
+"sample53","B","X"
+"sample54","B","Y"
+"sample55","B","X"
+"sample56","B","Y"
+"sample57","B","X"
+"sample58","B","Y"
+"sample59","B","X"
+"sample60","B","Y"
+"sample61","B","X"
+"sample62","B","Y"
+"sample63","B","X"
+"sample64","B","Y"
+"sample65","B","X"
+"sample66","B","Y"
+"sample67","B","X"
+"sample68","B","Y"
+"sample69","B","X"
+"sample70","B","Y"
+"sample71","B","X"
+"sample72","B","Y"
+"sample73","B","X"
+"sample74","B","Y"
+"sample75","B","X"
+"sample76","B","Y"
+"sample77","B","X"
+"sample78","B","Y"
+"sample79","B","X"
+"sample80","B","Y"
+"sample81","B","X"
+"sample82","B","Y"
+"sample83","B","X"
+"sample84","B","Y"
+"sample85","B","X"
+"sample86","B","Y"
+"sample87","B","X"
+"sample88","B","Y"
+"sample89","B","X"
+"sample90","B","Y"
+"sample91","B","X"
+"sample92","B","Y"
+"sample93","B","X"
+"sample94","B","Y"
+"sample95","B","X"
+"sample96","B","Y"
+"sample97","B","X"
+"sample98","B","Y"
+"sample99","B","X"
+"sample100","B","Y"
diff --git a/tests/data/multi_factor/test_counts.csv b/tests/data/multi_factor/test_counts.csv
new file mode 100644
index 00000000..538ecd7c
--- /dev/null
+++ b/tests/data/multi_factor/test_counts.csv
@@ -0,0 +1,11 @@
+"","sample1","sample2","sample3","sample4","sample5","sample6","sample7","sample8","sample9","sample10","sample11","sample12","sample13","sample14","sample15","sample16","sample17","sample18","sample19","sample20","sample21","sample22","sample23","sample24","sample25","sample26","sample27","sample28","sample29","sample30","sample31","sample32","sample33","sample34","sample35","sample36","sample37","sample38","sample39","sample40","sample41","sample42","sample43","sample44","sample45","sample46","sample47","sample48","sample49","sample50","sample51","sample52","sample53","sample54","sample55","sample56","sample57","sample58","sample59","sample60","sample61","sample62","sample63","sample64","sample65","sample66","sample67","sample68","sample69","sample70","sample71","sample72","sample73","sample74","sample75","sample76","sample77","sample78","sample79","sample80","sample81","sample82","sample83","sample84","sample85","sample86","sample87","sample88","sample89","sample90","sample91","sample92","sample93","sample94","sample95","sample96","sample97","sample98","sample99","sample100"
+"gene1",12,1,0,3,1,12,0,22,15,8,5,9,18,10,29,2,2,7,0,6,5,12,11,12,4,17,11,2,1,15,13,8,2,4,17,6,9,4,0,18,5,4,7,1,8,3,6,5,9,14,14,13,5,9,2,14,24,11,4,7,6,4,2,3,11,5,12,11,21,5,6,32,15,7,15,23,7,33,24,17,4,5,16,7,9,33,4,5,9,11,9,7,5,10,2,8,6,6,3,2
+"gene2",21,47,9,16,10,17,13,30,37,9,22,16,11,41,13,32,10,21,17,10,14,26,12,10,23,37,26,17,37,15,12,14,8,20,16,21,26,9,18,20,62,16,41,21,27,20,39,28,21,38,36,33,24,39,17,59,37,39,26,29,19,47,31,57,25,18,2,15,26,26,27,11,13,68,33,58,42,30,46,20,12,22,7,6,21,21,25,40,48,38,24,33,17,51,31,36,32,10,42,28
+"gene3",4,2,4,1,11,4,5,1,8,5,7,0,10,10,7,8,3,0,6,1,0,1,3,5,3,2,2,4,1,0,0,4,3,2,2,1,12,4,10,3,1,7,0,1,0,3,3,1,3,0,31,0,1,11,2,7,2,2,10,8,0,8,11,2,4,12,4,11,10,1,4,10,1,0,1,4,9,1,18,3,2,1,7,6,3,12,0,3,2,7,0,2,0,0,5,13,12,2,0,5
+"gene4",130,43,119,42,78,80,97,55,150,45,177,104,70,62,141,87,205,122,96,51,134,57,172,57,160,50,71,52,197,73,265,35,120,41,103,51,219,87,199,60,108,57,135,44,173,67,179,95,141,98,57,22,80,50,84,41,125,33,80,38,108,32,61,30,64,50,45,15,98,8,94,54,65,26,58,29,41,50,110,44,99,40,71,29,112,47,97,48,32,22,119,50,104,38,52,33,77,62,25,43
+"gene5",18,14,34,50,18,16,23,16,20,72,23,36,15,71,16,26,4,74,38,18,21,58,10,27,30,60,21,43,14,50,22,25,23,15,17,30,22,16,39,30,29,42,21,11,23,62,23,12,13,44,20,57,40,21,26,72,13,57,12,79,43,40,20,13,47,65,57,63,10,44,52,67,20,33,30,21,23,58,76,91,44,33,23,58,23,36,21,64,8,37,21,28,27,23,38,48,14,73,16,51
+"gene6",0,10,9,1,0,2,2,5,0,0,7,16,7,0,6,12,1,18,2,12,7,0,9,2,0,2,0,0,0,2,6,3,2,8,3,3,11,0,4,1,1,6,2,2,0,9,6,1,8,2,6,0,6,3,1,4,4,4,4,2,12,1,5,2,10,0,0,4,4,4,3,17,9,5,1,2,5,4,4,10,1,2,7,1,4,4,3,1,6,1,8,8,3,8,2,6,18,9,5,6
+"gene7",16,70,20,46,10,18,9,30,15,24,47,51,22,9,20,26,8,46,26,41,39,48,18,25,15,38,20,18,17,29,25,28,39,83,36,61,44,61,14,32,58,21,27,46,46,18,47,22,12,52,33,11,15,17,16,27,27,9,48,53,15,12,27,53,19,69,22,42,23,25,25,21,55,22,27,11,48,19,24,14,22,13,46,13,25,38,23,11,71,41,17,25,22,42,12,45,31,25,6,37
+"gene8",54,38,33,75,51,58,21,88,62,103,8,41,55,11,55,17,36,72,53,87,16,69,96,95,26,35,14,104,69,102,31,49,14,121,70,83,63,60,30,56,31,51,52,62,28,36,18,62,29,124,10,37,25,53,20,84,46,32,26,55,9,21,33,41,38,15,28,41,35,33,41,55,12,26,44,28,13,45,29,28,41,25,20,54,23,26,19,38,38,36,33,56,15,16,29,44,19,29,14,26
+"gene9",49,65,32,37,25,25,48,38,37,116,36,86,30,31,24,80,46,43,53,69,24,61,66,56,29,57,12,45,44,38,35,99,74,9,60,44,28,52,19,18,49,41,45,94,24,38,35,42,59,35,14,35,41,57,25,50,48,66,26,35,18,45,12,56,55,80,53,42,12,64,52,41,27,10,19,22,18,30,61,12,72,46,23,43,50,24,23,10,34,26,18,69,27,25,40,61,58,52,25,44
+"gene10",3,10,13,13,9,24,19,25,7,10,14,9,19,2,11,15,10,15,7,6,10,21,13,11,6,1,21,21,2,7,9,48,7,4,3,20,10,11,18,1,6,17,6,6,9,15,7,12,2,26,2,38,3,20,23,38,29,9,8,15,29,15,14,14,18,11,9,14,1,7,12,20,21,7,8,8,4,8,19,29,16,10,15,7,3,4,8,16,4,30,31,6,12,7,8,12,17,20,10,15
diff --git a/tests/data/r_test_dispersions.csv b/tests/data/single_factor/r_test_dispersions.csv
similarity index 100%
rename from tests/data/r_test_dispersions.csv
rename to tests/data/single_factor/r_test_dispersions.csv
diff --git a/tests/data/r_test_lfc_shrink_res.csv b/tests/data/single_factor/r_test_lfc_shrink_res.csv
similarity index 100%
rename from tests/data/r_test_lfc_shrink_res.csv
rename to tests/data/single_factor/r_test_lfc_shrink_res.csv
diff --git a/tests/data/r_test_res.csv b/tests/data/single_factor/r_test_res.csv
similarity index 100%
rename from tests/data/r_test_res.csv
rename to tests/data/single_factor/r_test_res.csv
diff --git a/tests/data/r_test_size_factors.csv b/tests/data/single_factor/r_test_size_factors.csv
similarity index 100%
rename from tests/data/r_test_size_factors.csv
rename to tests/data/single_factor/r_test_size_factors.csv
diff --git a/tests/data/test_clinical.csv b/tests/data/single_factor/test_clinical.csv
similarity index 100%
rename from tests/data/test_clinical.csv
rename to tests/data/single_factor/test_clinical.csv
diff --git a/tests/data/test_counts.csv b/tests/data/single_factor/test_counts.csv
similarity index 100%
rename from tests/data/test_counts.csv
rename to tests/data/single_factor/test_counts.csv
diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py
index febbb294..4e85b9f3 100644
--- a/tests/test_edge_cases.py
+++ b/tests/test_edge_cases.py
@@ -18,10 +18,10 @@ def test_zero_genes():
     test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve())
 
     counts_df = pd.read_csv(
-        os.path.join(test_path, "data/test_counts.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/test_counts.csv"), index_col=0
     ).T
     clinical_df = pd.read_csv(
-        os.path.join(test_path, "data/test_clinical.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/test_clinical.csv"), index_col=0
     )
 
     n, m = counts_df.shape
@@ -40,7 +40,8 @@ def test_zero_genes():
     assert dds.LFCs.loc[zero_genes].isna().all().all()
 
     res = DeseqStats(dds)
-    res_df = res.summary()
+    res.summary()
+    res_df = res.results_df
 
     # check that the corresponding stats are NaN
     assert (res_df.loc[zero_genes].baseMean == 0).all()
diff --git a/tests/test_pydeseq2.py b/tests/test_pydeseq2.py
index 77d0b67b..08a161e1 100644
--- a/tests/test_pydeseq2.py
+++ b/tests/test_pydeseq2.py
@@ -8,6 +8,8 @@
 from pydeseq2.DeseqDataSet import DeseqDataSet
 from pydeseq2.DeseqStats import DeseqStats
 
+# Single-factor tests
+
 
 def test_deseq(tol=0.02):
     """Test that the outputs of the DESeq2 function match those of the original R
@@ -17,19 +19,22 @@ def test_deseq(tol=0.02):
     test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve())
 
     counts_df = pd.read_csv(
-        os.path.join(test_path, "data/test_counts.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/test_counts.csv"), index_col=0
     ).T
     clinical_df = pd.read_csv(
-        os.path.join(test_path, "data/test_clinical.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/test_clinical.csv"), index_col=0
     )
 
-    r_res = pd.read_csv(os.path.join(test_path, "data/r_test_res.csv"), index_col=0)
+    r_res = pd.read_csv(
+        os.path.join(test_path, "data/single_factor/r_test_res.csv"), index_col=0
+    )
 
     dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition")
     dds.deseq2()
 
     res = DeseqStats(dds)
-    res_df = res.summary()
+    res.summary()
+    res_df = res.results_df
 
     # check that the same p-values are NaN
     assert (res_df.pvalue.isna() == r_res.pvalue.isna()).all()
@@ -49,25 +54,29 @@ def test_lfc_shrinkage(tol=0.02):
     """
 
     test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve())
-    r_res = pd.read_csv(os.path.join(test_path, "data/r_test_res.csv"), index_col=0)
+    r_res = pd.read_csv(
+        os.path.join(test_path, "data/single_factor/r_test_res.csv"), index_col=0
+    )
     r_shrunk_res = pd.read_csv(
-        os.path.join(test_path, "data/r_test_lfc_shrink_res.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/r_test_lfc_shrink_res.csv"),
+        index_col=0,
     )
 
     counts_df = pd.read_csv(
-        os.path.join(test_path, "data/test_counts.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/test_counts.csv"), index_col=0
     ).T
 
     clinical_df = pd.read_csv(
-        os.path.join(test_path, "data/test_clinical.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/test_clinical.csv"), index_col=0
     )
 
     r_size_factors = pd.read_csv(
-        os.path.join(test_path, "data/r_test_size_factors.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/r_test_size_factors.csv"),
+        index_col=0,
     ).squeeze()
 
     r_dispersions = pd.read_csv(
-        os.path.join(test_path, "data/r_test_dispersions.csv"), index_col=0
+        os.path.join(test_path, "data/single_factor/r_test_dispersions.csv"), index_col=0
     ).squeeze()
 
     dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition")
@@ -79,14 +88,101 @@ def test_lfc_shrinkage(tol=0.02):
     res = DeseqStats(dds)
     res.summary()
     res.SE = r_res.lfcSE * np.log(2)
-    shrunk_res = res.lfc_shrink()
+    res.lfc_shrink()
+    shrunk_res = res.results_df
 
-    # Check that the same LFC and lfcSE are found (up to tol)
+    # Check that the same LFC are found (up to tol)
     assert (
         abs(r_shrunk_res.log2FoldChange - shrunk_res.log2FoldChange)
         / abs(r_shrunk_res.log2FoldChange)
     ).max() < tol
 
+
+# Multi-factor tests
+
+
+def test_multifactor_deseq(tol=0.02):
+    """Test that the outputs of the DESeq2 function match those of the original R
+    package, up to a tolerance in relative error.
+    """
+
+    test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve())
+
+    counts_df = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/test_counts.csv"), index_col=0
+    ).T
+    clinical_df = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/test_clinical.csv"), index_col=0
+    )
+
+    r_res = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/r_test_res.csv"), index_col=0
+    )
+
+    dds = DeseqDataSet(counts_df, clinical_df, design_factor=["group", "condition"])
+    dds.deseq2()
+
+    res = DeseqStats(dds)
+    res.summary()
+    res_df = res.results_df
+
+    # check that the same p-values are NaN
+    assert (res_df.pvalue.isna() == r_res.pvalue.isna()).all()
+    assert (res_df.padj.isna() == r_res.padj.isna()).all()
+
+    # Check that the same LFC, p-values and adjusted p-values are found (up to tol)
     assert (
-        abs(r_shrunk_res.lfcSE - shrunk_res.lfcSE) / abs(r_shrunk_res.lfcSE)
+        abs(r_res.log2FoldChange - res_df.log2FoldChange) / abs(r_res.log2FoldChange)
+    ).max() < tol
+    assert (abs(r_res.pvalue - res_df.pvalue) / r_res.pvalue).max() < tol
+    assert (abs(r_res.padj - res_df.padj) / r_res.padj).max() < tol
+
+
+def test_multifactor_lfc_shrinkage(tol=0.02):
+    """Test that the outputs of the lfc_shrink function match those of the original
+    R package (starting from the same inputs), up to a tolerance in relative error.
+    """
+
+    test_path = str(Path(os.path.realpath(tests.__file__)).parent.resolve())
+    r_res = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/r_test_res.csv"), index_col=0
+    )
+    r_shrunk_res = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/r_test_lfc_shrink_res.csv"),
+        index_col=0,
+    )
+
+    counts_df = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/test_counts.csv"), index_col=0
+    ).T
+
+    clinical_df = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/test_clinical.csv"), index_col=0
+    )
+
+    r_size_factors = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/r_test_size_factors.csv"),
+        index_col=0,
+    ).squeeze()
+
+    r_dispersions = pd.read_csv(
+        os.path.join(test_path, "data/multi_factor/r_test_dispersions.csv"), index_col=0
+    ).squeeze()
+
+    dds = DeseqDataSet(counts_df, clinical_df, design_factor=["group", "condition"])
+    dds.deseq2()
+    dds.size_factors = r_size_factors
+    dds.dispersions = r_dispersions
+    dds.LFCs.iloc[:, 1] = r_res.log2FoldChange.values * np.log(2)
+
+    res = DeseqStats(dds)
+    res.summary()
+    res.SE = r_res.lfcSE * np.log(2)
+    res.lfc_shrink()
+    shrunk_res = res.results_df
+
+    # Check that the same LFC found (up to tol)
+    assert (
+        abs(r_shrunk_res.log2FoldChange - shrunk_res.log2FoldChange)
+        / abs(r_shrunk_res.log2FoldChange)
     ).max() < tol

From 3b5f59d7824cdba8d3cb93a185174868e18edbf6 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 5 Jan 2023 15:47:53 +0100
Subject: [PATCH 14/25] docs: update docstrings and add line comments

---
 pydeseq2/DeseqDataSet.py | 11 +++++++++--
 pydeseq2/DeseqStats.py   |  2 +-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index c6ec59b0..a612299a 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -46,7 +46,7 @@ class DeseqDataSet:
     design_factor : str or list[str]
         Name of the columns of clinical to be used as design variables. If a list,
         the last factor will be considered the variable of interest by default.
-        (default: 'high_grade').
+        Only bi-level factors are supported. (default: 'high_grade').
 
     reference_level : str
         The factor to use as a reference. Must be one of the values taken by the design.
@@ -182,14 +182,17 @@ def __init__(
             raise ValueError(
                 "The count matrix and clinical data should have the same index."
             )
+
+        # Convert design factor to list if a single string was provided.
         self.design_factor = (
             [design_factor] if isinstance(design_factor, str) else design_factor
         )
+
         if self.clinical[self.design_factor].isna().any().any():
             raise ValueError("NaNs are not allowed in the design factor.")
         self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str)
 
-        # Build the design matrix (splits the dataset in two cohorts)
+        # Build the design matrix (splits the dataset in cohorts)
         self.design_matrix = build_design_matrix(
             self.clinical,
             design_factor,
@@ -271,6 +274,10 @@ def fit_genewise_dispersions(self):
         rough_disps = self._rough_dispersions.loc[self.non_zero_genes].values
         size_factors = self.size_factors.values
 
+        # mu_hat is initialized differently depending on the number of different factor
+        # groups. If there are as many different factor combinations as design factors
+        # (intercept included), it is fitted with a linear model, otherwise it is fitted
+        # with a GLM (using rough dispersion estimates).
         if len(self.design_matrix.value_counts()) == p:
             with parallel_backend("loky", inner_max_num_threads=1):
                 mu_hat_ = np.array(
diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index f43431aa..0f8d0aba 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -123,7 +123,7 @@ def __init__(
 
         # Build contrast if None
         if contrast is not None:
-            # TODO : tests on the constrast
+            # TODO : tests on the contrast
             self.contrast = contrast
         else:
             factor = self.dds.design_factor[-1]

From 1963663e8cfe6398539966de3bf5d6cfc2249e2b Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 5 Jan 2023 15:51:26 +0100
Subject: [PATCH 15/25] refactor: change design_factor to design_factors

---
 notebooks/PyDESeq2_minimal_example.ipynb      |  2 +-
 .../PyDESeq2_step_by_step_pipeline.ipynb      |  2 +-
 pydeseq2/DeseqDataSet.py                      | 18 +++++++++--------
 pydeseq2/DeseqStats.py                        |  2 +-
 tests/test_edge_cases.py                      | 20 +++++++++----------
 tests/test_pydeseq2.py                        |  8 ++++----
 6 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/notebooks/PyDESeq2_minimal_example.ipynb b/notebooks/PyDESeq2_minimal_example.ipynb
index b3852938..fbb201d4 100644
--- a/notebooks/PyDESeq2_minimal_example.ipynb
+++ b/notebooks/PyDESeq2_minimal_example.ipynb
@@ -198,7 +198,7 @@
     "dds = DeseqDataSet(\n",
     "    counts_df,\n",
     "    clinical_df,\n",
-    "    design_factor=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
+    "    design_factors=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
     "    refit_cooks=True,\n",
     "    n_cpus=8,\n",
     ")"
diff --git a/notebooks/PyDESeq2_step_by_step_pipeline.ipynb b/notebooks/PyDESeq2_step_by_step_pipeline.ipynb
index 41243d1d..b338ab8b 100644
--- a/notebooks/PyDESeq2_step_by_step_pipeline.ipynb
+++ b/notebooks/PyDESeq2_step_by_step_pipeline.ipynb
@@ -196,7 +196,7 @@
     "dds = DeseqDataSet(\n",
     "    counts_df,\n",
     "    clinical_df,\n",
-    "    design_factor=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
+    "    design_factors=\"condition\" if CANCER == \"synthetic\" else \"high_grade\",\n",
     "    refit_cooks=True,\n",
     "    n_cpus=8,\n",
     ")"
diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index a612299a..77b86fe8 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -43,7 +43,7 @@ class DeseqDataSet:
         DataFrame containing clinical information.
         Must be indexed by sample barcodes.
 
-    design_factor : str or list[str]
+    design_factors : str or list[str]
         Name of the columns of clinical to be used as design variables. If a list,
         the last factor will be considered the variable of interest by default.
         Only bi-level factors are supported. (default: 'high_grade').
@@ -156,7 +156,7 @@ def __init__(
         self,
         counts,
         clinical,
-        design_factor="high_grade",
+        design_factors="high_grade",
         reference_level=None,
         min_mu=0.5,
         min_disp=1e-8,
@@ -184,18 +184,20 @@ def __init__(
             )
 
         # Convert design factor to list if a single string was provided.
-        self.design_factor = (
-            [design_factor] if isinstance(design_factor, str) else design_factor
+        self.design_factors = (
+            [design_factors] if isinstance(design_factors, str) else design_factors
         )
 
-        if self.clinical[self.design_factor].isna().any().any():
+        if self.clinical[self.design_factors].isna().any().any():
             raise ValueError("NaNs are not allowed in the design factor.")
-        self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str)
+        self.clinical[self.design_factors] = self.clinical[self.design_factors].astype(
+            str
+        )
 
         # Build the design matrix (splits the dataset in cohorts)
         self.design_matrix = build_design_matrix(
             self.clinical,
-            design_factor,
+            design_factors,
             ref=reference_level,
             expanded=False,
             intercept=True,
@@ -704,7 +706,7 @@ def _refit_without_outliers(
         sub_dds = DeseqDataSet(
             counts=self.counts_to_refit,
             clinical=self.clinical,
-            design_factor=self.design_factor,
+            design_factors=self.design_factors,
             min_mu=self.min_mu,
             min_disp=self.min_disp,
             max_disp=self.max_disp,
diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index 0f8d0aba..f0ba54ed 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -126,7 +126,7 @@ def __init__(
             # TODO : tests on the contrast
             self.contrast = contrast
         else:
-            factor = self.dds.design_factor[-1]
+            factor = self.dds.design_factors[-1]
             levels = np.unique(self.dds.clinical[factor]).astype(str)
             if "_".join([factor, levels[0]]) == self.dds.design_matrix.columns[-1]:
                 self.contrast = [factor, levels[0], levels[1]]
diff --git a/tests/test_edge_cases.py b/tests/test_edge_cases.py
index 4e85b9f3..d06bcd60 100644
--- a/tests/test_edge_cases.py
+++ b/tests/test_edge_cases.py
@@ -32,7 +32,7 @@ def test_zero_genes():
     counts_df[zero_genes] = 0
 
     # Run analysis
-    dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+    dds = DeseqDataSet(counts_df, clinical_df, design_factors="condition")
     dds.deseq2()
 
     # check that the corresponding parameters are NaN
@@ -59,7 +59,7 @@ def test_nan_counts():
     clinical_df = pd.DataFrame({"condition": [0, 1]})
 
     with pytest.raises(ValueError):
-        DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+        DeseqDataSet(counts_df, clinical_df, design_factors="condition")
 
 
 def test_numeric_counts():
@@ -70,7 +70,7 @@ def test_numeric_counts():
     clinical_df = pd.DataFrame({"condition": [0, 1]})
 
     with pytest.raises(ValueError):
-        DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+        DeseqDataSet(counts_df, clinical_df, design_factors="condition")
 
 
 def test_integer_counts():
@@ -80,7 +80,7 @@ def test_integer_counts():
     clinical_df = pd.DataFrame({"condition": [0, 1]})
 
     with pytest.raises(ValueError):
-        DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+        DeseqDataSet(counts_df, clinical_df, design_factors="condition")
 
 
 def test_non_negative_counts():
@@ -90,7 +90,7 @@ def test_non_negative_counts():
     clinical_df = pd.DataFrame({"condition": [0, 1]})
 
     with pytest.raises(ValueError):
-        DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+        DeseqDataSet(counts_df, clinical_df, design_factors="condition")
 
 
 # Tests on the clinical data (design factors)
@@ -100,7 +100,7 @@ def test_nan_factors():
     clinical_df = pd.DataFrame({"condition": [0, np.NaN]})
 
     with pytest.raises(ValueError):
-        DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+        DeseqDataSet(counts_df, clinical_df, design_factors="condition")
 
 
 def test_one_factors():
@@ -109,7 +109,7 @@ def test_one_factors():
     clinical_df = pd.DataFrame({"condition": [0, 0]})
 
     with pytest.raises(ValueError):
-        DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+        DeseqDataSet(counts_df, clinical_df, design_factors="condition")
 
 
 def test_too_many_factors():
@@ -119,7 +119,7 @@ def test_too_many_factors():
     clinical_df = pd.DataFrame({"condition": [0, 1, 2]})
 
     with pytest.raises(ValueError):
-        DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+        DeseqDataSet(counts_df, clinical_df, design_factors="condition")
 
 
 def test_reference_level():
@@ -130,7 +130,7 @@ def test_reference_level():
 
     with pytest.raises(KeyError):
         DeseqDataSet(
-            counts_df, clinical_df, design_factor="condition", reference_level="control"
+            counts_df, clinical_df, design_factors="condition", reference_level="control"
         )
 
 
@@ -146,5 +146,5 @@ def test_indexes():
         DeseqDataSet(
             counts_df,
             clinical_df,
-            design_factor="condition",
+            design_factors="condition",
         )
diff --git a/tests/test_pydeseq2.py b/tests/test_pydeseq2.py
index 08a161e1..c6e4cdef 100644
--- a/tests/test_pydeseq2.py
+++ b/tests/test_pydeseq2.py
@@ -29,7 +29,7 @@ def test_deseq(tol=0.02):
         os.path.join(test_path, "data/single_factor/r_test_res.csv"), index_col=0
     )
 
-    dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+    dds = DeseqDataSet(counts_df, clinical_df, design_factors="condition")
     dds.deseq2()
 
     res = DeseqStats(dds)
@@ -79,7 +79,7 @@ def test_lfc_shrinkage(tol=0.02):
         os.path.join(test_path, "data/single_factor/r_test_dispersions.csv"), index_col=0
     ).squeeze()
 
-    dds = DeseqDataSet(counts_df, clinical_df, design_factor="condition")
+    dds = DeseqDataSet(counts_df, clinical_df, design_factors="condition")
     dds.deseq2()
     dds.size_factors = r_size_factors
     dds.dispersions = r_dispersions
@@ -119,7 +119,7 @@ def test_multifactor_deseq(tol=0.02):
         os.path.join(test_path, "data/multi_factor/r_test_res.csv"), index_col=0
     )
 
-    dds = DeseqDataSet(counts_df, clinical_df, design_factor=["group", "condition"])
+    dds = DeseqDataSet(counts_df, clinical_df, design_factors=["group", "condition"])
     dds.deseq2()
 
     res = DeseqStats(dds)
@@ -169,7 +169,7 @@ def test_multifactor_lfc_shrinkage(tol=0.02):
         os.path.join(test_path, "data/multi_factor/r_test_dispersions.csv"), index_col=0
     ).squeeze()
 
-    dds = DeseqDataSet(counts_df, clinical_df, design_factor=["group", "condition"])
+    dds = DeseqDataSet(counts_df, clinical_df, design_factors=["group", "condition"])
     dds.deseq2()
     dds.size_factors = r_size_factors
     dds.dispersions = r_dispersions

From ae19ca27ca0b589645ef5971028d8d8f461103b8 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Thu, 5 Jan 2023 15:52:43 +0100
Subject: [PATCH 16/25] docs: update README

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b92cb484..c7b27074 100644
--- a/README.md
+++ b/README.md
@@ -24,8 +24,9 @@ It aims to facilitate DEA experiments for python users.
 
 As PyDESeq2 is a re-implementation of [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) from scratch, you may experience some differences in terms of retrieved values or available features.
 
-Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor analysis,
-but we plan to implement more in the near future. In case there is a feature you would particularly like to be implemented, feel free to open an issue.
+Currently, available features broadly correspond to the default settings of DESeq2 (v1.34.0) for single-factor 
+and paired multi-factor analysis, but we plan to implement more in the near future.
+In case there is a feature you would particularly like to be implemented, feel free to open an issue.
 
 ## Installation
 

From a7dc5fe063839de0f4efd68cd999ec674517f4ae Mon Sep 17 00:00:00 2001
From: Boris Muzellec <BorisMuzellec@users.noreply.github.com>
Date: Mon, 9 Jan 2023 10:24:13 +0100
Subject: [PATCH 17/25] docs: update docstring (list of strings argument)

Co-authored-by: Maria Telenczuk <maria.telenczuk@owkin.com>
---
 pydeseq2/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index ed56c46e..a9f1ab27 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -152,7 +152,7 @@ def build_design_matrix(
         DataFrame containing clinical information.
         Must be indexed by sample barcodes, and contain a "high_grade" column.
 
-    design : str or list
+    design : str or list[str]
         Name of the columns of clinical_df to be used as design_matrix variables.
         (default: "high_grade").
 

From e5364518043f4edf9dccaf314ad8020bc7ba2b87 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 9 Jan 2023 10:51:46 +0100
Subject: [PATCH 18/25] refactor: change single-letter variable names to more
 informative ones

---
 pydeseq2/DeseqDataSet.py | 34 +++++++++++++--------------
 pydeseq2/DeseqStats.py   | 24 ++++++++++---------
 pydeseq2/utils.py        | 51 +++++++++++++++++++++-------------------
 3 files changed, 57 insertions(+), 52 deletions(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index 77b86fe8..4e4d0038 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -256,7 +256,7 @@ def fit_genewise_dispersions(self):
         Fits a negative binomial per gene, independently.
         """
 
-        p = self.design_matrix.shape[-1]
+        num_vars = self.design_matrix.shape[-1]
 
         # Check that size factors are available. If not, compute them.
         if not hasattr(self, "size_factors"):
@@ -269,7 +269,7 @@ def fit_genewise_dispersions(self):
         non_zero = ~(self.counts == 0).all()
         self.non_zero_genes = non_zero[non_zero].index
         counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
-        m = counts_nonzero.shape[1]
+        num_genes = counts_nonzero.shape[1]
 
         # Convert design_matrix to numpy for speed
         X = self.design_matrix.values
@@ -280,7 +280,7 @@ def fit_genewise_dispersions(self):
         # groups. If there are as many different factor combinations as design factors
         # (intercept included), it is fitted with a linear model, otherwise it is fitted
         # with a GLM (using rough dispersion estimates).
-        if len(self.design_matrix.value_counts()) == p:
+        if len(self.design_matrix.value_counts()) == num_vars:
             with parallel_backend("loky", inner_max_num_threads=1):
                 mu_hat_ = np.array(
                     Parallel(
@@ -294,7 +294,7 @@ def fit_genewise_dispersions(self):
                             X,
                             self.min_mu,
                         )
-                        for i in range(m)
+                        for i in range(num_genes)
                     )
                 )
         else:
@@ -312,7 +312,7 @@ def fit_genewise_dispersions(self):
                         self.min_mu,
                         self.beta_tol,
                     )
-                    for i in range(m)
+                    for i in range(num_genes)
                 )
 
                 _, mu_hat_, _, _ = zip(*res)
@@ -336,7 +336,7 @@ def fit_genewise_dispersions(self):
                     self.min_disp,
                     self.max_disp,
                 )
-                for i in range(m)
+                for i in range(num_genes)
             )
         end = time.time()
         print(f"... done in {end - start:.2f} seconds.\n")
@@ -431,8 +431,8 @@ def fit_dispersion_prior(self):
             self.fit_dispersion_trend()
 
         # Exclude genes with all zeroes
-        m = len(self.non_zero_genes)
-        p = self.design_matrix.shape[1]
+        num_genes = len(self.non_zero_genes)
+        num_vars = self.design_matrix.shape[1]
 
         # Fit dispersions to the curve, and compute log residuals
         disp_residuals = np.log(self.genewise_dispersions) - np.log(
@@ -448,7 +448,7 @@ def fit_dispersion_prior(self):
             disp_residuals.loc[self.non_zero_genes][above_min_disp]
         ).median() ** 2 / norm.ppf(0.75)
         self.prior_disp_var = np.maximum(
-            self._squared_logres - polygamma(1, (m - p) / 2), 0.25
+            self._squared_logres - polygamma(1, (num_genes - num_vars) / 2), 0.25
         )
 
     def fit_MAP_dispersions(self):
@@ -463,7 +463,7 @@ def fit_MAP_dispersions(self):
 
         # Exclude genes with all zeroes
         counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
-        m = counts_nonzero.shape[1]
+        num_genes = counts_nonzero.shape[1]
 
         X = self.design_matrix.values
         mu_hat = self._mu_hat.values
@@ -488,7 +488,7 @@ def fit_MAP_dispersions(self):
                     True,
                     True,
                 )
-                for i in range(m)
+                for i in range(num_genes)
             )
         end = time.time()
         print(f"... done in {end-start:.2f} seconds.\n")
@@ -524,7 +524,7 @@ def fit_LFC(self):
 
         # Exclude genes with all zeroes
         counts_nonzero = self.counts.loc[:, self.non_zero_genes].values
-        m = counts_nonzero.shape[1]
+        num_genes = counts_nonzero.shape[1]
 
         X = self.design_matrix.values
         size_factors = self.size_factors.values
@@ -546,7 +546,7 @@ def fit_LFC(self):
                     self.min_mu,
                     self.beta_tol,
                 )
-                for i in range(m)
+                for i in range(num_genes)
             )
         end = time.time()
         print(f"... done in {end-start:.2f} seconds.\n")
@@ -593,7 +593,7 @@ def calculate_cooks(self):
         if not hasattr(self, "dispersions"):
             self.fit_MAP_dispersions()
 
-        p = self.design_matrix.shape[1]
+        num_vars = self.design_matrix.shape[1]
         # Keep only non-zero genes
         normed_counts = self.counts.loc[:, self.non_zero_genes].div(self.size_factors, 0)
         dispersions = robust_method_of_moments_disp(normed_counts, self.design_matrix)
@@ -603,7 +603,7 @@ def calculate_cooks(self):
         ) ** 2 / V
         self.cooks = (
             squared_pearson_res
-            / p
+            / num_vars
             * (self._hat_diagonals / (1 - self._hat_diagonals) ** 2)
         ).T
 
@@ -644,7 +644,7 @@ def _replace_outliers(self):
         if not hasattr(self, "cooks"):
             self.calculate_cooks()
 
-        m, p = self.design_matrix.shape
+        num_samples, num_vars = self.design_matrix.shape
 
         # Check whether cohorts have enough samples to allow refitting
         n_or_more = (
@@ -657,7 +657,7 @@ def _replace_outliers(self):
         self.replaceable.index = self.design_matrix.index
 
         # Get positions of counts with cooks above threshold
-        cooks_cutoff = f.ppf(0.99, p, m - p)
+        cooks_cutoff = f.ppf(0.99, num_vars, num_samples - num_vars)
         idx = (self.cooks > cooks_cutoff).T
         self.replaced = idx.any(axis=0)
 
diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index f0ba54ed..34b5392f 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -192,8 +192,8 @@ def run_wald_test(self):
         Get gene-wise p-values for gene over/under-expression.`
         """
 
-        n = len(self.LFCs)
-        p = self.dds.design_matrix.shape[1]
+        num_genes = len(self.LFCs)
+        num_vars = self.dds.design_matrix.shape[1]
 
         # Raise a warning if LFCs are shrunk.
         if self.shrunk_LFCs:
@@ -230,9 +230,9 @@ def run_wald_test(self):
 
         # Set regularization factors.
         if self.prior_disp_var is not None:
-            D = np.diag(1 / self.prior_disp_var**2)
+            ridge_factor = np.diag(1 / self.prior_disp_var**2)
         else:
-            D = np.diag(np.repeat(1e-6, p))
+            ridge_factor = np.diag(np.repeat(1e-6, num_vars))
 
         X = self.dds.design_matrix.values
         disps = self.dds.dispersions.values
@@ -246,8 +246,10 @@ def run_wald_test(self):
                 verbose=self.joblib_verbosity,
                 batch_size=self.batch_size,
             )(
-                delayed(wald_test)(X, disps[i], LFCs[i], mu[:, i], D, self.contrast_idx)
-                for i in range(n)
+                delayed(wald_test)(
+                    X, disps[i], LFCs[i], mu[:, i], ridge_factor, self.contrast_idx
+                )
+                for i in range(num_genes)
             )
         end = time.time()
         print(f"... done in {end-start:.2f} seconds.\n")
@@ -281,7 +283,7 @@ def lfc_shrink(self):
         # Filter genes with all zero counts
         nonzero = self.dds.counts.sum(0) > 0
         counts_nonzero = self.dds.counts.loc[:, nonzero].values
-        m = counts_nonzero.shape[1]
+        num_genes = counts_nonzero.shape[1]
 
         size = (1.0 / self.dds.dispersions[nonzero]).values
         offset = np.log(self.dds.size_factors).values
@@ -311,7 +313,7 @@ def lfc_shrink(self):
                     "L-BFGS-B",
                     self.contrast_idx,
                 )
-                for i in range(m)
+                for i in range(num_genes)
             )
         end = time.time()
         print(f"... done in {end-start:.2f} seconds.\n")
@@ -425,13 +427,13 @@ def _cooks_filtering(self):
         if not hasattr(self, "p_values"):
             self.run_wald_test()
 
-        m, p = self.dds.design_matrix.shape
-        cooks_cutoff = f.ppf(0.99, p, m - p)
+        num_samples, num_vars = self.dds.design_matrix.shape
+        cooks_cutoff = f.ppf(0.99, num_vars, num_samples - num_vars)
 
         # If for a gene there are 3 samples or more that have more counts than the
         # maximum cooks sample, don't count this gene as an outlier.
         # Do this only if there are 2 cohorts.
-        if p == 2:
+        if num_vars == 2:
             # Check whether cohorts have enough samples to allow refitting
             # Only consider conditions with 3 or more samples (same as in R)
             n_or_more = (
diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index a9f1ab27..25feee73 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -372,7 +372,7 @@ def irls_solver(
 
     assert optimizer in ["BFGS", "L-BFGS-B"]
 
-    p = design_matrix.shape[1]
+    num_vars = design_matrix.shape[1]
 
     # converting to numpy for speed
     if type(counts) == pd.Series:
@@ -385,7 +385,7 @@ def irls_solver(
         X = design_matrix
 
     # if full rank, estimate initial betas for IRLS below
-    if np.linalg.matrix_rank(X) == p:
+    if np.linalg.matrix_rank(X) == num_vars:
         Q, R = np.linalg.qr(X)
         y = np.log(counts / size_factors + 0.1)
         beta_init = solve(R, Q.T @ y)
@@ -394,7 +394,7 @@ def irls_solver(
     dev = 1000
     dev_ratio = 1.0
 
-    D = np.diag(np.repeat(1e-6, p))
+    ridge_factor = np.diag(np.repeat(1e-6, num_vars))
     mu = np.maximum(size_factors * np.exp(X @ beta), min_mu)
 
     converged = True
@@ -403,7 +403,7 @@ def irls_solver(
 
         W = mu / (1.0 + mu * disp)
         z = np.log(mu / size_factors) + (counts - mu) / mu
-        H = (X.T * W) @ X + D
+        H = (X.T * W) @ X + ridge_factor
         beta_hat = solve(H, X.T @ (W * z), assume_a="pos")
         i += 1
 
@@ -412,14 +412,14 @@ def irls_solver(
             def f(beta):
                 # closure to minimize
                 mu_ = np.maximum(size_factors * np.exp(X @ beta), min_mu)
-                return nb_nll(counts, mu_, disp) + 0.5 * (D @ beta**2).sum()
+                return nb_nll(counts, mu_, disp) + 0.5 * (ridge_factor @ beta**2).sum()
 
             def df(beta):
                 mu_ = np.maximum(size_factors * np.exp(X @ beta), min_mu)
                 return (
                     -X.T @ counts
                     + ((1 / disp + counts) * mu_ / (1 / disp + mu_)) @ X
-                    + D @ beta
+                    + ridge_factor @ beta
                 )
 
             res = minimize(
@@ -427,14 +427,16 @@ def df(beta):
                 beta_init,
                 jac=df,
                 method=optimizer,
-                bounds=[(min_beta, max_beta)] * p if optimizer == "L-BFGS-B" else None,
+                bounds=[(min_beta, max_beta)] * num_vars
+                if optimizer == "L-BFGS-B"
+                else None,
             )
 
             beta = res.x
             mu = np.maximum(size_factors * np.exp(X @ beta), min_mu)
             converged = res.success
 
-            if not res.success and p <= 2:
+            if not res.success and num_vars <= 2:
                 beta = grid_fit_beta(
                     counts,
                     size_factors,
@@ -455,7 +457,7 @@ def df(beta):
     # Compute H diagonal (useful for Cook distance outlier filtering)
     W = mu / (1.0 + mu * disp)
     W_sq = np.sqrt(W)
-    XtWX = (X.T * W) @ X + D
+    XtWX = (X.T * W) @ X + ridge_factor
     H = W_sq * np.diag(X @ np.linalg.inv(XtWX) @ X.T) * W_sq
     # Return an UNthresholded mu (as in the R code)
     # Previous quantities are estimated with a threshold though
@@ -683,7 +685,6 @@ def trimmed_variance(x, trim=0.125, axis=1):
     return 1.51 * trimmed_mean(sqerror, trim=trim, axis=axis)
 
 
-# TODO : bug here in multi-factor?
 def fit_lin_mu(y, size_factors, X, min_mu=0.5):
     """Estimate mean of negative binomial model using a linear regression.
 
@@ -716,7 +717,7 @@ def fit_lin_mu(y, size_factors, X, min_mu=0.5):
     return np.maximum(mu_hat, min_mu)
 
 
-def wald_test(X, disp, lfc, mu, D, idx=-1):
+def wald_test(X, disp, lfc, mu, ridge_factor, idx=-1):
     """Run Wald test for differential expression.
 
     Computes Wald statistics, standard error and p-values from
@@ -736,7 +737,7 @@ def wald_test(X, disp, lfc, mu, D, idx=-1):
     mu : float
         Mean estimation for the NB model.
 
-    D : ndarray
+    ridge_factor : ndarray
         Regularization factors.
 
     idx : int
@@ -757,7 +758,7 @@ def wald_test(X, disp, lfc, mu, D, idx=-1):
     # Build covariance matrix estimator
     W = np.diag(mu / (1 + mu * disp))
     M = X.T @ W @ X
-    H = np.linalg.inv(M + D)
+    H = np.linalg.inv(M + ridge_factor)
     S = H @ M @ H
     # Evaluate standard error and Wald statistic
     wald_se = np.sqrt(S[idx, idx])
@@ -789,7 +790,7 @@ def fit_rough_dispersions(counts, size_factors, design_matrix):
         Estimated dispersion parameter for each gene.
     """
 
-    m, p = design_matrix.shape
+    num_samples, num_vars = design_matrix.shape
     normed_counts = counts.div(size_factors, 0)
     # Exclude genes with all zeroes
     normed_counts = normed_counts.loc[:, ~(normed_counts == 0).all()]
@@ -797,7 +798,9 @@ def fit_rough_dispersions(counts, size_factors, design_matrix):
     reg.fit(design_matrix, normed_counts)
     y_hat = reg.predict(design_matrix)
     y_hat = np.maximum(y_hat, 1)
-    alpha_rde = (((normed_counts - y_hat) ** 2 - y_hat) / ((m - p) * y_hat**2)).sum(0)
+    alpha_rde = (
+        ((normed_counts - y_hat) ** 2 - y_hat) / ((num_samples - num_vars) * y_hat**2)
+    ).sum(0)
     return np.maximum(alpha_rde, 0)
 
 
@@ -949,17 +952,17 @@ def nbinomGLM(
         Whether L-BFGS-B converged.
     """
 
-    p = design_matrix.shape[-1]
+    num_vars = design_matrix.shape[-1]
 
-    shrink_mask = np.zeros(p)
+    shrink_mask = np.zeros(num_vars)
     shrink_mask[shrink_index] = 1
-    no_shrink_mask = np.ones(p) - shrink_mask
+    no_shrink_mask = np.ones(num_vars) - shrink_mask
 
-    beta_init = np.ones(p) * 0.1 * (-1) ** (np.arange(p))
+    beta_init = np.ones(num_vars) * 0.1 * (-1) ** (np.arange(num_vars))
 
     # Set optimization scale
     scale_cnst = nbinomFn(
-        np.zeros(p),
+        np.zeros(num_vars),
         design_matrix,
         counts,
         size,
@@ -1029,7 +1032,7 @@ def ddf(beta, cnst=scale_cnst):
     beta = res.x
     converged = res.success
 
-    if not converged and p == 2:
+    if not converged and num_vars == 2:
         # If the solver failed, fit using grid search (slow)
         # Only for single-factor analysis
         beta = grid_fit_shrink_beta(
@@ -1096,11 +1099,11 @@ def nbinomFn(
         Sum of the NB negative likelihood and apeGLM prior.
     """
 
-    p = design_matrix.shape[-1]
+    num_vars = design_matrix.shape[-1]
 
-    shrink_mask = np.zeros(p)
+    shrink_mask = np.zeros(num_vars)
     shrink_mask[shrink_index] = 1
-    no_shrink_mask = np.ones(p) - shrink_mask
+    no_shrink_mask = np.ones(num_vars) - shrink_mask
 
     xbeta = design_matrix @ beta
     prior = (

From cd85df801d768c18abe41198d1ba71d80793c606 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 9 Jan 2023 11:13:44 +0100
Subject: [PATCH 19/25] fix: change attribute name from design_factor to
 design_factors everywhere

---
 pydeseq2/DeseqDataSet.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index b279ba13..b7fa113f 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -188,9 +188,11 @@ def __init__(
         self.design_factors = (
             [design_factors] if isinstance(design_factors, str) else design_factors
         )
-        if self.clinical[self.design_factor].isna().any():
+        if self.clinical[self.design_factors].isna().any():
             raise ValueError("NaNs are not allowed in the design factor.")
-        self.clinical[self.design_factor] = self.clinical[self.design_factor].astype(str)
+        self.clinical[self.design_factors] = self.clinical[self.design_factors].astype(
+            str
+        )
 
         # Build the design matrix (splits the dataset in cohorts)
         self.design_matrix = build_design_matrix(

From a2b39cc305cf67c2bea518ccc5e6adcfaf023530 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 9 Jan 2023 11:19:37 +0100
Subject: [PATCH 20/25] fix: fix design factor test

---
 pydeseq2/DeseqDataSet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index b7fa113f..1d26324e 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -188,7 +188,7 @@ def __init__(
         self.design_factors = (
             [design_factors] if isinstance(design_factors, str) else design_factors
         )
-        if self.clinical[self.design_factors].isna().any():
+        if self.clinical[self.design_factors].isna().any().any():
             raise ValueError("NaNs are not allowed in the design factor.")
         self.clinical[self.design_factors] = self.clinical[self.design_factors].astype(
             str

From 8c2197a349d45c5a089b23981c58e0f270c3c66a Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 9 Jan 2023 11:52:32 +0100
Subject: [PATCH 21/25] docs: expand the description of the "contrast"
 attribute

---
 pydeseq2/DeseqStats.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index 34b5392f..9e07f010 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -33,6 +33,9 @@ class DeseqStats:
         A list of three strings, in the following format:
         ['variable_of_interest', 'tested_level', 'reference_level'].
         Names must correspond to the clinical data passed to the DeseqDataSet.
+        E.g., ['condition', 'B', 'A'] will measure the LFC of 'condition B' compared to
+        'condition A'. If None, the last variable from the design matrix is chosen
+        as the variable of interest, and the reference level is picked alphabetically.
         (default: None).
 
     alpha : float

From 39f4211014399175ce08cc18e24141ffd5f08936 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 9 Jan 2023 12:00:14 +0100
Subject: [PATCH 22/25] feat: add tests on the contrast variable

---
 pydeseq2/DeseqStats.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index 9e07f010..67021ed5 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -124,11 +124,16 @@ def __init__(
 
         self.dds = dds
 
-        # Build contrast if None
-        if contrast is not None:
-            # TODO : tests on the contrast
+        if contrast is not None:  # Test contrast if provided
+            assert (
+                contrast[0] in self.dds.design_factors
+            ), "The contrast variable should be one of the design factors."
+            assert (
+                contrast[1] in self.dds.clinical[contrast[0]].values
+                and contrast[2] in self.dds.clinical[contrast[0]].values
+            ), "The contrast levels should correspond to design factors levels."
             self.contrast = contrast
-        else:
+        else:  # Build contrast if None
             factor = self.dds.design_factors[-1]
             levels = np.unique(self.dds.clinical[factor]).astype(str)
             if "_".join([factor, levels[0]]) == self.dds.design_matrix.columns[-1]:

From b0e35f50c69f4c625a67896f4a9837972fc59fe9 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 9 Jan 2023 13:19:06 +0100
Subject: [PATCH 23/25] feat: add tests on the contrast variable

---
 pydeseq2/DeseqStats.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index 67021ed5..c688f171 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -125,6 +125,7 @@ def __init__(
         self.dds = dds
 
         if contrast is not None:  # Test contrast if provided
+            assert len(contrast) == 3, "The contrast should contain three strings."
             assert (
                 contrast[0] in self.dds.design_factors
             ), "The contrast variable should be one of the design factors."

From 3c0ded613d38aacac433983c9871e3f3210c37b5 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 9 Jan 2023 13:46:49 +0100
Subject: [PATCH 24/25] refactor: change variable names for more meaningful
 ones and give variable names when calling functions

---
 pydeseq2/DeseqDataSet.py | 62 +++++++++++++++++-----------------
 pydeseq2/DeseqStats.py   | 23 ++++++++-----
 pydeseq2/grid_search.py  | 41 +++++++++++++----------
 pydeseq2/utils.py        | 72 +++++++++++++++++++++++-----------------
 4 files changed, 110 insertions(+), 88 deletions(-)

diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index 1d26324e..5e2cfed8 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -289,10 +289,10 @@ def fit_genewise_dispersions(self):
                         batch_size=self.batch_size,
                     )(
                         delayed(fit_lin_mu)(
-                            counts_nonzero[:, i],
-                            size_factors,
-                            X,
-                            self.min_mu,
+                            counts=counts_nonzero[:, i],
+                            size_factors=size_factors,
+                            design_matrix=X,
+                            min_mu=self.min_mu,
                         )
                         for i in range(num_genes)
                     )
@@ -305,12 +305,12 @@ def fit_genewise_dispersions(self):
                     batch_size=self.batch_size,
                 )(
                     delayed(irls_solver)(
-                        counts_nonzero[:, i],
-                        size_factors,
-                        X,
-                        rough_disps[i],
-                        self.min_mu,
-                        self.beta_tol,
+                        counts=counts_nonzero[:, i],
+                        size_factors=size_factors,
+                        design_matrix=X,
+                        disp=rough_disps[i],
+                        min_mu=self.min_mu,
+                        beta_tol=self.beta_tol,
                     )
                     for i in range(num_genes)
                 )
@@ -329,12 +329,12 @@ def fit_genewise_dispersions(self):
         with parallel_backend("loky", inner_max_num_threads=1):
             res = Parallel(n_jobs=self.n_processes, batch_size=self.batch_size)(
                 delayed(fit_alpha_mle)(
-                    counts_nonzero[:, i],
-                    X,
-                    mu_hat_[i, :],
-                    rough_disps[i],
-                    self.min_disp,
-                    self.max_disp,
+                    counts=counts_nonzero[:, i],
+                    design_matrix=X,
+                    mu=mu_hat_[i, :],
+                    alpha_hat=rough_disps[i],
+                    min_disp=self.min_disp,
+                    max_disp=self.max_disp,
                 )
                 for i in range(num_genes)
             )
@@ -478,15 +478,15 @@ def fit_MAP_dispersions(self):
                 batch_size=self.batch_size,
             )(
                 delayed(fit_alpha_mle)(
-                    counts_nonzero[:, i],
-                    X,
-                    mu_hat[:, i],
-                    fit_disps[i],
-                    self.min_disp,
-                    self.max_disp,
-                    self.prior_disp_var,
-                    True,
-                    True,
+                    counts=counts_nonzero[:, i],
+                    design_matrix=X,
+                    mu=mu_hat[:, i],
+                    alpha_hat=fit_disps[i],
+                    min_disp=self.min_disp,
+                    max_disp=self.max_disp,
+                    prior_disp_var=self.prior_disp_var,
+                    cr_reg=True,
+                    prior_reg=True,
                 )
                 for i in range(num_genes)
             )
@@ -539,12 +539,12 @@ def fit_LFC(self):
                 batch_size=self.batch_size,
             )(
                 delayed(irls_solver)(
-                    counts_nonzero[:, i],
-                    size_factors,
-                    X,
-                    disps[i],
-                    self.min_mu,
-                    self.beta_tol,
+                    counts=counts_nonzero[:, i],
+                    size_factors=size_factors,
+                    design_matrix=X,
+                    disp=disps[i],
+                    min_mu=self.min_mu,
+                    beta_tol=self.beta_tol,
                 )
                 for i in range(num_genes)
             )
diff --git a/pydeseq2/DeseqStats.py b/pydeseq2/DeseqStats.py
index c688f171..2100beb8 100644
--- a/pydeseq2/DeseqStats.py
+++ b/pydeseq2/DeseqStats.py
@@ -256,7 +256,12 @@ def run_wald_test(self):
                 batch_size=self.batch_size,
             )(
                 delayed(wald_test)(
-                    X, disps[i], LFCs[i], mu[:, i], ridge_factor, self.contrast_idx
+                    design_matrix=X,
+                    disp=disps[i],
+                    lfc=LFCs[i],
+                    mu=mu[:, i],
+                    ridge_factor=ridge_factor,
+                    idx=self.contrast_idx,
                 )
                 for i in range(num_genes)
             )
@@ -313,14 +318,14 @@ def lfc_shrink(self):
                 batch_size=self.batch_size,
             )(
                 delayed(nbinomGLM)(
-                    X,
-                    counts_nonzero[:, i],
-                    size[i],
-                    offset,
-                    prior_no_shrink_scale,
-                    prior_scale,
-                    "L-BFGS-B",
-                    self.contrast_idx,
+                    design_matrix=X,
+                    counts=counts_nonzero[:, i],
+                    size=size[i],
+                    offset=offset,
+                    prior_no_shrink_scale=prior_no_shrink_scale,
+                    prior_scale=prior_scale,
+                    optimizer="L-BFGS-B",
+                    shrink_index=self.contrast_idx,
                 )
                 for i in range(num_genes)
             )
diff --git a/pydeseq2/grid_search.py b/pydeseq2/grid_search.py
index abb048a6..efac147f 100644
--- a/pydeseq2/grid_search.py
+++ b/pydeseq2/grid_search.py
@@ -4,14 +4,14 @@
 import pydeseq2.utils
 
 
-def vec_nb_nll(y, mu, alpha):
+def vec_nb_nll(counts, mu, alpha):
     """Return the negative log-likelihood of a negative binomial.
 
     Vectorized version.
 
     Parameters
     ----------
-    y : ndarray
+    counts : ndarray
         Observations.
 
     mu : float
@@ -24,33 +24,35 @@ def vec_nb_nll(y, mu, alpha):
     Returns
     -------
     float
-        Negative log likelihood of the observations y following
+        Negative log likelihood of the observations counts following
         :math:`NB(\\mu, \\alpha)`.
     """
 
-    n = len(y)
+    n = len(counts)
     alpha_neg1 = 1 / alpha
     logbinom = (
-        gammaln(y[:, None] + alpha_neg1) - gammaln(y + 1)[:, None] - gammaln(alpha_neg1)
+        gammaln(counts[:, None] + alpha_neg1)
+        - gammaln(counts + 1)[:, None]
+        - gammaln(alpha_neg1)
     )
 
     if len(mu.shape) == 1:
         return n * alpha_neg1 * np.log(alpha) + (
             -logbinom
-            + (y[:, None] + alpha_neg1) * np.log(mu[:, None] + alpha_neg1)
-            - (y * np.log(mu))[:, None]
+            + (counts[:, None] + alpha_neg1) * np.log(mu[:, None] + alpha_neg1)
+            - (counts * np.log(mu))[:, None]
         ).sum(0)
     else:
         return n * alpha_neg1 * np.log(alpha) + (
             -logbinom
-            + (y[:, None] + alpha_neg1) * np.log(mu + alpha_neg1)
-            - (y[:, None] * np.log(mu))
+            + (counts[:, None] + alpha_neg1) * np.log(mu + alpha_neg1)
+            - (counts[:, None] * np.log(mu))
         ).sum(0)
 
 
 def grid_fit_alpha(
-    y,
-    X,
+    counts,
+    design_matrix,
     mu,
     alpha_hat,
     min_disp,
@@ -66,10 +68,10 @@ def grid_fit_alpha(
 
     Parameters
     ----------
-    y : ndarray
+    counts : ndarray
         Raw counts for a given gene.
 
-    X : ndarray
+    design_matrix : ndarray
         Design matrix.
 
     mu : ndarray
@@ -113,11 +115,14 @@ def loss(log_alpha):
         reg = 0
         if cr_reg:
             reg += (
-                0.5 * np.linalg.slogdet((X.T[:, :, None] * W).transpose(2, 0, 1) @ X)[1]
+                0.5
+                * np.linalg.slogdet(
+                    (design_matrix.T[:, :, None] * W).transpose(2, 0, 1) @ design_matrix
+                )[1]
             )
         if prior_reg:
             reg += (np.log(alpha) - np.log(alpha_hat)) ** 2 / (2 * prior_disp_var)
-        return vec_nb_nll(y, mu, alpha) + reg
+        return vec_nb_nll(counts, mu, alpha) + reg
 
     ll_grid = loss(grid)
 
@@ -135,7 +140,7 @@ def loss(log_alpha):
 def grid_fit_beta(
     counts,
     size_factors,
-    X,
+    design_matrix,
     disp,
     min_mu=0.5,
     grid_length=60,
@@ -155,7 +160,7 @@ def grid_fit_beta(
     size_factors : pandas.Series
         DESeq2 normalization factors.
 
-    X : ndarray
+    design_matrix : ndarray
         Design matrix.
 
     disp : float
@@ -184,7 +189,7 @@ def grid_fit_beta(
 
     def loss(beta):
         # closure to minimize
-        mu = np.maximum(size_factors[:, None] * np.exp(X @ beta.T), min_mu)
+        mu = np.maximum(size_factors[:, None] * np.exp(design_matrix @ beta.T), min_mu)
         return vec_nb_nll(counts, mu, disp) + 0.5 * (1e-6 * beta**2).sum(1)
 
     for i, x in enumerate(x_grid):
diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index 25feee73..cdcd65aa 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -234,14 +234,14 @@ def dispersion_trend(normed_mean, coeffs):
         return coeffs[0] + coeffs[1] / normed_mean
 
 
-def nb_nll(y, mu, alpha):
+def nb_nll(counts, mu, alpha):
     """Negative log-likelihood of a negative binomial.
 
     Unvectorized version.
 
     Parameters
     ----------
-    y : ndarray
+    counts : ndarray
         Observations.
 
     mu : float
@@ -254,27 +254,31 @@ def nb_nll(y, mu, alpha):
     Returns
     -------
     float
-        Negative log likelihood of the observations y
+        Negative log likelihood of the observations counts
         following :math:`NB(\\mu, \\alpha)`.
     """
 
-    n = len(y)
+    n = len(counts)
     alpha_neg1 = 1 / alpha
-    logbinom = gammaln(y + alpha_neg1) - gammaln(y + 1) - gammaln(alpha_neg1)
+    logbinom = gammaln(counts + alpha_neg1) - gammaln(counts + 1) - gammaln(alpha_neg1)
     return (
         n * alpha_neg1 * np.log(alpha)
-        + (-logbinom + (y + alpha_neg1) * np.log(alpha_neg1 + mu) - y * np.log(mu)).sum()
+        + (
+            -logbinom
+            + (counts + alpha_neg1) * np.log(alpha_neg1 + mu)
+            - counts * np.log(mu)
+        ).sum()
     )
 
 
-def dnb_nll(y, mu, alpha):
+def dnb_nll(counts, mu, alpha):
     """Gradient of the negative log-likelihood of a negative binomial.
 
     Unvectorized.
 
     Parameters
     ----------
-    y : ndarray
+    counts : ndarray
         Observations.
 
     mu : float
@@ -295,9 +299,9 @@ def dnb_nll(y, mu, alpha):
         alpha_neg1**2
         * (
             polygamma(0, alpha_neg1)
-            - polygamma(0, y + alpha_neg1)
+            - polygamma(0, counts + alpha_neg1)
             + np.log(1 + mu * alpha)
-            + (y - mu) / (mu + alpha_neg1)
+            + (counts - mu) / (mu + alpha_neg1)
         ).sum()
     )
 
@@ -364,7 +368,7 @@ def irls_solver(
 
     mu: ndarray
         Means estimated from size factors and beta:
-        :math:`\\mu = s_{ij} \\exp(\\beta^t X)`.
+        :math:`\\mu = s_{ij} \\exp(\\beta^t design_matrix)`.
 
     H: ndarray
         Diagonal of the :math:`W^{1/2} X (X^t W X)^-1 X^t W^{1/2}` covariance matrix.
@@ -466,8 +470,8 @@ def df(beta):
 
 
 def fit_alpha_mle(
-    y,
-    X,
+    counts,
+    design_matrix,
     mu,
     alpha_hat,
     min_disp,
@@ -479,15 +483,15 @@ def fit_alpha_mle(
 ):
     """Estimate the dispersion parameter of a negative binomial GLM.
 
-    Note: it is possible to pass pandas Series as y, X and mu arguments but using numpy
-    arrays makes the code significantly faster.
+    Note: it is possible to pass counts, design_matrix and mu arguments in the form of
+    pandas Series, but using numpy arrays makes the code significantly faster.
 
     Parameters
     ----------
-    y : ndarray
+    counts : ndarray
         Raw counts for a given gene.
 
-    X : ndarray
+    design_matrix : ndarray
         Design matrix.
 
     mu : ndarray
@@ -537,10 +541,10 @@ def loss(log_alpha):
         W = mu / (1 + mu * alpha)
         reg = 0
         if cr_reg:
-            reg += 0.5 * np.linalg.slogdet((X.T * W) @ X)[1]
+            reg += 0.5 * np.linalg.slogdet((design_matrix.T * W) @ design_matrix)[1]
         if prior_reg:
             reg += (np.log(alpha) - np.log(alpha_hat)) ** 2 / (2 * prior_disp_var)
-        return nb_nll(y, mu, alpha) + reg
+        return nb_nll(counts, mu, alpha) + reg
 
     def dloss(log_alpha):
         # gradient closure
@@ -549,10 +553,16 @@ def dloss(log_alpha):
         dW = -(W**2)
         reg_grad = 0
         if cr_reg:
-            reg_grad += 0.5 * (np.linalg.inv((X.T * W) @ X) * ((X.T * dW) @ X)).sum()
+            reg_grad += (
+                0.5
+                * (
+                    np.linalg.inv((design_matrix.T * W) @ design_matrix)
+                    * ((design_matrix.T * dW) @ design_matrix)
+                ).sum()
+            )
         if prior_reg:
             reg_grad += (np.log(alpha) - np.log(alpha_hat)) / (alpha * prior_disp_var)
-        return dnb_nll(y, mu, alpha) + reg_grad
+        return dnb_nll(counts, mu, alpha) + reg_grad
 
     res = minimize(
         lambda x: loss(x[0]),
@@ -568,7 +578,9 @@ def dloss(log_alpha):
         return np.exp(res.x[0]), res.success
     else:
         return (
-            np.exp(grid_fit_alpha(y, X, mu, alpha_hat, min_disp, max_disp)),
+            np.exp(
+                grid_fit_alpha(counts, design_matrix, mu, alpha_hat, min_disp, max_disp)
+            ),
             res.success,
         )
 
@@ -685,20 +697,20 @@ def trimmed_variance(x, trim=0.125, axis=1):
     return 1.51 * trimmed_mean(sqerror, trim=trim, axis=axis)
 
 
-def fit_lin_mu(y, size_factors, X, min_mu=0.5):
+def fit_lin_mu(counts, size_factors, design_matrix, min_mu=0.5):
     """Estimate mean of negative binomial model using a linear regression.
 
     Used to initialize genewise dispersion models.
 
     Parameters
     ----------
-    y : ndarray
+    counts : ndarray
         Raw counts for a given gene.
 
     size_factors : ndarray
         Sample-wise scaling factors (obtained from median-of-ratios).
 
-    X : ndarray
+    design_matrix : ndarray
         Design matrix.
 
     min_mu : float
@@ -711,13 +723,13 @@ def fit_lin_mu(y, size_factors, X, min_mu=0.5):
     """
 
     reg = LinearRegression(fit_intercept=False)
-    reg.fit(X, y / size_factors)
-    mu_hat = size_factors * reg.predict(X)
+    reg.fit(design_matrix, counts / size_factors)
+    mu_hat = size_factors * reg.predict(design_matrix)
     # Threshold mu_hat as 1/mu_hat will be used later on.
     return np.maximum(mu_hat, min_mu)
 
 
-def wald_test(X, disp, lfc, mu, ridge_factor, idx=-1):
+def wald_test(design_matrix, disp, lfc, mu, ridge_factor, idx=-1):
     """Run Wald test for differential expression.
 
     Computes Wald statistics, standard error and p-values from
@@ -725,7 +737,7 @@ def wald_test(X, disp, lfc, mu, ridge_factor, idx=-1):
 
     Parameters
     ----------
-    X : ndarray
+    design_matrix : ndarray
         Design matrix.
 
     disp : float
@@ -757,7 +769,7 @@ def wald_test(X, disp, lfc, mu, ridge_factor, idx=-1):
 
     # Build covariance matrix estimator
     W = np.diag(mu / (1 + mu * disp))
-    M = X.T @ W @ X
+    M = design_matrix.T @ W @ design_matrix
     H = np.linalg.inv(M + ridge_factor)
     S = H @ M @ H
     # Evaluate standard error and Wald statistic

From 4be92a92447d54c35406f34fe764fc54db22dc72 Mon Sep 17 00:00:00 2001
From: Boris MUZELLEC <boris.muzellec@owkin.com>
Date: Mon, 9 Jan 2023 14:04:01 +0100
Subject: [PATCH 25/25] refactor: change function/variable names to more
 meanignful ones

---
 notebooks/PyDESeq2_minimal_example.ipynb       |  6 +++---
 notebooks/PyDESeq2_step_by_step_pipeline.ipynb |  6 +++---
 pydeseq2/DeseqDataSet.py                       |  8 ++++----
 pydeseq2/utils.py                              | 18 ++++++++++--------
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/notebooks/PyDESeq2_minimal_example.ipynb b/notebooks/PyDESeq2_minimal_example.ipynb
index fbb201d4..4079415c 100644
--- a/notebooks/PyDESeq2_minimal_example.ipynb
+++ b/notebooks/PyDESeq2_minimal_example.ipynb
@@ -36,7 +36,7 @@
     "\n",
     "from pydeseq2.DeseqDataSet import DeseqDataSet\n",
     "from pydeseq2.DeseqStats import DeseqStats\n",
-    "from pydeseq2.utils import load_data"
+    "from pydeseq2.utils import load_example_data"
    ]
   },
   {
@@ -89,7 +89,7 @@
    },
    "outputs": [],
    "source": [
-    "counts_df = load_data(\n",
+    "counts_df = load_example_data(\n",
     "    modality=\"raw_counts\",\n",
     "    cancer_type=CANCER,\n",
     "    debug=False,\n",
@@ -103,7 +103,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "clinical_df = load_data(\n",
+    "clinical_df = load_example_data(\n",
     "    modality=\"clinical\",\n",
     "    cancer_type=CANCER,\n",
     "    debug=False,\n",
diff --git a/notebooks/PyDESeq2_step_by_step_pipeline.ipynb b/notebooks/PyDESeq2_step_by_step_pipeline.ipynb
index b338ab8b..b9ea014a 100644
--- a/notebooks/PyDESeq2_step_by_step_pipeline.ipynb
+++ b/notebooks/PyDESeq2_step_by_step_pipeline.ipynb
@@ -36,7 +36,7 @@
     "\n",
     "from pydeseq2.DeseqDataSet import DeseqDataSet\n",
     "from pydeseq2.DeseqStats import DeseqStats\n",
-    "from pydeseq2.utils import load_data"
+    "from pydeseq2.utils import load_example_data"
    ]
   },
   {
@@ -87,7 +87,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "counts_df = load_data(\n",
+    "counts_df = load_example_data(\n",
     "    modality=\"raw_counts\",\n",
     "    cancer_type=CANCER,\n",
     "    debug=False,\n",
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "clinical_df = load_data(\n",
+    "clinical_df = load_example_data(\n",
     "    modality=\"clinical\",\n",
     "    cancer_type=CANCER,\n",
     "    debug=False,\n",
diff --git a/pydeseq2/DeseqDataSet.py b/pydeseq2/DeseqDataSet.py
index 5e2cfed8..db643193 100644
--- a/pydeseq2/DeseqDataSet.py
+++ b/pydeseq2/DeseqDataSet.py
@@ -184,20 +184,20 @@ def __init__(
 
         # Import clinical data and convert design_column to string
         self.clinical = clinical.loc[self.counts.index]
-        # Convert design factor to list if a single string was provided.
+        # Convert design_factors to list if a single string was provided.
         self.design_factors = (
             [design_factors] if isinstance(design_factors, str) else design_factors
         )
         if self.clinical[self.design_factors].isna().any().any():
-            raise ValueError("NaNs are not allowed in the design factor.")
+            raise ValueError("NaNs are not allowed in the design factors.")
         self.clinical[self.design_factors] = self.clinical[self.design_factors].astype(
             str
         )
 
         # Build the design matrix (splits the dataset in cohorts)
         self.design_matrix = build_design_matrix(
-            self.clinical,
-            design_factors,
+            clinical_df=self.clinical,
+            design_factors=self.design_factors,
             ref=reference_level,
             expanded=False,
             intercept=True,
diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
index cdcd65aa..4e9d61e6 100644
--- a/pydeseq2/utils.py
+++ b/pydeseq2/utils.py
@@ -17,7 +17,7 @@
 from pydeseq2.grid_search import grid_fit_shrink_beta
 
 
-def load_data(
+def load_example_data(
     modality="raw_counts",
     cancer_type="synthetic",
     debug=False,
@@ -138,7 +138,7 @@ def test_valid_counts(counts_df):
 
 
 def build_design_matrix(
-    clinical_df, design="high_grade", ref=None, expanded=False, intercept=True
+    clinical_df, design_factors="high_grade", ref=None, expanded=False, intercept=True
 ):
     """Build design_matrix matrix for DEA.
 
@@ -152,7 +152,7 @@ def build_design_matrix(
         DataFrame containing clinical information.
         Must be indexed by sample barcodes, and contain a "high_grade" column.
 
-    design : str or list[str]
+    design_factors : str or list[str]
         Name of the columns of clinical_df to be used as design_matrix variables.
         (default: "high_grade").
 
@@ -175,19 +175,21 @@ def build_design_matrix(
         Indexed by sample barcodes.
     """
 
-    if isinstance(design, str):  # if there is a single factor, convert to singleton list
-        design = [design]
+    if isinstance(
+        design_factors, str
+    ):  # if there is a single factor, convert to singleton list
+        design_factors = [design_factors]
 
-    design_matrix = pd.get_dummies(clinical_df[design])
+    design_matrix = pd.get_dummies(clinical_df[design_factors])
 
-    for factor in design:
+    for factor in design_factors:
         # Check that each factor has exactly 2 levels
         if len(np.unique(clinical_df[factor])) != 2:
             raise ValueError(
                 f"Factors should take exactly two values, but {factor} "
                 f"takes values {np.unique(clinical_df[factor])}."
             )
-    factor = design[-1]
+    factor = design_factors[-1]
     if ref is None:
         ref = "_".join([factor, np.sort(np.unique(clinical_df[factor]).astype(str))[0]])
         ref_level = design_matrix.pop(ref)