diff --git a/econml/dml.py b/econml/dml.py
index 4f7fe54b0..678d0c6fd 100644
--- a/econml/dml.py
+++ b/econml/dml.py
@@ -359,22 +359,22 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn
         The estimator for fitting the response residuals to the treatment residuals. Must implement
         `fit` and `predict` methods, and must be a linear model for correctness.
 
-    featurizer: :term:`transformer`, optional, default None
+    featurizer: :term:`transformer`, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    fit_cate_intercept : bool, optional, default True
+    fit_cate_intercept : bool, default True
         Whether the linear CATE model should have a constant term.
 
     linear_first_stages: bool
         Whether the first stage models are linear (in which case we will expand the features passed to
         `model_y` accordingly)
 
-    discrete_treatment: bool, optional, default False
+    discrete_treatment: bool, default False
         Whether the treatment values should be treated as categorical, rather than continuous, quantities
 
-    n_splits: int, cross-validation generator or an iterable, optional, default 2
+    n_splits: int, cross-validation generator or an iterable, default 2
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
 
@@ -391,7 +391,7 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn
         Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
         W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
 
-    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
@@ -432,30 +432,30 @@ class LinearDMLCateEstimator(StatsModelsCateEstimatorMixin, DMLCateEstimator):
     Parameters
     ----------
-    model_y: estimator, optional (default is :class:`.WeightedLassoCVWrapper`)
+    model_y: estimator, default :class:`.WeightedLassoCVWrapper`
         The estimator for fitting the response to the features. Must implement
         `fit` and `predict` methods.
 
-    model_t: estimator or 'auto', optional (default is 'auto')
+    model_t: estimator or 'auto', default 'auto'
         The estimator for fitting the treatment to the features.
         If estimator, it must implement `fit` and `predict` methods;
         If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be applied for discrete treatment,
         and :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be applied for continuous treatment.
 
-    featurizer : :term:`transformer`, optional, default None
+    featurizer : :term:`transformer`, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    fit_cate_intercept : bool, optional, default True
+    fit_cate_intercept : bool, default True
         Whether the linear CATE model should have a constant term.
 
     linear_first_stages: bool
         Whether the first stage models are linear (in which case we will expand the features passed to
         `model_y` accordingly)
 
-    discrete_treatment: bool, optional (default is ``False``)
+    discrete_treatment: bool, default ``False``
         Whether the treatment values should be treated as categorical, rather than continuous, quantities
 
     n_splits: int, cross-validation generator or an iterable, optional (Default=2)
@@ -474,7 +474,7 @@ class LinearDMLCateEstimator(StatsModelsCateEstimatorMixin, DMLCateEstimator):
         Unless an iterable is used, we call `split(X,T)` to generate the splits.
 
-    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
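For orientation while reviewing: a minimal usage sketch of LinearDMLCateEstimator as documented above. This is illustrative only, on synthetic data, and assumes the econml 0.x API at this commit (positional `fit(Y, T, X, W)` and the `inference='statsmodels'` option of the StatsModels mixin).

```python
import numpy as np
from econml.dml import LinearDMLCateEstimator

rng = np.random.RandomState(123)
n = 1000
X = rng.normal(size=(n, 2))          # heterogeneity features
W = rng.normal(size=(n, 3))          # controls
T = X[:, 0] + rng.normal(size=n)     # continuous treatment
Y = 2 * T * X[:, 0] + W[:, 0] + rng.normal(size=n)

# Defaults documented above: model_y=WeightedLassoCVWrapper, model_t='auto',
# fit_cate_intercept=True, n_splits=2
est = LinearDMLCateEstimator(random_state=123)
est.fit(Y, T, X, W, inference='statsmodels')
point = est.effect(X)                        # CATE point estimates at each X
lb, ub = est.effect_interval(X, alpha=0.05)  # 95% confidence intervals
```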
@@ -541,12 +541,12 @@ class SparseLinearDMLCateEstimator(DebiasedLassoCateEstimatorMixin, DMLCateEstimator):
     Parameters
     ----------
-    model_y: estimator, optional (default is :class:`.WeightedLassoCVWrapper`)
+    model_y: estimator, default :class:`.WeightedLassoCVWrapper`
         The estimator for fitting the response to the features. Must implement
         `fit` and `predict` methods.
 
-    model_t: estimator or 'auto', optional (default is 'auto')
+    model_t: estimator or 'auto', default 'auto'
         The estimator for fitting the treatment to the features.
         If estimator, it must implement `fit` and `predict` methods, and must be
         a linear model for correctness;
@@ -556,32 +556,32 @@ class SparseLinearDMLCateEstimator(DebiasedLassoCateEstimatorMixin, DMLCateEstimator):
         :class:`.WeightedMultiTaskLassoCV` will be applied for continuous treatment.
 
-    alpha: string | float, optional. Default='auto'.
+    alpha: string or float, default 'auto'
         CATE L1 regularization applied through the debiased lasso in the final model.
         'auto' corresponds to a CV form of the :class:`MultiOutputDebiasedLasso`.
 
-    max_iter : int, optional, default=1000
+    max_iter : int, default 1000
         The maximum number of iterations in the Debiased Lasso
 
-    tol : float, optional, default=1e-4
+    tol : float, default 1e-4
         The tolerance for the optimization: if the updates are smaller than ``tol``,
         the optimization code checks the dual gap for optimality and continues until
         it is smaller than ``tol``.
 
-    featurizer : :term:`transformer`, optional, default None
+    featurizer : :term:`transformer`, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    fit_cate_intercept : bool, optional, default True
+    fit_cate_intercept : bool, default True
         Whether the linear CATE model should have a constant term.
 
     linear_first_stages: bool
         Whether the first stage models are linear (in which case we will expand the features passed to
         `model_y` accordingly)
 
-    discrete_treatment: bool, optional (default is ``False``)
+    discrete_treatment: bool, default ``False``
         Whether the treatment values should be treated as categorical, rather than continuous, quantities
 
     n_splits: int, cross-validation generator or an iterable, optional (Default=2)
@@ -600,7 +600,7 @@ class SparseLinearDMLCateEstimator(DebiasedLassoCateEstimatorMixin, DMLCateEstimator):
         Unless an iterable is used, we call `split(X,T)` to generate the splits.
 
-    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
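The debiased-lasso final stage above can be exercised the same way; a sketch assuming this commit's API, where `'debiasedlasso'` is the inference string exposed by the DebiasedLasso mixin:

```python
import numpy as np
from econml.dml import SparseLinearDMLCateEstimator

rng = np.random.RandomState(0)
n, p = 2000, 20
X = rng.normal(size=(n, p))         # high-dimensional features, sparse true CATE
T = rng.binomial(1, 0.5, size=n)    # binary treatment
Y = T * X[:, 0] + rng.normal(size=n)

est = SparseLinearDMLCateEstimator(discrete_treatment=True, alpha='auto',
                                   max_iter=1000, tol=1e-4)
est.fit(Y, T, X, inference='debiasedlasso')  # debiased-lasso confidence intervals
point = est.effect(X)
lb, ub = est.effect_interval(X, alpha=0.05)
```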
@@ -678,11 +678,11 @@ class KernelDMLCateEstimator(DMLCateEstimator):
     Parameters
     ----------
-    model_y: estimator, optional (default is :class:`.WeightedLassoCVWrapper`)
+    model_y: estimator, default :class:`.WeightedLassoCVWrapper`
         The estimator for fitting the response to the features. Must implement
         `fit` and `predict` methods.
 
-    model_t: estimator or 'auto', optional (default is 'auto')
+    model_t: estimator or 'auto', default 'auto'
         The estimator for fitting the treatment to the features.
         If estimator, it must implement `fit` and `predict` methods;
         If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV`
@@ -691,16 +691,16 @@ class KernelDMLCateEstimator(DMLCateEstimator):
         :class:`.WeightedMultiTaskLassoCV` will be applied for continuous treatment.
 
-    fit_cate_intercept : bool, optional, default True
+    fit_cate_intercept : bool, default True
         Whether the linear CATE model should have a constant term.
 
-    dim: int, optional (default is 20)
+    dim: int, default 20
         The number of random Fourier features to generate
 
-    bw: float, optional (default is 1.0)
+    bw: float, default 1.0
         The bandwidth of the Gaussian used to generate features
 
-    discrete_treatment: bool, optional (default is ``False``)
+    discrete_treatment: bool, default ``False``
         Whether the treatment values should be treated as categorical, rather than continuous, quantities
 
     n_splits: int, cross-validation generator or an iterable, optional (Default=2)
@@ -719,7 +719,7 @@ class KernelDMLCateEstimator(DMLCateEstimator):
         Unless an iterable is used, we call `split(X,T)` to generate the splits.
 
-    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
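A sketch of the kernel variant documented above, under the same API assumptions; `dim` random Fourier features with bandwidth `bw` approximate an RBF kernel for the final CATE model:

```python
import numpy as np
from econml.dml import KernelDMLCateEstimator

rng = np.random.RandomState(1)
n = 1000
X = rng.uniform(-1, 1, size=(n, 1))
T = rng.normal(size=n)
Y = np.sin(3 * X[:, 0]) * T + rng.normal(scale=0.1, size=n)

est = KernelDMLCateEstimator(dim=20, bw=1.0)  # defaults documented above
est.fit(Y, T, X)
point = est.effect(X)   # smooth, nonlinear CATE estimates
```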
@@ -773,7 +773,7 @@ class NonParamDMLCateEstimator(_BaseDMLCateEstimator):
         The transformer used to featurize the raw features when fitting the final model. Must implement a
         `fit_transform` method.
 
-    discrete_treatment: bool, optional (default is ``False``)
+    discrete_treatment: bool, default ``False``
         Whether the treatment values should be treated as categorical, rather than continuous, quantities
 
     n_splits: int, cross-validation generator or an iterable, optional (Default=2)
@@ -793,7 +793,7 @@ class NonParamDMLCateEstimator(_BaseDMLCateEstimator):
         Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
         W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
 
-    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
@@ -821,7 +821,8 @@ def __init__(self,
 
 class ForestDMLCateEstimator(NonParamDMLCateEstimator):
-    """ Instance of NonParamDMLCateEstimator with a
+    """
+    Instance of NonParamDMLCateEstimator with a
     :class:`~econml.sklearn_extensions.ensemble.SubsampledHonestForest` as a final model, so as to
     enable non-parametric inference.
@@ -835,7 +836,7 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator):
         The estimator for fitting the treatment to the features. Must implement
         `fit` and `predict` methods. Must be a linear model for correctness when linear_first_stages is ``True``.
 
-    discrete_treatment: bool, optional (default is ``False``)
+    discrete_treatment: bool, default ``False``
         Whether the treatment values should be treated as categorical, rather than continuous, quantities
 
     n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2)
@@ -855,23 +856,23 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator):
         Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
         W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
 
-    n_estimators : integer, optional (default=100)
+    n_estimators : integer, default 100
         The total number of trees in the forest. The forest consists of a
         forest of sqrt(n_estimators) sub-forests, where each sub-forest
         contains sqrt(n_estimators) trees.
 
-    criterion : string, optional (default="mse")
+    criterion : string, default "mse"
         The function to measure the quality of a split. Supported criteria
         are "mse" for the mean squared error, which is equal to variance
         reduction as feature selection criterion, and "mae" for the mean
         absolute error.
 
-    max_depth : integer or None, optional (default=None)
+    max_depth : integer or None, default None
         The maximum depth of the tree. If None, then nodes are expanded until
         all leaves are pure or until all leaves contain less than
         min_samples_split samples.
 
-    min_samples_split : int, float, optional (default=2)
+    min_samples_split : int, float, default 2
         The minimum number of splitting samples required to split an internal node.
 
         - If int, then consider `min_samples_split` as the minimum number.
@@ -879,7 +880,7 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator):
           `ceil(min_samples_split * n_samples)` are the minimum
           number of samples for each split.
 
-    min_samples_leaf : int, float, optional (default=1)
+    min_samples_leaf : int, float, default 1
         The minimum number of samples required to be at a leaf node.
         A split point at any depth will only be considered if it leaves at
         least ``min_samples_leaf`` splitting samples in each of the left and
@@ -893,7 +894,7 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator):
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.
 
-    min_weight_fraction_leaf : float, optional (default=0.)
+    min_weight_fraction_leaf : float, default 0.
         The minimum weighted fraction of the sum total of weights (of all
         splitting samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided. After construction
@@ -901,7 +902,7 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator):
         of the estimation samples contained in each leaf node is at least
         min_weight_fraction_leaf
 
-    max_features : int, float, string or None, optional (default="auto")
+    max_features : int, float, string or None, default "auto"
         The number of features to consider when looking for the best split:
 
         - If int, then consider `max_features` features at each split.
@@ -917,12 +918,12 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator):
         valid partition of the node samples is found, even if it requires to
         effectively inspect more than ``max_features`` features.
 
-    max_leaf_nodes : int or None, optional (default=None)
+    max_leaf_nodes : int or None, default None
         Grow trees with ``max_leaf_nodes`` in best-first fashion.
         Best nodes are defined as relative reduction in impurity.
         If None then unlimited number of leaf nodes.
 
-    min_impurity_decrease : float, optional (default=0.)
+    min_impurity_decrease : float, default 0.
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
 
@@ -938,7 +939,7 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator):
         ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
         if ``sample_weight`` is passed.
 
-    subsample_fr : float or 'auto', optional (default='auto')
+    subsample_fr : float or 'auto', default 'auto'
         The fraction of the half-samples that are used on each tree. Each tree
         will be built on subsample_fr * n_samples/2.
 
@@ -948,21 +949,21 @@ class ForestDMLCateEstimator(NonParamDMLCateEstimator):
-        which is sufficient to guarantee asympotitcally valid inference.
+        which is sufficient to guarantee asymptotically valid inference.
 
-    honest : boolean, optional (default=True)
+    honest : boolean, default True
         Whether to use honest trees, i.e. half of the samples are used for
-        creating the tree structure and the other half for the estimation at the leafs.
+        creating the tree structure and the other half for the estimation at the leaves.
         If False, then all samples are used for both parts.
 
-    n_jobs : int or None, optional (default=None)
+    n_jobs : int or None, default None
         The number of jobs to run in parallel for both `fit` and `predict`.
         ``None`` means 1 unless in a :func:`joblib.parallel_backend` context.
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
-    verbose : int, optional (default=0)
+    verbose : int, default 0
         Controls the verbosity when fitting and predicting.
 
-    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
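A hedged sketch of the honest-forest final stage documented above. It assumes `'blb'` (bootstrap of little bags) is the inference string this estimator accepts at this commit; for a discrete treatment, `model_t` should be a classifier:

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from econml.dml import ForestDMLCateEstimator

rng = np.random.RandomState(2)
n = 2000
X = rng.normal(size=(n, 4))
T = rng.binomial(1, 0.5, size=n)
Y = T * (X[:, 0] > 0) + rng.normal(size=n)

est = ForestDMLCateEstimator(model_y=GradientBoostingRegressor(),
                             model_t=GradientBoostingClassifier(),
                             discrete_treatment=True,
                             n_estimators=1000, honest=True)
est.fit(Y, T, X=X, inference='blb')  # 'blb' assumed to be the forest interval method
point = est.effect(X)
lb, ub = est.effect_interval(X)
```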
diff --git a/econml/drlearner.py b/econml/drlearner.py
index 1223ead3b..43d5a586b 100644
--- a/econml/drlearner.py
+++ b/econml/drlearner.py
@@ -115,18 +115,18 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary scikit-learn
         mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th
         clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline.
 
-    multitask_model_final : bool, optional, default False
+    multitask_model_final : bool, default False
         Whether the model_final should be treated as a multi-task model. See description of model_final.
 
-    featurizer : :term:`transformer`, optional, default None
+    featurizer : :term:`transformer`, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    min_propensity : float, optional, default ``1e-6``
+    min_propensity : float, default ``1e-6``
         The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
 
-    n_splits: int, cross-validation generator or an iterable, optional (default is 2)
+    n_splits: int, cross-validation generator or an iterable, default 2
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
 
@@ -553,19 +553,19 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner):
         `predict` methods. If different models per treatment arm are desired, see the
         :class:`.MultiModelWrapper` helper class.
 
-    featurizer : :term:`transformer`, optional, default None
+    featurizer : :term:`transformer`, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    fit_cate_intercept : bool, optional, default True
+    fit_cate_intercept : bool, default True
         Whether the linear CATE model should have a constant term.
 
-    min_propensity : float, optional, default ``1e-6``
+    min_propensity : float, default ``1e-6``
         The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
 
-    n_splits: int, cross-validation generator or an iterable, optional (default is 2)
+    n_splits: int, cross-validation generator or an iterable, default 2
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
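A minimal sketch of the doubly robust linear learner documented above (illustrative; assumes this commit's API, with keyword `X=` in `fit` and `inference='statsmodels'`):

```python
import numpy as np
from econml.drlearner import LinearDRLearner

rng = np.random.RandomState(3)
n = 2000
X = rng.normal(size=(n, 3))
propensity = 1 / (1 + np.exp(-X[:, 0]))   # treatment confounded by X
T = rng.binomial(1, propensity, size=n)
Y = T * X[:, 1] + X[:, 0] + rng.normal(size=n)

est = LinearDRLearner(min_propensity=1e-6)
est.fit(Y, T, X=X, inference='statsmodels')
point = est.effect(X)                        # CATE of arm 1 vs. the baseline arm
lb, ub = est.effect_interval(X, alpha=0.05)
```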
@@ -736,31 +736,31 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner):
         `predict` methods. If different models per treatment arm are desired, see the
         :class:`.MultiModelWrapper` helper class.
 
-    featurizer : :term:`transformer`, optional, default None
+    featurizer : :term:`transformer`, default None
         Must support fit_transform and transform. Used to create composite features in the final CATE regression.
         It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
         If featurizer=None, then CATE is trained on X.
 
-    fit_cate_intercept : bool, optional, default True
+    fit_cate_intercept : bool, default True
         Whether the linear CATE model should have a constant term.
 
-    alpha: string | float, optional., default 'auto'.
+    alpha: string or float, default 'auto'
         CATE L1 regularization applied through the debiased lasso in the final model.
         'auto' corresponds to a CV form of the :class:`DebiasedLasso`.
 
-    max_iter : int, optional, default 1000
+    max_iter : int, default 1000
         The maximum number of iterations in the Debiased Lasso
 
-    tol : float, optional, default 1e-4
+    tol : float, default 1e-4
         The tolerance for the optimization: if the updates are smaller than ``tol``,
         the optimization code checks the dual gap for optimality and continues until
         it is smaller than ``tol``.
 
-    min_propensity : float, optional, default ``1e-6``
+    min_propensity : float, default ``1e-6``
         The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
 
-    n_splits: int, cross-validation generator or an iterable, optional, default 2
+    n_splits: int, cross-validation generator or an iterable, default 2
         Determines the cross-validation splitting strategy.
         Possible inputs for cv are:
@@ -921,7 +921,7 @@ class ForestDRLearner(DRLearner):
         `predict` methods. If different models per treatment arm are desired, see the
         :class:`~econml.utilities.MultiModelWrapper` helper class.
 
-    min_propensity : float, optional, default ``1e-6``
+    min_propensity : float, default ``1e-6``
         The minimum propensity at which to clip propensity estimates to avoid dividing by zero.
 
     n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2)
@@ -941,23 +941,23 @@ class ForestDRLearner(DRLearner):
         Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
         W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
 
-    n_estimators : integer, optional (default=100)
+    n_estimators : integer, default 100
         The total number of trees in the forest. The forest consists of a
         forest of sqrt(n_estimators) sub-forests, where each sub-forest
         contains sqrt(n_estimators) trees.
 
-    criterion : string, optional (default="mse")
+    criterion : string, default "mse"
         The function to measure the quality of a split. Supported criteria
         are "mse" for the mean squared error, which is equal to variance
         reduction as feature selection criterion, and "mae" for the mean
         absolute error.
 
-    max_depth : integer or None, optional (default=None)
+    max_depth : integer or None, default None
         The maximum depth of the tree. If None, then nodes are expanded until
         all leaves are pure or until all leaves contain less than
         min_samples_split samples.
 
-    min_samples_split : int, float, optional (default=2)
+    min_samples_split : int, float, default 2
         The minimum number of splitting samples required to split an internal node.
 
         - If int, then consider `min_samples_split` as the minimum number.
@@ -965,7 +965,7 @@ class ForestDRLearner(DRLearner):
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.
 
-    min_samples_leaf : int, float, optional (default=1)
+    min_samples_leaf : int, float, default 1
         The minimum number of samples required to be at a leaf node.
         A split point at any depth will only be considered if it leaves at
         least ``min_samples_leaf`` splitting samples in each of the left and
@@ -979,7 +979,7 @@ class ForestDRLearner(DRLearner):
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.
 
-    min_weight_fraction_leaf : float, optional (default=0.)
+    min_weight_fraction_leaf : float, default 0.
         The minimum weighted fraction of the sum total of weights (of all
         splitting samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided. After construction
@@ -987,7 +987,7 @@ class ForestDRLearner(DRLearner):
         of the estimation samples contained in each leaf node is at least
         min_weight_fraction_leaf
 
-    max_features : int, float, string or None, optional (default="auto")
+    max_features : int, float, string or None, default "auto"
         The number of features to consider when looking for the best split:
 
         - If int, then consider `max_features` features at each split.
@@ -1003,12 +1003,12 @@ class ForestDRLearner(DRLearner):
         valid partition of the node samples is found, even if it requires to
         effectively inspect more than ``max_features`` features.
 
-    max_leaf_nodes : int or None, optional (default=None)
+    max_leaf_nodes : int or None, default None
         Grow trees with ``max_leaf_nodes`` in best-first fashion.
         Best nodes are defined as relative reduction in impurity.
         If None then unlimited number of leaf nodes.
 
-    min_impurity_decrease : float, optional (default=0.)
+    min_impurity_decrease : float, default 0.
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
 
@@ -1024,7 +1024,7 @@ class ForestDRLearner(DRLearner):
         ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
         if ``sample_weight`` is passed.
 
-    subsample_fr : float or 'auto', optional (default='auto')
+    subsample_fr : float or 'auto', default 'auto'
         The fraction of the half-samples that are used on each tree. Each tree
         will be built on subsample_fr * n_samples/2.
 
@@ -1034,21 +1034,21 @@ class ForestDRLearner(DRLearner):
-        which is sufficient to guarantee asympotitcally valid inference.
+        which is sufficient to guarantee asymptotically valid inference.
 
-    honest : boolean, optional (default=True)
+    honest : boolean, default True
         Whether to use honest trees, i.e. half of the samples are used for
-        creating the tree structure and the other half for the estimation at the leafs.
+        creating the tree structure and the other half for the estimation at the leaves.
         If False, then all samples are used for both parts.
 
-    n_jobs : int or None, optional (default=None)
+    n_jobs : int or None, default None
         The number of jobs to run in parallel for both `fit` and `predict`.
         ``None`` means 1 unless in a :func:`joblib.parallel_backend` context.
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
-    verbose : int, optional (default=0)
+    verbose : int, default 0
         Controls the verbosity when fitting and predicting.
 
-    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
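As with ForestDMLCateEstimator earlier, a hedged sketch of the forest-based DR learner documented above; `model_regression`/`model_propensity` and `inference='blb'` are assumed to be the constructor and inference names at this commit:

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from econml.drlearner import ForestDRLearner

rng = np.random.RandomState(4)
n = 2000
X = rng.normal(size=(n, 4))
T = rng.binomial(1, 0.5, size=n)
Y = T * (X[:, 0] > 0) + rng.normal(size=n)

est = ForestDRLearner(model_regression=GradientBoostingRegressor(),
                      model_propensity=GradientBoostingClassifier(),
                      n_estimators=1000, honest=True)
est.fit(Y, T, X=X, inference='blb')
point = est.effect(X)
```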
diff --git a/econml/ortho_forest.py b/econml/ortho_forest.py
index 378763237..079c9617c 100644
--- a/econml/ortho_forest.py
+++ b/econml/ortho_forest.py
@@ -419,52 +419,52 @@ class ContinuousTreatmentOrthoForest(BaseOrthoForest):
     Parameters
     ----------
-    n_trees : integer, optional (default=500)
+    n_trees : integer, default 500
         Number of causal estimators in the forest.
 
-    min_leaf_size : integer, optional (default=10)
+    min_leaf_size : integer, default 10
         The minimum number of samples in a leaf.
 
-    max_depth : integer, optional (default=10)
+    max_depth : integer, default 10
         The maximum number of splits to be performed when expanding the tree.
 
-    subsample_ratio : float, optional (default=0.7)
+    subsample_ratio : float, default 0.7
         The ratio of the total sample to be used when training a causal tree.
         Values greater than 1.0 will be considered equal to 1.0.
         Parameter is ignored when bootstrap=True.
 
-    bootstrap : boolean, optional (default=False)
+    bootstrap : boolean, default False
         Whether to use bootstrap subsampling.
 
-    lambda_reg : float, optional (default=0.01)
+    lambda_reg : float, default 0.01
         The regularization coefficient in the ell_2 penalty imposed on the
         locally linear part of the second stage fit. This is not applied to
         the local intercept, only to the coefficient of the linear component.
 
-    model_T : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3))
+    model_T : estimator, default ``sklearn.linear_model.LassoCV(cv=3)``
         The estimator for residualizing the continuous treatment at each leaf.
         Must implement `fit` and `predict` methods.
 
-    model_Y : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3))
+    model_Y : estimator, default ``sklearn.linear_model.LassoCV(cv=3)``
         The estimator for residualizing the outcome at each leaf. Must implement
         `fit` and `predict` methods.
 
-    model_T_final : estimator, optional (default=None)
+    model_T_final : estimator, default None
         The estimator for residualizing the treatment at prediction time. Must implement
         `fit` and `predict` methods. If parameter is set to ``None``, it defaults to the
         value of `model_T` parameter.
 
-    model_Y_final : estimator, optional (default=None)
+    model_Y_final : estimator, default None
         The estimator for residualizing the outcome at prediction time. Must implement
         `fit` and `predict` methods. If parameter is set to ``None``, it defaults to the
         value of `model_Y` parameter.
 
-    n_jobs : int, optional (default=-1)
+    n_jobs : int, default -1
         The number of jobs to run in parallel for both :meth:`fit` and :meth:`effect`.
         ``-1`` means using all processors. Since OrthoForest methods are
         computationally heavy, it is recommended to set `n_jobs` to -1.
 
-    random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
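A sketch of the continuous-treatment OrthoForest documented above (illustrative; assumes the 0.x `fit(Y, T, X, W)` signature):

```python
import numpy as np
from econml.ortho_forest import ContinuousTreatmentOrthoForest

rng = np.random.RandomState(5)
n = 2000
X = rng.uniform(-1, 1, size=(n, 1))
W = rng.normal(size=(n, 3))
T = W[:, 0] + rng.normal(size=n)
Y = np.exp(X[:, 0]) * T + rng.normal(size=n)

est = ContinuousTreatmentOrthoForest(n_trees=500, min_leaf_size=10,
                                     subsample_ratio=0.7, n_jobs=-1)
est.fit(Y, T, X, W)
point = est.effect(X)   # fully nonparametric CATE estimates
```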
@@ -641,58 +641,58 @@ class DiscreteTreatmentOrthoForest(BaseOrthoForest):
     Parameters
     ----------
-    n_trees : integer, optional (default=500)
+    n_trees : integer, default 500
         Number of causal estimators in the forest.
 
-    min_leaf_size : integer, optional (default=10)
+    min_leaf_size : integer, default 10
         The minimum number of samples in a leaf.
 
-    max_depth : integer, optional (default=10)
+    max_depth : integer, default 10
         The maximum number of splits to be performed when expanding the tree.
 
-    subsample_ratio : float, optional (default=0.7)
+    subsample_ratio : float, default 0.7
         The ratio of the total sample to be used when training a causal tree.
         Values greater than 1.0 will be considered equal to 1.0.
         Parameter is ignored when bootstrap=True.
 
-    bootstrap : boolean, optional (default=False)
+    bootstrap : boolean, default False
         Whether to use bootstrap subsampling.
 
-    lambda_reg : float, optional (default=0.01)
+    lambda_reg : float, default 0.01
         The regularization coefficient in the ell_2 penalty imposed on the
         locally linear part of the second stage fit. This is not applied to
         the local intercept, only to the coefficient of the linear component.
 
-    propensity_model : estimator, optional (default=sklearn.linear_model.LogisticRegression(penalty='l1',\
+    propensity_model : estimator, default ``sklearn.linear_model.LogisticRegression(penalty='l1',\
                                                                                      solver='saga',\
-                                                                                     multi_class='auto'))
+                                                                                     multi_class='auto')``
         Model for estimating propensity of treatment at each leaf.
         Will be trained on features and controls (concatenated). Must implement `fit`
         and `predict_proba` methods.
 
-    model_Y : estimator, optional (default=sklearn.linear_model.LassoCV(cv=3))
+    model_Y : estimator, default ``sklearn.linear_model.LassoCV(cv=3)``
         Estimator for learning potential outcomes at each leaf. Will be trained on
         features, controls and one hot encoded treatments (concatenated).
         If different models per treatment arm are desired, see the
         :class:`.MultiModelWrapper` helper class. The model(s) must implement
         `fit` and `predict` methods.
 
-    propensity_model_final : estimator, optional (default=None)
+    propensity_model_final : estimator, default None
-        Model for estimating propensity of treatment at at prediction time.
+        Model for estimating propensity of treatment at prediction time.
         Will be trained on features and controls (concatenated). Must implement `fit`
         and `predict_proba` methods. If parameter is set to ``None``, it defaults to
         the value of `propensity_model` parameter.
 
-    model_Y_final : estimator, optional (default=None)
+    model_Y_final : estimator, default None
         Estimator for learning potential outcomes at prediction time.
         Will be trained on features, controls and one hot encoded treatments (concatenated).
         If different models per treatment arm are desired, see the
         :class:`.MultiModelWrapper` helper class. The model(s) must implement
         `fit` and `predict` methods. If parameter is set to ``None``, it defaults to
         the value of `model_Y` parameter.
 
-    n_jobs : int, optional (default=-1)
+    n_jobs : int, default -1
         The number of jobs to run in parallel for both :meth:`fit` and :meth:`effect`.
         ``-1`` means using all processors. Since OrthoForest methods are
         computationally heavy, it is recommended to set `n_jobs` to -1.
 
-    random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+    random_state : int, :class:`~numpy.random.mtrand.RandomState` instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
         If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
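The discrete-treatment counterpart follows the same pattern; a sketch under the same API assumptions:

```python
import numpy as np
from econml.ortho_forest import DiscreteTreatmentOrthoForest

rng = np.random.RandomState(6)
n = 2000
X = rng.uniform(-1, 1, size=(n, 2))
W = rng.normal(size=(n, 3))
propensity = 1 / (1 + np.exp(-W[:, 0]))
T = rng.binomial(1, propensity, size=n)      # treatment arms {0, 1}
Y = T * X[:, 0] + W[:, 0] + rng.normal(size=n)

est = DiscreteTreatmentOrthoForest(n_trees=500, n_jobs=-1)
est.fit(Y, T, X, W)
point = est.effect(X)   # effect of arm 1 relative to the control arm 0
```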
diff --git a/econml/sklearn_extensions/ensemble.py b/econml/sklearn_extensions/ensemble.py
index f9c17c505..25bc58c6d 100644
--- a/econml/sklearn_extensions/ensemble.py
+++ b/econml/sklearn_extensions/ensemble.py
@@ -161,23 +161,23 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
     Parameters
     ----------
-    n_estimators : integer, optional (default=100)
+    n_estimators : integer, default 100
         The total number of trees in the forest. The forest consists of a
         forest of sqrt(n_estimators) sub-forests, where each sub-forest
         contains sqrt(n_estimators) trees.
 
-    criterion : string, optional (default="mse")
+    criterion : string, default "mse"
         The function to measure the quality of a split. Supported criteria
         are "mse" for the mean squared error, which is equal to variance
         reduction as feature selection criterion, and "mae" for the mean
         absolute error.
 
-    max_depth : integer or None, optional (default=None)
+    max_depth : integer or None, default None
         The maximum depth of the tree. If None, then nodes are expanded until
         all leaves are pure or until all leaves contain less than
         min_samples_split samples.
 
-    min_samples_split : int, float, optional (default=2)
+    min_samples_split : int, float, default 2
         The minimum number of splitting samples required to split an internal node.
 
         - If int, then consider `min_samples_split` as the minimum number.
@@ -185,7 +185,7 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
          `ceil(min_samples_split * n_samples)` are the minimum
          number of samples for each split.
 
-    min_samples_leaf : int, float, optional (default=1)
+    min_samples_leaf : int, float, default 1
         The minimum number of samples required to be at a leaf node.
         A split point at any depth will only be considered if it leaves at
         least ``min_samples_leaf`` splitting samples in each of the left and
@@ -199,7 +199,7 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
          `ceil(min_samples_leaf * n_samples)` are the minimum
          number of samples for each node.
 
-    min_weight_fraction_leaf : float, optional (default=0.)
+    min_weight_fraction_leaf : float, default 0.
         The minimum weighted fraction of the sum total of weights (of all
         splitting samples) required to be at a leaf node. Samples have
         equal weight when sample_weight is not provided. After construction
@@ -207,7 +207,7 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
         of the estimation samples contained in each leaf node is at least
         min_weight_fraction_leaf
 
-    max_features : int, float, string or None, optional (default="auto")
+    max_features : int, float, string or None, default "auto"
         The number of features to consider when looking for the best split:
 
         - If int, then consider `max_features` features at each split.
@@ -223,12 +223,12 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
         valid partition of the node samples is found, even if it requires to
         effectively inspect more than ``max_features`` features.
 
-    max_leaf_nodes : int or None, optional (default=None)
+    max_leaf_nodes : int or None, default None
         Grow trees with ``max_leaf_nodes`` in best-first fashion.
         Best nodes are defined as relative reduction in impurity.
         If None then unlimited number of leaf nodes.
 
-    min_impurity_decrease : float, optional (default=0.)
+    min_impurity_decrease : float, default 0.
         A node will be split if this split induces a decrease of the impurity
         greater than or equal to this value.
 
@@ -244,7 +244,7 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
         ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
         if ``sample_weight`` is passed.
 
-    subsample_fr : float or 'auto', optional (default='auto')
+    subsample_fr : float or 'auto', default 'auto'
         The fraction of the half-samples that are used on each tree. Each tree
         will be built on subsample_fr * n_samples/2.
 
@@ -254,27 +254,27 @@ class SubsampledHonestForest(ForestRegressor, RegressorMixin):
-        which is sufficient to guarantee asympotitcally valid inference.
+        which is sufficient to guarantee asymptotically valid inference.
 
-    honest : boolean, optional (default=True)
+    honest : boolean, default True
         Whether to use honest trees, i.e. half of the samples are used for
-        creating the tree structure and the other half for the estimation at the leafs.
+        creating the tree structure and the other half for the estimation at the leaves.
         If False, then all samples are used for both parts.
-    n_jobs : int or None, optional (default=None)
+    n_jobs : int or None, default None
         The number of jobs to run in parallel for both `fit` and `predict`.
-        `None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
+        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
-    random_state : int, RandomState instance or None, optional (default=None)
+    random_state : int, RandomState instance or None, default None
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
         If None, the random number generator is the RandomState instance used
         by `np.random`.
 
-    verbose : int, optional (default=0)
+    verbose : int, default 0
         Controls the verbosity when fitting and predicting.
 
-    warm_start : bool, optional (default=False)
+    warm_start : bool, default False
         When set to ``True``, reuse the solution of the previous call to fit
         and add more estimators to the ensemble, otherwise, just fit a whole
         new forest. See :term:`the Glossary <warm_start>`.
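A sketch of the honest forest itself, which follows the scikit-learn regressor API documented above; `predict_interval` is assumed to be the exposed wrapper around the normal-approximation machinery of `_inference` below:

```python
import numpy as np
from econml.sklearn_extensions.ensemble import SubsampledHonestForest

rng = np.random.RandomState(7)
X = rng.normal(size=(5000, 3))
y = X[:, 0] + rng.normal(size=5000)

regr = SubsampledHonestForest(n_estimators=1000, honest=True, random_state=0)
regr.fit(X, y)
pred = regr.predict(X[:10])                         # point predictions
lb, ub = regr.predict_interval(X[:10], alpha=0.05)  # assumed interval API
```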
@@ -643,7 +643,7 @@ def _inference(self, X, stderr=False):
         ----------
         X : (n, d_x) array
             The target samples
-        stderr : bool, optional (default=2)
+        stderr : bool, default False
             Whether to return stderr information for each prediction
 
         Returns
diff --git a/econml/utilities.py b/econml/utilities.py
index f583a0b21..14fd6c178 100644
--- a/econml/utilities.py
+++ b/econml/utilities.py
@@ -722,7 +722,7 @@ class WeightedModelWrapper:
     model_instance : estimator
         Model that requires weights.
 
-    sample_type : string, optional (default=`weighted`)
+    sample_type : string, default `weighted`
         Method for adding weights to the model. `weighted` for linear regression models
         where the weights can be incorporated in the matrix multiplication,
         `sampled` for other models. `sampled` samples the training set according
@@ -869,9 +869,9 @@ class StatsModelsLinearRegression(BaseEstimator):
     Parameters
     ----------
-    fit_intercept : bool (optional, default=True)
+    fit_intercept : bool, default ``True``
         Whether to fit an intercept in this model
-    fit_args : dict (optional, default=`{}`)
+    fit_args : dict, default ``{}``
         The statsmodels-style fit arguments; keys can include 'cov_type'
     """
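Finally, a sketch of the statsmodels wrapper documented above; per the docstring, `fit_args` keys such as 'cov_type' are forwarded to the statsmodels-style fit:

```python
import numpy as np
from econml.utilities import StatsModelsLinearRegression

rng = np.random.RandomState(8)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, 0.5, -2.0]) + rng.normal(size=200)

model = StatsModelsLinearRegression(fit_intercept=True,
                                    fit_args={'cov_type': 'HC1'})
model.fit(X, y)
yhat = model.predict(X)
```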