diff --git a/botorch/optim/__init__.py b/botorch/optim/__init__.py index a9d9619469..d6a0c3e08e 100644 --- a/botorch/optim/__init__.py +++ b/botorch/optim/__init__.py @@ -23,7 +23,6 @@ LogLinearHomotopySchedule, ) from botorch.optim.initializers import initialize_q_batch, initialize_q_batch_nonneg -from botorch.optim.numpy_converter import module_to_array, set_params_with_array from botorch.optim.optimize import ( gen_batch_initial_conditions, optimize_acqf, @@ -51,9 +50,7 @@ "optimize_acqf_discrete_local_search", "optimize_acqf_mixed", "optimize_acqf_homotopy", - "module_to_array", "scipy_minimize", - "set_params_with_array", "torch_minimize", "ExpMAStoppingCriterion", "FixedHomotopySchedule", diff --git a/botorch/optim/fit.py b/botorch/optim/fit.py index 098d678666..b6bd4f99b7 100644 --- a/botorch/optim/fit.py +++ b/botorch/optim/fit.py @@ -9,21 +9,7 @@ from __future__ import annotations from functools import partial -from itertools import filterfalse -from time import monotonic -from typing import ( - Any, - Callable, - Dict, - Iterator, - List, - Optional, - Pattern, - Sequence, - Set, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Optional, Sequence, Set, Tuple, Union from warnings import warn from botorch.exceptions.warnings import OptimizationWarning @@ -34,25 +20,11 @@ scipy_minimize, torch_minimize, ) -from botorch.optim.numpy_converter import ( - _scipy_objective_and_grad, - module_to_array, - set_params_with_array, -) from botorch.optim.stopping import ExpMAStoppingCriterion -from botorch.optim.utils import ( - _filter_kwargs, - _get_extra_mll_args, - get_name_filter, - get_parameters_and_bounds, - TorchAttr, -) -from botorch.optim.utils.model_utils import get_parameters +from botorch.optim.utils import get_parameters_and_bounds, TorchAttr from botorch.utils.types import DEFAULT from gpytorch.mlls.marginal_log_likelihood import MarginalLogLikelihood -from gpytorch.settings import fast_computations from numpy import ndarray -from scipy.optimize import Bounds, minimize from torch import Tensor from torch.nn import Module from torch.optim.adam import Adam @@ -200,199 +172,3 @@ def fit_gpytorch_mll_torch( callback=callback, timeout_sec=timeout_sec, ) - - -def fit_gpytorch_scipy( - mll: MarginalLogLikelihood, - bounds: Optional[Dict[str, Tuple[Optional[float], Optional[float]]]] = None, - method: str = "L-BFGS-B", - options: Optional[Dict[str, Any]] = None, - track_iterations: bool = False, - approx_mll: bool = False, - scipy_objective: TScipyObjective = _scipy_objective_and_grad, - module_to_array_func: TModToArray = module_to_array, - module_from_array_func: TArrayToMod = set_params_with_array, - **kwargs: Any, -) -> Tuple[MarginalLogLikelihood, Dict[str, Union[float, List[OptimizationResult]]]]: - r"""Legacy method for scipy-based fitting of gpytorch models. - - The model and likelihood in mll must already be in train mode. This method requires - that the model has `train_inputs` and `train_targets`. - - Args: - mll: MarginalLogLikelihood to be maximized. - bounds: A dictionary mapping parameter names to tuples of lower and upper - bounds. - method: Solver type, passed along to scipy.optimize.minimize. - options: Dictionary of solver options, passed along to scipy.optimize.minimize. - approx_mll: If True, use gpytorch's approximate MLL computation. This is - disabled by default since the stochasticity is an issue for - determistic optimizers). Enabling this is only recommended when - working with large training data sets (n>2000). 
- - Returns: - 2-element tuple containing - - MarginalLogLikelihood with parameters optimized in-place. - - Dictionary with the following key/values: - "fopt": Best mll value. - "wall_time": Wall time of fitting. - "iterations": List of OptimizationResult objects with information on each - iteration. If track_iterations is False, will be empty. - "OptimizeResult": The result returned by `scipy.optim.minimize`. - """ - warn( - "`fit_gpytorch_scipy` is marked for deprecation, consider using " - "`scipy_minimize` or its model fitting helper `fit_gpytorch_mll_scipy`.", - DeprecationWarning, - ) - start_time = monotonic() - iterations: List[OptimizationResult] = [] - - options = {} if options is None else options.copy() - exclude: Iterator[Union[Pattern, str]] = options.pop("exclude", None) - if exclude: - exclude, _ = zip( # get the qualified names of excluded parameters - *filterfalse(get_name_filter(exclude), mll.named_parameters()) - ) - - x0, property_dict, bounds = module_to_array_func( - module=mll, exclude=exclude, bounds=bounds - ) - if bounds is not None: - bounds = Bounds(lb=bounds[0], ub=bounds[1], keep_feasible=True) - - def wrapper(x: ndarray) -> Tuple[float, ndarray]: - with fast_computations(log_prob=approx_mll): - return scipy_objective(x=x, mll=mll, property_dict=property_dict) - - def store_iteration(xk): - iterations.append( - OptimizationResult( - step=len(iterations), - fval=float(wrapper(xk)[0]), - status=OptimizationStatus.RUNNING, - runtime=monotonic() - start_time, - ) - ) - - result = minimize( - wrapper, - x0, - bounds=bounds, - method=method, - jac=True, - options=options, - callback=store_iteration if track_iterations else None, - ) - - info_dict = { - "fopt": float(result.fun), - "wall_time": monotonic() - start_time, - "iterations": iterations, - "OptimizeResult": result, - } - if not result.success: - try: - # Some result.message are bytes - msg = result.message.decode("ascii") - except AttributeError: - # Others are str - msg = result.message - warn( - f"Fitting failed with the optimizer reporting '{msg}'", OptimizationWarning - ) - - # Set to optimum - mll = module_from_array_func(mll, result.x, property_dict) - return mll, info_dict - - -def fit_gpytorch_torch( - mll: MarginalLogLikelihood, - bounds: Optional[Dict[str, Tuple[Optional[float], Optional[float]]]] = None, - optimizer_cls: Optimizer = Adam, - options: Optional[Dict[str, Any]] = None, - track_iterations: bool = False, - approx_mll: bool = False, -) -> Tuple[MarginalLogLikelihood, Dict[str, Union[float, List[OptimizationResult]]]]: - r"""Legacy method for torch-based fitting of gpytorch models. - - The model and likelihood in mll must already be in train mode. - Note: this method requires that the model has `train_inputs` and `train_targets`. - - Args: - mll: MarginalLogLikelihood to be maximized. - bounds: An optional dictionary mapping parameter names to tuples - of lower and upper bounds. Bounds specified here take precedence - over bounds on the same parameters specified in the constraints - registered with the module. - optimizer_cls: Torch optimizer to use. Must not require a closure. - options: options for model fitting. Relevant options will be passed to - the `optimizer_cls`. Additionally, options can include: "disp" - to specify whether to display model fitting diagnostics and "maxiter" - to specify the maximum number of iterations. - - Returns: - 2-element tuple containing - - mll with parameters optimized in-place. - - Dictionary with the following key/values: - "fopt": Best mll value. 
- "wall_time": Wall time of fitting. - "iterations": List of OptimizationResult objects with information on each - iteration. If track_iterations is False, will be empty. - - Example: - >>> gp = SingleTaskGP(train_X, train_Y) - >>> mll = ExactMarginalLogLikelihood(gp.likelihood, gp) - >>> mll.train() - >>> fit_gpytorch_torch(mll) - >>> mll.eval() - """ - warn( - "`fit_gpytorch_torch` is marked for deprecation, consider using " - "`torch_minimize` or its model fitting helper `fit_gpytorch_mll_torch`.", - DeprecationWarning, - ) - _options = {"maxiter": 100, "disp": True, "lr": 0.05} - _options.update(options or {}) - exclude = _options.pop("exclude", None) - parameters = get_parameters( - mll, - requires_grad=True, - name_filter=None if exclude is None else get_name_filter(exclude), - ) - - optimizer = optimizer_cls( - params=list(parameters.values()), **_filter_kwargs(optimizer_cls, **_options) - ) - iterations: List[OptimizationResult] = [] - stopping_criterion = ExpMAStoppingCriterion( - **_filter_kwargs(ExpMAStoppingCriterion, **_options) - ) - - def closure() -> Tuple[Tensor, Tuple[Tensor, ...]]: - optimizer.zero_grad() - with fast_computations(log_prob=approx_mll): - out = mll.model(*mll.model.train_inputs) - loss = -mll(out, mll.model.train_targets, *_get_extra_mll_args(mll)).sum() - loss.backward() - - return loss, tuple(param.grad for param in parameters.values()) - - def store_iteration(parameters: Dict[str, Tensor], result: OptimizationResult): - iterations.append(result) - - result = fit_gpytorch_mll_torch( - mll=mll, - closure=closure, - bounds=bounds, - parameters=parameters, - optimizer=optimizer, - stopping_criterion=stopping_criterion, - callback=store_iteration if track_iterations else None, - ) - return mll, { - "fopt": result.fval, - "wall_time": result.runtime, - "iterations": iterations, - } diff --git a/botorch/optim/numpy_converter.py b/botorch/optim/numpy_converter.py index 91aa103d3a..e910c3df62 100644 --- a/botorch/optim/numpy_converter.py +++ b/botorch/optim/numpy_converter.py @@ -4,198 +4,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -r""" -A converter that simplifies using numpy-based optimizers with generic torch -`nn.Module` classes. This enables using a `scipy.optim.minimize` optimizer -for optimizing module parameters. -""" - -from __future__ import annotations - -from collections import OrderedDict -from math import inf -from numbers import Number -from typing import Dict, List, Optional, Set, Tuple -from warnings import warn - -import numpy as np -import torch -from botorch.optim.utils import ( - _get_extra_mll_args, - _handle_numerical_errors, - get_name_filter, - get_parameters_and_bounds, - TorchAttr, -) -from gpytorch.mlls import MarginalLogLikelihood -from torch.nn import Module - - -def module_to_array( - module: Module, - bounds: Optional[Dict[str, Tuple[Optional[float], Optional[float]]]] = None, - exclude: Optional[Set[str]] = None, -) -> Tuple[np.ndarray, Dict[str, TorchAttr], Optional[np.ndarray]]: - r"""Extract named parameters from a module into a numpy array. - - Only extracts parameters with requires_grad, since it is meant for optimizing. - - Args: - module: A module with parameters. May specify parameter constraints in - a `named_parameters_and_constraints` method. - bounds: A dictionary mapping parameter names t lower and upper bounds. - of lower and upper bounds. 
Bounds specified here take precedence - over bounds on the same parameters specified in the constraints - registered with the module. - exclude: A list of parameter names that are to be excluded from extraction. - - Returns: - 3-element tuple containing - - The parameter values as a numpy array. - - An ordered dictionary with the name and tensor attributes of each - parameter. - - A `2 x n_params` numpy array with lower and upper bounds if at least - one constraint is finite, and None otherwise. - - Example: - >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) - >>> parameter_array, property_dict, bounds_out = module_to_array(mll) - """ - warn( - "`module_to_array` is marked for deprecation, consider using " - "`get_parameters_and_bounds`, `get_parameters_as_ndarray_1d`, or " - "`get_bounds_as_ndarray` instead.", - DeprecationWarning, - ) - param_dict, bounds_dict = get_parameters_and_bounds( - module=module, - name_filter=None if exclude is None else get_name_filter(exclude), - requires_grad=True, - ) - if bounds is not None: - bounds_dict.update(bounds) - - # Record tensor metadata and read parameter values to the tape - param_tape: List[Number] = [] - property_dict = OrderedDict() - with torch.no_grad(): - for name, param in param_dict.items(): - property_dict[name] = TorchAttr(param.shape, param.dtype, param.device) - param_tape.extend(param.view(-1).cpu().double().tolist()) - - # Extract lower and upper bounds - start = 0 - bounds_np = None - params_np = np.asarray(param_tape) - for name, param in param_dict.items(): - numel = param.numel() - if name in bounds_dict: - for row, bound in enumerate(bounds_dict[name]): - if bound is None: - continue - - if torch.is_tensor(bound): - if (bound == (2 * row - 1) * inf).all(): - continue - bound = bound.detach().cpu() - - elif bound == (2 * row - 1) * inf: - continue - - if bounds_np is None: - bounds_np = np.full((2, len(params_np)), ((-inf,), (inf,))) - - bounds_np[row, start : start + numel] = bound - start += numel - - return params_np, property_dict, bounds_np - - -def set_params_with_array( - module: Module, x: np.ndarray, property_dict: Dict[str, TorchAttr] -) -> Module: - r"""Set module parameters with values from numpy array. - - Args: - module: Module with parameters to be set - x: Numpy array with parameter values - property_dict: Dictionary of parameter names and torch attributes as - returned by module_to_array. - - Returns: - Module: module with parameters updated in-place. - - Example: - >>> mll = ExactMarginalLogLikelihood(model.likelihood, model) - >>> parameter_array, property_dict, bounds_out = module_to_array(mll) - >>> parameter_array += 0.1 # perturb parameters (for example only) - >>> mll = set_params_with_array(mll, parameter_array, property_dict) - """ - warn( - "`_set_params_with_array` is marked for deprecation, consider using " - "`set_parameters_from_ndarray_1d` instead.", - DeprecationWarning, - ) - param_dict = OrderedDict(module.named_parameters()) - start_idx = 0 - for p_name, attrs in property_dict.items(): - # Construct the new tensor - if len(attrs.shape) == 0: # deal with scalar tensors - end_idx = start_idx + 1 - new_data = torch.tensor( - x[start_idx], dtype=attrs.dtype, device=attrs.device - ) - else: - end_idx = start_idx + np.prod(attrs.shape) - new_data = torch.tensor( - x[start_idx:end_idx], dtype=attrs.dtype, device=attrs.device - ).view(*attrs.shape) - start_idx = end_idx - # Update corresponding parameter in-place. Disable autograd to update. 
- param_dict[p_name].requires_grad_(False) - param_dict[p_name].copy_(new_data) - param_dict[p_name].requires_grad_(True) - return module - - -def _scipy_objective_and_grad( - x: np.ndarray, mll: MarginalLogLikelihood, property_dict: Dict[str, TorchAttr] -) -> Tuple[float, np.ndarray]: - r"""Get objective and gradient in format that scipy expects. - - Args: - x: The (flattened) input parameters. - mll: The MarginalLogLikelihood module to evaluate. - property_dict: The property dictionary required to "unflatten" the input - parameter vector, as generated by `module_to_array`. - - Returns: - 2-element tuple containing - - - The objective value. - - The gradient of the objective. - """ - warn("`_scipy_objective_and_grad` is marked for deprecation.", DeprecationWarning) - mll = set_params_with_array(mll, x, property_dict) - train_inputs, train_targets = mll.model.train_inputs, mll.model.train_targets - mll.zero_grad() - try: # catch linear algebra errors in gpytorch - output = mll.model(*train_inputs) - args = [output, train_targets] + _get_extra_mll_args(mll) - loss = -mll(*args).sum() - except RuntimeError as e: - return _handle_numerical_errors(error=e, x=x) - loss.backward() - - i = 0 - param_dict = OrderedDict(mll.named_parameters()) - grad = np.zeros(sum([tattr.shape.numel() for tattr in property_dict.values()])) - for p_name in property_dict: - t = param_dict[p_name] - size = t.numel() - if t.requires_grad and t.grad is not None: - grad[i : i + size] = t.grad.detach().view(-1).cpu().double().clone().numpy() - i += size - - mll.zero_grad() - return loss.item(), grad +# File will be removed in next commit. diff --git a/test/optim/test_fit.py b/test/optim/test_fit.py index 0c608d22a2..69775f3385 100644 --- a/test/optim/test_fit.py +++ b/test/optim/test_fit.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. import math -import re from unittest.mock import MagicMock, patch from warnings import catch_warnings @@ -235,226 +234,3 @@ def _test_fit_gpytorch_mll_torch(self, mll): mll, closure=mock_closure, closure_kwargs={"ab": "cd"} ) mock_closure.assert_called_once_with(ab="cd") - - -class TestFitGPyTorchScipy(BotorchTestCase): - def setUp(self) -> None: - super().setUp() - self.mlls = {} - with torch.random.fork_rng(): - torch.manual_seed(0) - train_X = torch.linspace(0, 1, 10).unsqueeze(-1) - train_Y = torch.sin((2 * math.pi) * train_X) - train_Y = train_Y + 0.1 * torch.randn_like(train_Y) - - model = SingleTaskGP( - train_X=train_X, - train_Y=train_Y, - input_transform=Normalize(d=1), - outcome_transform=Standardize(m=1), - ) - self.mlls[SingleTaskGP, 1] = ExactMarginalLogLikelihood(model.likelihood, model) - - def test_fit_gpytorch_scipy(self): - for mll in self.mlls.values(): - for dtype in (torch.float32, torch.float64): - self._test_fit_gpytorch_scipy(mll.to(dtype=dtype)) - - def _test_fit_gpytorch_scipy(self, mll): - options = {"disp": False, "maxiter": 3, "maxfun": 2} - ckpt = { - k: TensorCheckpoint(v.detach().clone(), v.device, v.dtype) - for k, v in mll.state_dict().items() - } - with self.subTest("main"), module_rollback_ctx(mll, checkpoint=ckpt): - with catch_warnings(record=True) as ws, debug(True): - _, info_dict = fit.fit_gpytorch_scipy( - mll, track_iterations=True, options=options - ) - - # Test only parameters requiring gradients have changed - self.assertTrue( - all( - param.equal(ckpt[name][0]) != param.requires_grad - for name, param in mll.named_parameters() - ) - ) - - # Test maxiter warning message - self.assertTrue(any("TOTAL NO. 
of" in str(w.message) for w in ws)) - self.assertTrue( - any(issubclass(w.category, OptimizationWarning) for w in ws) - ) - - # Test iteration tracking - self.assertLessEqual(len(info_dict["iterations"]), options["maxiter"]) - self.assertIsInstance(info_dict["iterations"][0], OptimizationResult) - self.assertTrue("fopt" in info_dict) - self.assertTrue("wall_time" in info_dict) - self.assertEqual(sum(1 for w in ws if "TOTAL NO. of" in str(w.message)), 1) - - # Test that user provided bounds and `exclude` argument are respected - exclude = "model.mean_module.constant", re.compile("raw_lengthscale$") - with self.subTest("bounds"), module_rollback_ctx(mll, checkpoint=ckpt): - fit.fit_gpytorch_scipy( - mll, - bounds={"likelihood.noise_covar.raw_noise": (123, 456)}, - options={**options, "exclude": exclude}, - ) - - self.assertTrue( - mll.likelihood.noise_covar.raw_noise >= 123 - and mll.likelihood.noise_covar.raw_noise <= 456 - ) - - for name, param in mll.named_parameters(): - if ( - name - in ( - "model.mean_module.constant", - "model.covar_module.base_kernel.raw_lengthscale", - ) - or not param.requires_grad - ): - self.assertTrue(param.equal(ckpt[name][0])) - else: - self.assertFalse(param.equal(ckpt[name][0])) - - # Test use of `approx_mll` flag - with self.subTest("approx_mll"), module_rollback_ctx(mll, checkpoint=ckpt): - fit.fit_gpytorch_scipy(mll, approx_mll=True, options=options) - self.assertTrue( - all( - param.equal(ckpt[name][0]) != param.requires_grad - for name, param in mll.named_parameters() - ) - ) - - # Test handling of scipy optimization failures and parameter assignments - mock_x = [] - assignments = {} - for name, param in mll.named_parameters(): - if not param.requires_grad: - continue # pragma: no cover - - values = assignments[name] = torch.rand_like(param) - mock_x.append(values.view(-1)) - - with module_rollback_ctx(mll, checkpoint=ckpt), patch.object( - fit, "minimize" - ) as mock_minimize: - mock_minimize.return_value = OptimizeResult( - x=torch.concat(mock_x).tolist(), - success=False, - status=0, - fun=float("nan"), - jac=None, - nfev=1, - njev=1, - nhev=1, - nit=1, - message="ABNORMAL_TERMINATION_IN_LNSRCH".encode(), - ) - with catch_warnings(record=True) as ws, debug(True): - fit.fit_gpytorch_scipy(mll, options=options) - - # Test that warning gets raised - self.assertTrue( - any("ABNORMAL_TERMINATION_IN_LNSRCH" in str(w.message) for w in ws) - ) - - # Test that parameter values get assigned correctly - self.assertTrue( - all( - param.equal(assignments[name]) - for name, param in mll.named_parameters() - if param.requires_grad - ) - ) - - -class TestFitGPyTorchTorch(BotorchTestCase): - def setUp(self): - super().setUp() - self.mlls = {} - with torch.random.fork_rng(): - torch.manual_seed(0) - train_X = torch.linspace(0, 1, 10).unsqueeze(-1) - train_Y = torch.sin((2 * math.pi) * train_X) - train_Y = train_Y + 0.1 * torch.randn_like(train_Y) - - model = SingleTaskGP( - train_X=train_X, - train_Y=train_Y, - input_transform=Normalize(d=1), - outcome_transform=Standardize(m=1), - ) - self.mlls[SingleTaskGP, 1] = ExactMarginalLogLikelihood(model.likelihood, model) - - def test_fit_gpytorch_torch(self): - for mll in self.mlls.values(): - for dtype in (torch.float32, torch.float64): - self._test_fit_gpytorch_torch(mll.to(dtype=dtype)) - - def _test_fit_gpytorch_torch(self, mll): - options = {"maxiter": 3} - ckpt = { - k: TensorCheckpoint(v.detach().clone(), v.device, v.dtype) - for k, v in mll.state_dict().items() - } - with self.subTest("main"), 
module_rollback_ctx(mll, checkpoint=ckpt): - with catch_warnings(record=True), debug(True): - _, info_dict = fit.fit_gpytorch_torch( - mll, track_iterations=True, options=options - ) - - # Test only parameters requiring gradients have changed - self.assertTrue( - all( - param.equal(ckpt[name][0]) != param.requires_grad - for name, param in mll.named_parameters() - ) - ) - - # Test iteration tracking - self.assertEqual(len(info_dict["iterations"]), options["maxiter"]) - self.assertIsInstance(info_dict["iterations"][0], OptimizationResult) - self.assertTrue("fopt" in info_dict) - self.assertTrue("wall_time" in info_dict) - - # Test that user provided bounds and `exclude` argument are respected - exclude = "model.mean_module.constant", re.compile("raw_lengthscale$") - with self.subTest("bounds"), module_rollback_ctx(mll, checkpoint=ckpt): - fit.fit_gpytorch_torch( - mll, - bounds={"likelihood.noise_covar.raw_noise": (123, 456)}, - options={**options, "exclude": exclude}, - ) - - self.assertTrue( - mll.likelihood.noise_covar.raw_noise >= 123 - and mll.likelihood.noise_covar.raw_noise <= 456 - ) - - for name, param in mll.named_parameters(): - if ( - name - in ( - "model.mean_module.constant", - "model.covar_module.base_kernel.raw_lengthscale", - ) - or not param.requires_grad - ): - self.assertTrue(param.equal(ckpt[name][0])) - else: - self.assertFalse(param.equal(ckpt[name][0])) - - # Test use of `approx_mll` flag - with self.subTest("approx_mll"), module_rollback_ctx(mll, checkpoint=ckpt): - fit.fit_gpytorch_torch(mll, approx_mll=True, options=options) - self.assertTrue( - all( - param.equal(ckpt[name][0]) != param.requires_grad - for name, param in mll.named_parameters() - ) - ) diff --git a/test/optim/test_numpy_converter.py b/test/optim/test_numpy_converter.py index 380ca0ec76..4b87eb9e4d 100644 --- a/test/optim/test_numpy_converter.py +++ b/test/optim/test_numpy_converter.py @@ -3,269 +3,3 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
- -from math import pi -from unittest.mock import MagicMock, patch -from warnings import catch_warnings, simplefilter - -import numpy as np -import torch -from botorch.models import SingleTaskGP -from botorch.optim import numpy_converter -from botorch.optim.numpy_converter import ( - _scipy_objective_and_grad, - module_to_array, - set_params_with_array, -) -from botorch.utils.testing import BotorchTestCase -from gpytorch.constraints import GreaterThan -from gpytorch.kernels.rbf_kernel import RBFKernel -from gpytorch.likelihoods import GaussianLikelihood -from gpytorch.means.constant_mean import ConstantMean -from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood -from gpytorch.models.exact_gp import ExactGP - - -def _get_index(property_dict, parameter_name): - idx = 0 - for p_name, ta in property_dict.items(): - if p_name == parameter_name: - break - idx += ta.shape.numel() - return idx - - -class TestModuleToArray(BotorchTestCase): - def test_basic(self): - for dtype in (torch.float, torch.double): - # get a test module - train_x = torch.tensor([[1.0, 2.0, 3.0]], device=self.device, dtype=dtype) - train_y = torch.tensor([4.0], device=self.device, dtype=dtype) - likelihood = GaussianLikelihood() - model = ExactGP(train_x, train_y, likelihood) - model.covar_module = RBFKernel(ard_num_dims=3) - model.mean_module = ConstantMean() - model.to(device=self.device, dtype=dtype) - mll = ExactMarginalLogLikelihood(likelihood, model) - # test the basic case - with catch_warnings(): - simplefilter("ignore", category=DeprecationWarning) - x, pdict, bounds = module_to_array(module=mll) - self.assertTrue(np.array_equal(x, np.zeros(5))) - expected_sizes = { - "likelihood.noise_covar.raw_noise": torch.Size([1]), - "model.covar_module.raw_lengthscale": torch.Size([1, 3]), - "model.mean_module.raw_constant": torch.Size(), - } - self.assertEqual(set(pdict.keys()), set(expected_sizes.keys())) - for pname, val in pdict.items(): - self.assertEqual(val.dtype, dtype) - self.assertEqual(val.shape, expected_sizes[pname]) - self.assertEqual(val.device.type, self.device.type) - self.assertIsNone(bounds) - - def test_exclude(self): - for dtype in (torch.float, torch.double): - # get a test module - train_x = torch.tensor([[1.0, 2.0, 3.0]], device=self.device, dtype=dtype) - train_y = torch.tensor([4.0], device=self.device, dtype=dtype) - likelihood = GaussianLikelihood() - model = ExactGP(train_x, train_y, likelihood) - model.covar_module = RBFKernel(ard_num_dims=3) - model.mean_module = ConstantMean() - model.to(device=self.device, dtype=dtype) - mll = ExactMarginalLogLikelihood(likelihood, model) - # test the basic case - with catch_warnings(): - simplefilter("ignore", category=DeprecationWarning) - x, pdict, bounds = module_to_array( - module=mll, exclude={"model.mean_module.raw_constant"} - ) - self.assertTrue(np.array_equal(x, np.zeros(4))) - expected_sizes = { - "likelihood.noise_covar.raw_noise": torch.Size([1]), - "model.covar_module.raw_lengthscale": torch.Size([1, 3]), - } - self.assertEqual(set(pdict.keys()), set(expected_sizes.keys())) - for pname, val in pdict.items(): - self.assertEqual(val.dtype, dtype) - self.assertEqual(val.shape, expected_sizes[pname]) - self.assertEqual(val.device.type, self.device.type) - self.assertIsNone(bounds) - - def test_manual_bounds(self): - for dtype in (torch.float, torch.double): - # get a test module - train_x = torch.tensor([[1.0, 2.0, 3.0]], device=self.device, dtype=dtype) - train_y = torch.tensor([4.0], device=self.device, dtype=dtype) - 
likelihood = GaussianLikelihood() - model = ExactGP(train_x, train_y, likelihood) - model.covar_module = RBFKernel(ard_num_dims=3) - model.mean_module = ConstantMean() - model.to(device=self.device, dtype=dtype) - mll = ExactMarginalLogLikelihood(likelihood, model) - # test the basic case - with catch_warnings(): - simplefilter("ignore", category=DeprecationWarning) - x, pdict, bounds = module_to_array( - module=mll, - bounds={"model.covar_module.raw_lengthscale": (0.1, None)}, - ) - self.assertTrue(np.array_equal(x, np.zeros(5))) - expected_sizes = { - "likelihood.noise_covar.raw_noise": torch.Size([1]), - "model.covar_module.raw_lengthscale": torch.Size([1, 3]), - "model.mean_module.raw_constant": torch.Size(), - } - self.assertEqual(set(pdict.keys()), set(expected_sizes.keys())) - for pname, val in pdict.items(): - self.assertEqual(val.dtype, dtype) - self.assertEqual(val.shape, expected_sizes[pname]) - self.assertEqual(val.device.type, self.device.type) - lower_exp = np.full_like(x, 0.1) - for p in ( - "likelihood.noise_covar.raw_noise", - "model.mean_module.raw_constant", - ): - lower_exp[_get_index(pdict, p)] = -np.inf - self.assertTrue(np.equal(bounds[0], lower_exp).all()) - self.assertTrue(np.equal(bounds[1], np.full_like(x, np.inf)).all()) - - with catch_warnings(): - simplefilter("ignore", category=DeprecationWarning) - x, pdict, bounds = module_to_array( - module=mll, - bounds={ - key: (-float("inf"), float("inf")) - for key, _ in mll.named_parameters() - }, - ) - self.assertIsNone(bounds) - - def test_module_bounds(self): - for dtype in (torch.float, torch.double): - # get a test module - train_x = torch.tensor([[1.0, 2.0, 3.0]], device=self.device, dtype=dtype) - train_y = torch.tensor([4.0], device=self.device, dtype=dtype) - likelihood = GaussianLikelihood( - noise_constraint=GreaterThan(1e-5, transform=None) - ) - model = ExactGP(train_x, train_y, likelihood) - model.covar_module = RBFKernel(ard_num_dims=3) - model.mean_module = ConstantMean() - model.to(device=self.device, dtype=dtype) - mll = ExactMarginalLogLikelihood(likelihood, model) - # test the basic case - with catch_warnings(): - simplefilter("ignore", category=DeprecationWarning) - x, pdict, bounds = module_to_array( - module=mll, - bounds={"model.covar_module.raw_lengthscale": (0.1, None)}, - ) - self.assertTrue(np.array_equal(x, np.zeros(5))) - expected_sizes = { - "likelihood.noise_covar.raw_noise": torch.Size([1]), - "model.covar_module.raw_lengthscale": torch.Size([1, 3]), - "model.mean_module.raw_constant": torch.Size(), - } - self.assertEqual(set(pdict.keys()), set(expected_sizes.keys())) - for pname, val in pdict.items(): - self.assertEqual(val.dtype, dtype) - self.assertEqual(val.shape, expected_sizes[pname]) - self.assertEqual(val.device.type, self.device.type) - lower_exp = np.full_like(x, 0.1) - lower_exp[_get_index(pdict, "model.mean_module.raw_constant")] = -np.inf - lower_exp[_get_index(pdict, "likelihood.noise_covar.raw_noise")] = 1e-5 - self.assertTrue(np.allclose(bounds[0], lower_exp)) - self.assertTrue(np.equal(bounds[1], np.full_like(x, np.inf)).all()) - - -class TestSetParamsWithArray(BotorchTestCase): - def test_set_parameters(self): - for dtype in (torch.float, torch.double): - # get a test module - train_x = torch.tensor([[1.0, 2.0, 3.0]], device=self.device, dtype=dtype) - train_y = torch.tensor([4.0], device=self.device, dtype=dtype) - likelihood = GaussianLikelihood() - model = ExactGP(train_x, train_y, likelihood) - model.covar_module = RBFKernel(ard_num_dims=3) - model.mean_module = 
ConstantMean() - model.to(device=self.device, dtype=dtype) - mll = ExactMarginalLogLikelihood(likelihood, model) - - with catch_warnings(): - # Get parameters - simplefilter("ignore", category=DeprecationWarning) - x, pdict, bounds = module_to_array(module=mll) - - # Set parameters - mll = set_params_with_array( - mll, np.array([1.0, 2.0, 3.0, 4.0, 5.0]), pdict - ) - z = dict(mll.named_parameters()) - self.assertTrue( - torch.equal( - z["likelihood.noise_covar.raw_noise"], - torch.tensor([1.0], device=self.device, dtype=dtype), - ) - ) - self.assertTrue( - torch.equal( - z["model.covar_module.raw_lengthscale"], - torch.tensor([[2.0, 3.0, 4.0]], device=self.device, dtype=dtype), - ) - ) - self.assertTrue( - torch.equal( - z["model.mean_module.raw_constant"], - torch.tensor(5.0, device=self.device, dtype=dtype), - ) - ) - - # Extract again - with catch_warnings(): - simplefilter("ignore", category=DeprecationWarning) - x2, pdict2, bounds2 = module_to_array(module=mll) - self.assertTrue(np.array_equal(x2, np.array([1.0, 2.0, 3.0, 4.0, 5.0]))) - - -class TestScipyObjectiveAndGrad(BotorchTestCase): - def setUp(self) -> None: - super().setUp() - with torch.random.fork_rng(): - torch.manual_seed(0) - train_X = torch.linspace(0, 1, 10).unsqueeze(-1) - train_Y = torch.sin((2 * pi) * train_X) - train_Y = train_Y + 0.1 * torch.randn_like(train_Y) - - model = SingleTaskGP(train_X=train_X, train_Y=train_Y) - self.mll = ExactMarginalLogLikelihood(model.likelihood, model) - - def test_scipy_objective_and_grad(self): - with catch_warnings(): - simplefilter("ignore", category=DeprecationWarning) - x, property_dict, bounds = module_to_array(module=self.mll) - loss, grad = _scipy_objective_and_grad(x, self.mll, property_dict) - - _dist = self.mll.model(*self.mll.model.train_inputs) - _loss = -self.mll(_dist, self.mll.model.train_targets) - _loss.sum().backward() - _grad = torch.concat( - [self.mll.get_parameter(name).grad.view(-1) for name in property_dict] - ) - self.assertEqual(loss, _loss.detach().sum().item()) - self.assertTrue(np.allclose(grad, _grad.detach().numpy())) - - def _getter(*args, **kwargs): - raise RuntimeError("foo") - - _handler = MagicMock() - - with catch_warnings(), patch.multiple( - numpy_converter, - _get_extra_mll_args=_getter, - _handle_numerical_errors=_handler, - ): - simplefilter("ignore", category=DeprecationWarning) - _scipy_objective_and_grad(x, self.mll, property_dict) - self.assertEqual(_handler.call_count, 1) diff --git a/test/optim/utils/test_model_utils.py b/test/optim/utils/test_model_utils.py index e0bbe74458..525b5ae255 100644 --- a/test/optim/utils/test_model_utils.py +++ b/test/optim/utils/test_model_utils.py @@ -124,16 +124,26 @@ def setUp(self): def test_get_parameters(self): self.assertEqual(0, len(get_parameters(self.module, requires_grad=False))) - params = get_parameters(self.module) - self.assertEqual(1, len(params)) - self.assertEqual(next(iter(params)), "noise_covar.raw_noise") - self.assertTrue( - self.module.noise_covar.raw_noise.equal(next(iter(params.values()))) - ) + # No name filter + for name_filter in [None, lambda x: "n" in x]: + with self.subTest("none filtered", name_filter=name_filter): + params = get_parameters(self.module, name_filter=name_filter) + self.assertEqual(1, len(params)) + self.assertEqual(next(iter(params)), "noise_covar.raw_noise") + self.assertTrue( + self.module.noise_covar.raw_noise.equal(next(iter(params.values()))) + ) + + with self.subTest("all params filtered"): + params_filtered = get_parameters(self.module, lambda x: "z" 
in x) + self.assertEqual(params_filtered, {}) + + params_filtered = get_parameters(self.module, lambda x: "n" in x) def test_get_parameters_and_bounds(self): param_dict, bounds_dict = get_parameters_and_bounds(self.module) - self.assertTrue(1 == len(param_dict) == len(bounds_dict)) + self.assertEqual(1, len(param_dict)) + self.assertEqual(1, len(bounds_dict)) name, bounds = next(iter(bounds_dict.items())) self.assertEqual(name, "noise_covar.raw_noise") @@ -145,14 +155,15 @@ def test_get_parameters_and_bounds(self): ) param_dict2, bounds_dict2 = get_parameters_and_bounds(mock_module) self.assertEqual(param_dict, param_dict2) - self.assertTrue(len(bounds_dict2) == 0) + self.assertEqual(len(bounds_dict2), 0) class TestGetNameFilter(BotorchTestCase): - def test_get_name_filter(self): + def test__get_name_filter__raises_on_invalid_pattern_type(self) -> None: with self.assertRaisesRegex(TypeError, "Expected `patterns` to contain"): get_name_filter(("foo", re.compile("bar"), 1)) + def test__get_name_filter(self) -> None: names = ascii_lowercase name_filter = get_name_filter(iter(names[1::2])) self.assertEqual(names[::2], "".join(filter(name_filter, names))) @@ -160,6 +171,9 @@ def test_get_name_filter(self): items = tuple(zip(names, range(len(names)))) self.assertEqual(items[::2], tuple(filter(name_filter, items))) + self.assertTrue(name_filter("a")) + self.assertFalse(name_filter("b")) + class TestSampleAllPriors(BotorchTestCase): def test_sample_all_priors(self):
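
Migration note: the deprecation warnings carried by the removed helpers point to the retained replacements (`fit_gpytorch_mll_scipy`/`scipy_minimize`, `fit_gpytorch_mll_torch`/`torch_minimize`, and the parameter utilities `get_parameters_and_bounds`, `get_parameters_as_ndarray_1d`, `set_parameters_from_ndarray_1d`). Below is a minimal sketch of that replacement workflow, not part of the patch; it assumes these helpers keep their current import paths and signatures, and the toy model mirrors the setup used in the deleted tests.

```python
# Sketch of the workflow named by the removed deprecation warnings.
# Import paths/signatures are assumed to match the current botorch.optim
# modules; the toy model mirrors the setup from the deleted tests.
import math

import torch
from botorch.models import SingleTaskGP
from botorch.optim.fit import fit_gpytorch_mll_scipy, fit_gpytorch_mll_torch
from botorch.optim.utils import (
    get_parameters_and_bounds,
    get_parameters_as_ndarray_1d,
    set_parameters_from_ndarray_1d,
)
from gpytorch.mlls import ExactMarginalLogLikelihood

train_X = torch.linspace(0, 1, 10).unsqueeze(-1)
train_Y = torch.sin(2 * math.pi * train_X) + 0.1 * torch.randn_like(train_X)

model = SingleTaskGP(train_X=train_X, train_Y=train_Y)
mll = ExactMarginalLogLikelihood(model.likelihood, model).train()

# Replaces fit_gpytorch_scipy: optimizes the MLL parameters in place via scipy.
result = fit_gpytorch_mll_scipy(mll)
print(result.fval, result.runtime)

# Replaces fit_gpytorch_torch: first-order fitting with a torch optimizer.
# result = fit_gpytorch_mll_torch(mll)

# Replaces module_to_array / set_params_with_array for numpy round-trips.
param_dict, bounds_dict = get_parameters_and_bounds(mll)
x = get_parameters_as_ndarray_1d(mll)
set_parameters_from_ndarray_1d(mll, x)
```

For most user code the high-level `fit_gpytorch_mll` entry point in `botorch.fit` remains the recommended front door; the helpers above are the lower-level pieces it dispatches to.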