diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index c251e89ef9..d64e4be310 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -6,6 +6,7 @@ ### Maintenance +- All occurrences of `sd` as a parameter name have been renamed to `sigma`. `sd` will continue to function for backwards compatibility. - Made `BrokenPipeError` for parallel sampling more verbose on Windows. - Added the `broadcast_distribution_samples` function that helps broadcasting arrays of drawn samples, taking into account the requested `size` and the inferred distribution shape. This sometimes is needed by distributions that call several `rvs` separately within their `random` method, such as the `ZeroInflatedPoisson` (Fix issue #3310). - The `Wald`, `Kumaraswamy`, `LogNormal`, `Pareto`, `Cauchy`, `HalfCauchy`, `Weibull` and `ExGaussian` distributions `random` method used a hidden `_random` function that was written with scalars in mind. This could potentially lead to artificial correlations between random draws. Added shape guards and broadcasting of the distribution samples to prevent this (Similar to issue #3310). diff --git a/docs/source/Advanced_usage_of_Theano_in_PyMC3.rst b/docs/source/Advanced_usage_of_Theano_in_PyMC3.rst index 109faf0485..7b74e00fcd 100644 --- a/docs/source/Advanced_usage_of_Theano_in_PyMC3.rst +++ b/docs/source/Advanced_usage_of_Theano_in_PyMC3.rst @@ -32,7 +32,7 @@ be time consuming if the number of datasets is large):: data = theano.shared(observed_data[0]) pm.Model() as model: mu = pm.Normal('mu', 0, 10) - pm.Normal('y', mu=mu, sd=1, observed=data) + pm.Normal('y', mu=mu, sigma=1, observed=data) # Generate one trace for each dataset traces = [] @@ -53,7 +53,7 @@ variable for our observations:: x_shared = theano.shared(x) with pm.Model() as model: - coeff = pm.Normal('x', mu=0, sd=1) + coeff = pm.Normal('x', mu=0, sigma=1) logistic = pm.math.sigmoid(coeff * x_shared) pm.Bernoulli('obs', p=logistic, observed=y) @@ -210,8 +210,8 @@ We can now define our model using this new op:: tt_mu_from_theta = MuFromTheta() with pm.Model() as model: - theta = pm.HalfNormal('theta', sd=1) + theta = pm.HalfNormal('theta', sigma=1) mu = pm.Deterministic('mu', tt_mu_from_theta(theta)) - pm.Normal('y', mu=mu, sd=0.1, observed=[0.2, 0.21, 0.3]) + pm.Normal('y', mu=mu, sigma=0.1, observed=[0.2, 0.21, 0.3]) trace = pm.sample() diff --git a/docs/source/Probability_Distributions.rst b/docs/source/Probability_Distributions.rst index bee53da8b5..2a4667db4e 100644 --- a/docs/source/Probability_Distributions.rst +++ b/docs/source/Probability_Distributions.rst @@ -12,7 +12,7 @@ For example, if we wish to define a particular variable as having a normal prior with pm.Model(): - x = pm.Normal('x', mu=0, sd=1) + x = pm.Normal('x', mu=0, sigma=1) A variable requires at least a ``name`` argument, and zero or more model parameters, depending on the distribution. Parameter names vary by distribution, using conventional names wherever possible. The example above defines a scalar variable. To make a vector-valued variable, a ``shape`` argument should be provided; for example, a 3x3 matrix of beta random variables could be defined with: diff --git a/docs/source/PyMC3_and_Theano.rst b/docs/source/PyMC3_and_Theano.rst index 6fc2be21c7..90f3906edb 100644 --- a/docs/source/PyMC3_and_Theano.rst +++ b/docs/source/PyMC3_and_Theano.rst @@ -134,8 +134,8 @@ happens if we define a PyMC3 model. 
Let's look at a simple example:: data = true_mu + np.random.randn(50) with pm.Model() as model: - mu = pm.Normal('mu', mu=0, sd=1) - y = pm.Normal('y', mu=mu, sd=1, observed=data) + mu = pm.Normal('mu', mu=0, sigma=1) + y = pm.Normal('y', mu=mu, sigma=1, observed=data) In this model we define two variables: `mu` and `y`. The first is a free variable that we want to infer, the second is an observed @@ -184,7 +184,7 @@ example:: with pm.Model() as model: mu = pm.Normal('mu', 0, 1) sd = pm.HalfNormal('sd', 1) - y = pm.Normal('y', mu=mu, sd=sd, observed=data) + y = pm.Normal('y', mu=mu, sigma=sd, observed=data) is roughly equivalent to this:: @@ -213,4 +213,4 @@ theano operation on them:: beta = pm.Normal('beta', 0, 1, shape=len(design_matrix)) predict = tt.dot(design_matrix, beta) sd = pm.HalfCauchy('sd', beta=2.5) - pm.Normal('y', mu=predict, sd=sd, observed=data) + pm.Normal('y', mu=predict, sigma=sd, observed=data) diff --git a/docs/source/api/bounds.rst b/docs/source/api/bounds.rst index 42c1ae0895..0df043a0c9 100644 --- a/docs/source/api/bounds.rst +++ b/docs/source/api/bounds.rst @@ -39,27 +39,27 @@ specification of a bounded distribution should go within the model block:: with pm.Model() as model: BoundedNormal = pm.Bound(pm.Normal, lower=0.0) - x = BoundedNormal('x', mu=1.0, sd=3.0) + x = BoundedNormal('x', mu=1.0, sigma=3.0) If the bound will be applied to a single variable in the model, it may be cleaner notationally to define both the bound and variable together. :: with model: - x = pm.Bound(pm.Normal, lower=0.0)('x', mu=1.0, sd=3.0) + x = pm.Bound(pm.Normal, lower=0.0)('x', mu=1.0, sigma=3.0) However, it is possible to create multiple different random variables that have the same bound applied to them:: with model: BoundNormal = pm.Bound(pm.Normal, lower=0.0) - hyper_mu = BoundNormal("hyper_mu", mu=1, sd=0.5) - mu = BoundNormal("mu", mu=hyper_mu, sd=1) + hyper_mu = BoundNormal("hyper_mu", mu=1, sigma=0.5) + mu = BoundNormal("mu", mu=hyper_mu, sigma=1) Bounds can also be applied to a vector of random variables. With the same ``BoundedNormal`` object we created previously we can write:: with model: - x_vector = BoundedNormal('x_vector', mu=1.0, sd=3.0, shape=3) + x_vector = BoundedNormal('x_vector', mu=1.0, sigma=3.0, shape=3) Caveats ####### diff --git a/docs/source/developer_guide.rst b/docs/source/developer_guide.rst index f8cfb3ea84..4261e3faa3 100644 --- a/docs/source/developer_guide.rst +++ b/docs/source/developer_guide.rst @@ -147,8 +147,8 @@ explicit about the conversion. For example: .. code:: python with pm.Model() as model: - z = pm.Normal('z', mu=0., sd=5.) # ==> pymc3.model.FreeRV, or theano.tensor with logp - x = pm.Normal('x', mu=z, sd=1., observed=5.) # ==> pymc3.model.ObservedRV, also has logp properties + z = pm.Normal('z', mu=0., sigma=5.) # ==> pymc3.model.FreeRV, or theano.tensor with logp + x = pm.Normal('x', mu=z, sigma=1., observed=5.) # ==> pymc3.model.ObservedRV, also has logp properties x.logp({'z': 2.5}) # ==> -4.0439386 model.logp({'z': 2.5}) # ==> -6.6973152 @@ -308,7 +308,7 @@ a model: .. code:: python with pm.Model() as m: - x = pm.Normal('x', mu=0., sd=1.) + x = pm.Normal('x', mu=0., sigma=1.) Which is the same as doing: @@ -317,7 +317,7 @@ Which is the same as doing: .. 
code:: python m = pm.Model() - x = m.Var('x', pm.Normal.dist(mu=0., sd=1.)) + x = m.Var('x', pm.Normal.dist(mu=0., sigma=1.)) Both with the same output: @@ -457,7 +457,7 @@ transformation pymc3.distributions.transforms.Log @@ -1051,14 +1051,14 @@ we get error (even worse, wrong answer with silent error): with pm.Model() as m: mu = pm.Normal('mu', 0., 1., shape=(5, 1)) sd = pm.HalfNormal('sd', 5., shape=(1, 10)) - pm.Normal('x', mu=mu, sd=sd, observed=np.random.randn(2, 5, 10)) + pm.Normal('x', mu=mu, sigma=sd, observed=np.random.randn(2, 5, 10)) trace = pm.sample_prior_predictive(100) trace['x'].shape # ==> should be (100, 2, 5, 10), but get (100, 5, 10) .. code:: python - pm.Normal.dist(mu=np.zeros(2), sd=1).random(size=(10, 4)) # ==> ERROR + pm.Normal.dist(mu=np.zeros(2), sigma=1).random(size=(10, 4)) # ==> ERROR There are also other error related random sample generation (e.g., `Mixture is currently diff --git a/docs/source/history.rst b/docs/source/history.rst index 1f124cf6c8..51366a8541 100644 --- a/docs/source/history.rst +++ b/docs/source/history.rst @@ -109,8 +109,8 @@ Models are defined using a context manager (``with`` statement). The model is sp with Model() as bioassay_model: # Prior distributions for latent variables - alpha = Normal('alpha', 0, sd=100) - beta = Normal('beta', 0, sd=100) + alpha = Normal('alpha', 0, sigma=100) + beta = Normal('beta', 0, sigma=100) # Linear combinations of parameters theta = invlogit(alpha + beta*dose) diff --git a/docs/source/index.rst b/docs/source/index.rst index bcb0a936ae..b62aabbc06 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,11 +23,11 @@ X, y = linear_training_data() with pm.Model() as linear_model: - weights = pm.Normal('weights', mu=0, sd=1) + weights = pm.Normal('weights', mu=0, sigma=1) noise = pm.Gamma('noise', alpha=2, beta=1) y_observed = pm.Normal('y_observed', mu=X.dot(weights), - sd=noise, + sigma=noise, observed=y) prior = pm.sample_prior_predictive() diff --git a/docs/source/notebooks/AR.ipynb b/docs/source/notebooks/AR.ipynb index fc9e48b796..8f084c4b0b 100644 --- a/docs/source/notebooks/AR.ipynb +++ b/docs/source/notebooks/AR.ipynb @@ -177,8 +177,8 @@ "source": [ "tau = 1.0\n", "with pm.Model() as ar1:\n", - " beta = pm.Normal('beta', mu=0, sd=tau)\n", - " data = pm.AR('y', beta, sd=1.0, observed=y)\n", + " beta = pm.Normal('beta', mu=0, sigma=tau)\n", + " data = pm.AR('y', beta, sigma=1.0, observed=y)\n", " trace = pm.sample(1000, cores=4)\n", " \n", "pm.traceplot(trace);" @@ -303,8 +303,8 @@ ], "source": [ "with pm.Model() as ar2:\n", - " beta = pm.Normal('beta', mu=0, sd=tau, shape=2)\n", - " data = pm.AR('y', beta, sd=1.0, observed=y)\n", + " beta = pm.Normal('beta', mu=0, sigma=tau, shape=2)\n", + " data = pm.AR('y', beta, sigma=1.0, observed=y)\n", " trace = pm.sample(1000, cores=4)\n", " \n", "pm.traceplot(trace);" @@ -362,9 +362,9 @@ ], "source": [ "with pm.Model() as ar2:\n", - " beta = pm.Normal('beta', mu=0, sd=tau)\n", + " beta = pm.Normal('beta', mu=0, sigma=tau)\n", " beta2 = pm.Uniform('beta2')\n", - " data = pm.AR('y', [beta, beta2], sd=1.0, observed=y)\n", + " data = pm.AR('y', [beta, beta2], sigma=1.0, observed=y)\n", " trace = pm.sample(1000, tune=1000, cores=4)\n", "\n", "pm.traceplot(trace);" diff --git a/docs/source/notebooks/BEST.ipynb b/docs/source/notebooks/BEST.ipynb index b325236ae0..cbf855fde7 100644 --- a/docs/source/notebooks/BEST.ipynb +++ b/docs/source/notebooks/BEST.ipynb @@ -128,8 +128,8 @@ "μ_s = y.value.std() * 2\n", "\n", "with pm.Model() as model:\n", - " 
group1_mean = pm.Normal('group1_mean', μ_m, sd=μ_s)\n", - " group2_mean = pm.Normal('group2_mean', μ_m, sd=μ_s)" + " group1_mean = pm.Normal('group1_mean', μ_m, sigma=μ_s)\n", + " group2_mean = pm.Normal('group2_mean', μ_m, sigma=μ_s)" ] }, { diff --git a/docs/source/notebooks/Diagnosing_biased_Inference_with_Divergences.ipynb b/docs/source/notebooks/Diagnosing_biased_Inference_with_Divergences.ipynb index cf6138dd88..0133b81f5d 100644 --- a/docs/source/notebooks/Diagnosing_biased_Inference_with_Divergences.ipynb +++ b/docs/source/notebooks/Diagnosing_biased_Inference_with_Divergences.ipynb @@ -147,10 +147,10 @@ "outputs": [], "source": [ "with pm.Model() as Centered_eight:\n", - " mu = pm.Normal('mu', mu=0, sd=5)\n", + " mu = pm.Normal('mu', mu=0, sigma=5)\n", " tau = pm.HalfCauchy('tau', beta=5)\n", - " theta = pm.Normal('theta', mu=mu, sd=tau, shape=J)\n", - " obs = pm.Normal('obs', mu=theta, sd=sigma, observed=y)" + " theta = pm.Normal('theta', mu=mu, sigma=tau, shape=J)\n", + " obs = pm.Normal('obs', mu=theta, sigma=sigma, observed=y)" ] }, { @@ -1321,11 +1321,11 @@ "outputs": [], "source": [ "with pm.Model() as NonCentered_eight:\n", - " mu = pm.Normal('mu', mu=0, sd=5)\n", + " mu = pm.Normal('mu', mu=0, sigma=5)\n", " tau = pm.HalfCauchy('tau', beta=5)\n", - " theta_tilde = pm.Normal('theta_t', mu=0, sd=1, shape=J)\n", + " theta_tilde = pm.Normal('theta_t', mu=0, sigma=1, shape=J)\n", " theta = pm.Deterministic('theta', mu + tau * theta_tilde)\n", - " obs = pm.Normal('obs', mu=theta, sd=sigma, observed=y)" + " obs = pm.Normal('obs', mu=theta, sigma=sigma, observed=y)" ] }, { diff --git a/docs/source/notebooks/Euler-Maruyama_and_SDEs.ipynb b/docs/source/notebooks/Euler-Maruyama_and_SDEs.ipynb index bf5f4e71ce..e2cd49ec02 100644 --- a/docs/source/notebooks/Euler-Maruyama_and_SDEs.ipynb +++ b/docs/source/notebooks/Euler-Maruyama_and_SDEs.ipynb @@ -249,7 +249,7 @@ " xh = EulerMaruyama('xh', dt, lin_sde, (lam, ), shape=N, testval=x_t)\n", " \n", " # predicted observation\n", - " zh = pm.Normal('zh', mu=xh, sd=5e-3, observed=z_t)" + " zh = pm.Normal('zh', mu=xh, sigma=5e-3, observed=z_t)" ] }, { @@ -629,7 +629,7 @@ " ah = pm.Uniform('ah', lower=0.5, upper=1.5)\n", " mh = pm.Uniform('mh', lower=0.0, upper=1.0)\n", " xyh = EulerMaruyama('xyh', dt, osc_sde, (τh, ah), shape=xys.shape, testval=xys)\n", - " zh = pm.Normal('zh', mu=mh * xyh[:, 0] + (1 - mh) * xyh[:, 1], sd=0.1, observed=zs)" + " zh = pm.Normal('zh', mu=mh * xyh[:, 0] + (1 - mh) * xyh[:, 1], sigma=0.1, observed=zs)" ] }, { diff --git a/docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb b/docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb index 83afce39f7..f58be21d09 100644 --- a/docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb +++ b/docs/source/notebooks/GLM-hierarchical-advi-minibatch.ipynb @@ -70,9 +70,9 @@ "source": [ "with pm.Model() as hierarchical_model:\n", " # Hyperpriors for group nodes\n", - " mu_a = pm.Normal('mu_alpha', mu=0., sd=100**2)\n", + " mu_a = pm.Normal('mu_alpha', mu=0., sigma=100**2)\n", " sigma_a = pm.Uniform('sigma_alpha', lower=0, upper=100)\n", - " mu_b = pm.Normal('mu_beta', mu=0., sd=100**2)\n", + " mu_b = pm.Normal('mu_beta', mu=0., sigma=100**2)\n", " sigma_b = pm.Uniform('sigma_beta', lower=0, upper=100)" ] }, @@ -93,9 +93,9 @@ "source": [ "with hierarchical_model:\n", " \n", - " a = pm.Normal('alpha', mu=mu_a, sd=sigma_a, shape=n_counties)\n", + " a = pm.Normal('alpha', mu=mu_a, sigma=sigma_a, shape=n_counties)\n", " # Intercept for each county, distributed around 
group mean mu_a\n", - " b = pm.Normal('beta', mu=mu_b, sd=sigma_b, shape=n_counties)" + " b = pm.Normal('beta', mu=mu_b, sigma=sigma_b, shape=n_counties)" ] }, { @@ -139,7 +139,7 @@ " eps = pm.Uniform('eps', lower=0, upper=100) \n", " \n", " # Data likelihood\n", - " radon_like = pm.Normal('radon_like', mu=radon_est, sd=eps, observed=log_radon_t, total_size=len(data))" + " radon_like = pm.Normal('radon_like', mu=radon_est, sigma=eps, observed=log_radon_t, total_size=len(data))" ] }, { @@ -238,13 +238,13 @@ "# Inference button (TM)!\n", "with pm.Model():\n", "\n", - " mu_a = pm.Normal('mu_alpha', mu=0., sd=100**2)\n", + " mu_a = pm.Normal('mu_alpha', mu=0., sigma=100**2)\n", " sigma_a = pm.Uniform('sigma_alpha', lower=0, upper=100)\n", - " mu_b = pm.Normal('mu_beta', mu=0., sd=100**2)\n", + " mu_b = pm.Normal('mu_beta', mu=0., sigma=100**2)\n", " sigma_b = pm.Uniform('sigma_beta', lower=0, upper=100)\n", " \n", - " a = pm.Normal('alpha', mu=mu_a, sd=sigma_a, shape=n_counties)\n", - " b = pm.Normal('beta', mu=mu_b, sd=sigma_b, shape=n_counties)\n", + " a = pm.Normal('alpha', mu=mu_a, sigma=sigma_a, shape=n_counties)\n", + " b = pm.Normal('beta', mu=mu_b, sigma=sigma_b, shape=n_counties)\n", " \n", " # Model error\n", " eps = pm.Uniform('eps', lower=0, upper=100)\n", @@ -252,7 +252,7 @@ " radon_est = a[county_idx] + b[county_idx] * data.floor.values\n", " \n", " radon_like = pm.Normal(\n", - " 'radon_like', mu=radon_est, sd=eps, observed=data.log_radon.values)\n", + " 'radon_like', mu=radon_est, sigma=eps, observed=data.log_radon.values)\n", " \n", " step = pm.NUTS(scaling=approx.cov.eval(), is_cov=True)\n", " hierarchical_trace = pm.sample(2000, step, start=approx.sample()[0], progressbar=True)" diff --git a/docs/source/notebooks/GLM-hierarchical.ipynb b/docs/source/notebooks/GLM-hierarchical.ipynb index b3582a8221..e37c495ec4 100644 --- a/docs/source/notebooks/GLM-hierarchical.ipynb +++ b/docs/source/notebooks/GLM-hierarchical.ipynb @@ -220,8 +220,8 @@ "with pm.Model() as unpooled_model:\n", " \n", " # Independent parameters for each county\n", - " a = pm.Normal('a', 0, sd=100, shape=n_counties)\n", - " b = pm.Normal('b', 0, sd=100, shape=n_counties)\n", + " a = pm.Normal('a', 0, sigma=100, shape=n_counties)\n", + " b = pm.Normal('b', 0, sigma=100, shape=n_counties)\n", " \n", " # Model error\n", " eps = pm.HalfCauchy('eps', 5)\n", @@ -233,7 +233,7 @@ " radon_est = a[county_idx] + b[county_idx]*data.floor.values\n", " \n", " # Data likelihood\n", - " y = pm.Normal('y', radon_est, sd=eps, observed=data.log_radon)\n", + " y = pm.Normal('y', radon_est, sigma=eps, observed=data.log_radon)\n", " " ] }, @@ -283,18 +283,18 @@ "source": [ "with pm.Model() as hierarchical_model:\n", " # Hyperpriors for group nodes\n", - " mu_a = pm.Normal('mu_a', mu=0., sd=100**2)\n", + " mu_a = pm.Normal('mu_a', mu=0., sigma=100**2)\n", " sigma_a = pm.HalfCauchy('sigma_a', 5)\n", - " mu_b = pm.Normal('mu_b', mu=0., sd=100**2)\n", + " mu_b = pm.Normal('mu_b', mu=0., sigma=100**2)\n", " sigma_b = pm.HalfCauchy('sigma_b', 5)\n", " \n", " # Intercept for each county, distributed around group mean mu_a\n", " # Above we just set mu and sd to a fixed value while here we\n", " # plug in a common group distribution for all a and b (which are\n", " # vectors of length n_counties).\n", - " a = pm.Normal('a', mu=mu_a, sd=sigma_a, shape=n_counties)\n", + " a = pm.Normal('a', mu=mu_a, sigma=sigma_a, shape=n_counties)\n", " # Intercept for each county, distributed around group mean mu_a\n", - " b = pm.Normal('b', mu=mu_b, 
sd=sigma_b, shape=n_counties)\n", + " b = pm.Normal('b', mu=mu_b, sigma=sigma_b, shape=n_counties)\n", " \n", " # Model error\n", " eps = pm.HalfCauchy('eps', 5)\n", @@ -302,7 +302,7 @@ " radon_est = a[county_idx] + b[county_idx] * data.floor.values\n", " \n", " # Data likelihood\n", - " radon_like = pm.Normal('radon_like', mu=radon_est, sd=eps, observed=data.log_radon)" + " radon_like = pm.Normal('radon_like', mu=radon_est, sigma=eps, observed=data.log_radon)" ] }, { diff --git a/docs/source/notebooks/GLM-linear.ipynb b/docs/source/notebooks/GLM-linear.ipynb index 25e9f0f93d..9b7a4bbd1f 100644 --- a/docs/source/notebooks/GLM-linear.ipynb +++ b/docs/source/notebooks/GLM-linear.ipynb @@ -190,12 +190,12 @@ "with Model() as model: # model specifications in PyMC3 are wrapped in a with-statement\n", " # Define priors\n", " sigma = HalfCauchy('sigma', beta=10, testval=1.)\n", - " intercept = Normal('Intercept', 0, sd=20)\n", - " x_coeff = Normal('x', 0, sd=20)\n", + " intercept = Normal('Intercept', 0, sigma=20)\n", + " x_coeff = Normal('x', 0, sigma=20)\n", " \n", " # Define likelihood\n", " likelihood = Normal('y', mu=intercept + x_coeff * x, \n", - " sd=sigma, observed=y)\n", + " sigma=sigma, observed=y)\n", " \n", " # Inference!\n", " trace = sample(3000, cores=2) # draw 3000 posterior samples using NUTS sampling" diff --git a/docs/source/notebooks/GLM-model-selection.ipynb b/docs/source/notebooks/GLM-model-selection.ipynb index e1a2caa48c..720cce2734 100644 --- a/docs/source/notebooks/GLM-model-selection.ipynb +++ b/docs/source/notebooks/GLM-model-selection.ipynb @@ -188,7 +188,7 @@ "\n", " print('\\nRunning: {}'.format(nm))\n", " pm.glm.GLM.from_formula(fml, df,\n", - " priors={'Intercept':pm.Normal.dist(mu=0, sd=100)},\n", + " priors={'Intercept':pm.Normal.dist(mu=0, sigma=100)},\n", " family=pm.glm.families.Normal())\n", "\n", " traces[nm] = pm.sample(2000)\n", @@ -455,15 +455,15 @@ "source": [ "with pm.Model() as mdl_ols: \n", " ## define Normal priors to give Ridge regression\n", - " b0 = pm.Normal('b0', mu=0, sd=100)\n", - " b1 = pm.Normal('b1', mu=0, sd=100)\n", + " b0 = pm.Normal('b0', mu=0, sigma=100)\n", + " b1 = pm.Normal('b1', mu=0, sigma=100)\n", " \n", " ## define Linear model\n", " yest = b0 + b1 * df_lin['x']\n", "\n", " ## define Normal likelihood with HalfCauchy noise (fat tails, equiv to HalfT 1DoF)\n", " sigma_y = pm.HalfCauchy('sigma_y', beta=10)\n", - " likelihood = pm.Normal('likelihood', mu=yest, sd=sigma_y, observed=df_lin['y'])\n", + " likelihood = pm.Normal('likelihood', mu=yest, sigma=sigma_y, observed=df_lin['y'])\n", "\n", " traces_ols = pm.sample(2000)" ] diff --git a/docs/source/notebooks/GLM-poisson-regression.ipynb b/docs/source/notebooks/GLM-poisson-regression.ipynb index b58fbebabf..335a2a5acb 100644 --- a/docs/source/notebooks/GLM-poisson-regression.ipynb +++ b/docs/source/notebooks/GLM-poisson-regression.ipynb @@ -564,10 +564,10 @@ "with pm.Model() as mdl_fish:\n", " \n", " # define priors, weakly informative Normal\n", - " b0 = pm.Normal('b0_intercept', mu=0, sd=10)\n", - " b1 = pm.Normal('b1_alcohol[T.True]', mu=0, sd=10)\n", - " b2 = pm.Normal('b2_nomeds[T.True]', mu=0, sd=10)\n", - " b3 = pm.Normal('b3_alcohol[T.True]:nomeds[T.True]', mu=0, sd=10)\n", + " b0 = pm.Normal('b0_intercept', mu=0, sigma=10)\n", + " b1 = pm.Normal('b1_alcohol[T.True]', mu=0, sigma=10)\n", + " b2 = pm.Normal('b2_nomeds[T.True]', mu=0, sigma=10)\n", + " b3 = pm.Normal('b3_alcohol[T.True]:nomeds[T.True]', mu=0, sigma=10)\n", " \n", " # define linear model and exp link 
function\n", " theta = (b0 +\n", diff --git a/docs/source/notebooks/GLM-robust-with-outlier-detection.ipynb b/docs/source/notebooks/GLM-robust-with-outlier-detection.ipynb index 2e4cd74460..aa7184343b 100644 --- a/docs/source/notebooks/GLM-robust-with-outlier-detection.ipynb +++ b/docs/source/notebooks/GLM-robust-with-outlier-detection.ipynb @@ -238,8 +238,8 @@ "with pm.Model() as mdl_ols:\n", " \n", " ## Define weakly informative Normal priors to give Ridge regression\n", - " b0 = pm.Normal('b0_intercept', mu=0, sd=1)\n", - " b1 = pm.Normal('b1_slope', mu=0, sd=1)\n", + " b0 = pm.Normal('b0_intercept', mu=0, sigma=1)\n", + " b1 = pm.Normal('b1_slope', mu=0, sigma=1)\n", " \n", " ## Define linear model\n", " yest = b0 + b1 * dfhoggs['x']\n", @@ -249,7 +249,7 @@ " dtype=theano.config.floatX), name='sigma_y')\n", "\n", " ## Define Normal likelihood\n", - " likelihood = pm.Normal('likelihood', mu=yest, sd=sigma_y, observed=dfhoggs['y'])" + " likelihood = pm.Normal('likelihood', mu=yest, sigma=sigma_y, observed=dfhoggs['y'])" ] }, { @@ -351,8 +351,8 @@ "with pm.Model() as mdl_studentt:\n", " \n", " ## Define weakly informative Normal priors to give Ridge regression\n", - " b0 = pm.Normal('b0_intercept', mu=0, sd=1)\n", - " b1 = pm.Normal('b1_slope', mu=0, sd=1)\n", + " b0 = pm.Normal('b0_intercept', mu=0, sigma=1)\n", + " b1 = pm.Normal('b1_slope', mu=0, sigma=1)\n", " \n", " ## Define linear model\n", " yest = b0 + b1 * dfhoggs['x']\n", @@ -365,7 +365,7 @@ " nu = pm.Uniform('nu', lower=1, upper=100)\n", "\n", " ## Define Student T likelihood\n", - " likelihood = pm.StudentT('likelihood', mu=yest, sd=sigma_y, nu=nu,\n", + " likelihood = pm.StudentT('likelihood', mu=yest, sigma=sigma_y, nu=nu,\n", " observed=dfhoggs['y'])\n" ] }, @@ -486,15 +486,15 @@ "with pm.Model() as mdl_signoise:\n", " \n", " ## Define informative Normal priors to give Ridge regression\n", - " b0 = pm.Normal('b0_intercept', mu=0, sd=1, testval=pm.floatX(0.1))\n", - " b1 = pm.Normal('b1_slope', mu=0, sd=1, testval=pm.floatX(1.))\n", + " b0 = pm.Normal('b0_intercept', mu=0, sigma=1, testval=pm.floatX(0.1))\n", + " b1 = pm.Normal('b1_slope', mu=0, sigma=1, testval=pm.floatX(1.))\n", " \n", " ## Define linear model\n", " yest_in = b0 + b1 * dfhoggs['x']\n", "\n", " ## Define weakly informative priors for the mean and variance of outliers\n", - " yest_out = pm.Normal('yest_out', mu=0, sd=10, testval=pm.floatX(1.))\n", - " sigma_y_out = pm.HalfNormal('sigma_y_out', sd=10, testval=pm.floatX(1.))\n", + " yest_out = pm.Normal('yest_out', mu=0, sigma=10, testval=pm.floatX(1.))\n", + " sigma_y_out = pm.HalfNormal('sigma_y_out', sigma=10, testval=pm.floatX(1.))\n", "\n", " ## Define Bernoulli inlier / outlier flags according to a hyperprior \n", " ## fraction of outliers, itself constrained to [0, .5] for symmetry\n", @@ -507,8 +507,8 @@ " sigma_y_in = np.asarray(dfhoggs['sigma_y'], dtype=theano.config.floatX)\n", " \n", " # Set up normal distributions that give us the logp for both distributions\n", - " inliers = pm.Normal.dist(mu=yest_in, sd=sigma_y_in).logp(yobs)\n", - " outliers = pm.Normal.dist(mu=yest_out, sd=sigma_y_in + sigma_y_out).logp(yobs)\n", + " inliers = pm.Normal.dist(mu=yest_in, sigma=sigma_y_in).logp(yobs)\n", + " outliers = pm.Normal.dist(mu=yest_out, sigma=sigma_y_in + sigma_y_out).logp(yobs)\n", " # Build custom likelihood, a potential will just be added to the logp and can thus function\n", " # like a likelihood that we would add with the observed kwarg.\n", " pm.Potential('obs', ((1 - is_outlier) * 
inliers).sum() + (is_outlier * outliers).sum())" diff --git a/docs/source/notebooks/GLM-robust.ipynb b/docs/source/notebooks/GLM-robust.ipynb index 18fbec26fa..df77caa275 100644 --- a/docs/source/notebooks/GLM-robust.ipynb +++ b/docs/source/notebooks/GLM-robust.ipynb @@ -209,7 +209,7 @@ } ], "source": [ - "normal_dist = pm.Normal.dist(mu=0, sd=1)\n", + "normal_dist = pm.Normal.dist(mu=0, sigma=1)\n", "t_dist = pm.StudentT.dist(mu=0, lam=1, nu=1)\n", "x_eval = np.linspace(-8, 8, 300)\n", "plt.plot(x_eval, theano.tensor.exp(normal_dist.logp(x_eval)).eval(), label='Normal', lw=2.)\n", diff --git a/docs/source/notebooks/GLM-rolling-regression.ipynb b/docs/source/notebooks/GLM-rolling-regression.ipynb index 3edf4de19e..bc0ae430b1 100644 --- a/docs/source/notebooks/GLM-rolling-regression.ipynb +++ b/docs/source/notebooks/GLM-rolling-regression.ipynb @@ -256,9 +256,9 @@ " sigma_alpha = pm.Exponential('sigma_alpha', 50.)\n", " sigma_beta = pm.Exponential('sigma_beta', 50.)\n", " \n", - " alpha = pm.GaussianRandomWalk('alpha', sd=sigma_alpha, \n", + " alpha = pm.GaussianRandomWalk('alpha', sigma=sigma_alpha, \n", " shape=len(prices))\n", - " beta = pm.GaussianRandomWalk('beta', sd=sigma_beta, \n", + " beta = pm.GaussianRandomWalk('beta', sigma=sigma_beta, \n", " shape=len(prices)) " ] }, @@ -282,10 +282,10 @@ " regression = alpha + beta * prices_zscored.GFI\n", " \n", " # Assume prices are Normally distributed, the mean comes from the regression.\n", - " sd = pm.HalfNormal('sd', sd=.1)\n", + " sd = pm.HalfNormal('sd', sigma=.1)\n", " likelihood = pm.Normal('y', \n", " mu=regression, \n", - " sd=sd, \n", + " sigma=sd, \n", " observed=prices_zscored.GLD)" ] }, diff --git a/docs/source/notebooks/GLM.ipynb b/docs/source/notebooks/GLM.ipynb index 57fd2a38e0..244e0bf1e3 100644 --- a/docs/source/notebooks/GLM.ipynb +++ b/docs/source/notebooks/GLM.ipynb @@ -86,7 +86,7 @@ "with Model() as model:\n", " lm = glm.LinearComponent.from_formula('y ~ x', data)\n", " sigma = Uniform('sigma', 0, 20)\n", - " y_obs = Normal('y_obs', mu=lm.y_est, sd=sigma, observed=y)\n", + " y_obs = Normal('y_obs', mu=lm.y_est, sigma=sigma, observed=y)\n", " trace = sample(2000, cores=2)\n", "\n", "plt.figure(figsize=(5, 5))\n", @@ -289,14 +289,14 @@ ], "source": [ "with Model() as model_sat:\n", - " grp_mean = Normal('grp_mean', mu=0, sd=10)\n", + " grp_mean = Normal('grp_mean', mu=0, sigma=10)\n", " grp_sd = Uniform('grp_sd', 0, 200)\n", " # Define priors for intercept and regression coefficients.\n", - " priors = {'Intercept': Normal.dist(mu=sat_data.sat_t.mean(), sd=sat_data.sat_t.std()),\n", - " 'spend': Normal.dist(mu=grp_mean, sd=grp_sd),\n", - " 'stu_tea_rat': Normal.dist(mu=grp_mean, sd=grp_sd),\n", - " 'salary': Normal.dist(mu=grp_mean, sd=grp_sd),\n", - " 'prcnt_take': Normal.dist(mu=grp_mean, sd=grp_sd)\n", + " priors = {'Intercept': Normal.dist(mu=sat_data.sat_t.mean(), sigma=sat_data.sat_t.std()),\n", + " 'spend': Normal.dist(mu=grp_mean, sigma=grp_sd),\n", + " 'stu_tea_rat': Normal.dist(mu=grp_mean, sigma=grp_sd),\n", + " 'salary': Normal.dist(mu=grp_mean, sigma=grp_sd),\n", + " 'prcnt_take': Normal.dist(mu=grp_mean, sigma=grp_sd)\n", " }\n", " GLM.from_formula('sat_t ~ spend + stu_tea_rat + salary + prcnt_take', sat_data, priors=priors)\n", " trace_sat = sample(2000, cores=2)" @@ -350,10 +350,10 @@ ], "source": [ "with Model() as model_sat:\n", - " grp_mean = Normal('grp_mean', mu=0, sd=10)\n", + " grp_mean = Normal('grp_mean', mu=0, sigma=10)\n", " grp_prec = Gamma('grp_prec', alpha=1, beta=.1, testval=1.)\n", " slope 
= StudentT.dist(mu=grp_mean, lam=grp_prec, nu=1)\n", - " intercept = Normal.dist(mu=sat_data.sat_t.mean(), sd=sat_data.sat_t.std())\n", + " intercept = Normal.dist(mu=sat_data.sat_t.mean(), sigma=sat_data.sat_t.std())\n", " GLM.from_formula('sat_t ~ spend + stu_tea_rat + salary + prcnt_take', sat_data,\n", " priors={'Intercept': intercept, 'Regressor': slope})\n", " trace_sat = sample(2000, cores=2)" @@ -408,10 +408,10 @@ "source": [ "tdf_gain = 5.\n", "with Model() as model_sat:\n", - " grp_mean = Normal('grp_mean', mu=0, sd=10)\n", + " grp_mean = Normal('grp_mean', mu=0, sigma=10)\n", " grp_prec = Gamma('grp_prec', alpha=1, beta=.1, testval=1.)\n", " slope = StudentT.dist(mu=grp_mean, lam=grp_prec, nu=1) #grp_df)\n", - " intercept = Normal.dist(mu=sat_data.sat_t.mean(), sd=sat_data.sat_t.std())\n", + " intercept = Normal.dist(mu=sat_data.sat_t.mean(), sigma=sat_data.sat_t.std())\n", " GLM.from_formula('sat_t ~ spend + stu_tea_rat + salary + prcnt_take', sat_data,\n", " priors={'Intercept': intercept, 'Regressor': slope})\n", "\n", @@ -739,7 +739,7 @@ "source": [ "with Model() as model_lasso:\n", " # Define priors for intercept and regression coefficients.\n", - " priors = {'Intercept': Normal.dist(mu=0, sd=50),\n", + " priors = {'Intercept': Normal.dist(mu=0, sigma=50),\n", " 'Regressor': Laplace.dist(mu=0, b=0.05)\n", " }\n", " GLM.from_formula('male ~ height + weight', htwt_data, family=glm.families.Binomial(),\n", diff --git a/docs/source/notebooks/GP-Kron.ipynb b/docs/source/notebooks/GP-Kron.ipynb index d184942fcc..ab8050bffa 100644 --- a/docs/source/notebooks/GP-Kron.ipynb +++ b/docs/source/notebooks/GP-Kron.ipynb @@ -158,7 +158,7 @@ " # Set priors on the hyperparameters of the covariance\n", " ls1 = pm.Gamma(\"ls1\", alpha=2, beta=2)\n", " ls2 = pm.Gamma(\"ls2\", alpha=2, beta=2)\n", - " eta = pm.HalfNormal(\"eta\", sd=2)\n", + " eta = pm.HalfNormal(\"eta\", sigma=2)\n", " \n", " # Specify the covariance functions for each Xi\n", " # Since the covariance is a product, only scale one of them by eta.\n", @@ -170,7 +170,7 @@ " gp = pm.gp.MarginalKron(cov_funcs=[cov_x1, cov_x2])\n", "\n", " # Set the prior on the variance for the Gaussian noise\n", - " sigma = pm.HalfNormal(\"sigma\", sd=2)\n", + " sigma = pm.HalfNormal(\"sigma\", sigma=2)\n", " \n", " # Place a GP prior over the function f.\n", " y_ = gp.marginal_likelihood(\"y\", Xs=Xs, y=y, sigma=sigma)\n", @@ -288,14 +288,14 @@ " # Set priors on the hyperparameters of the covariance\n", " ls1 = pm.Gamma(\"ls1\", alpha=2, beta=2)\n", " ls2 = pm.Gamma(\"ls2\", alpha=2, beta=2)\n", - " eta = pm.HalfNormal(\"eta\", sd=2)\n", + " eta = pm.HalfNormal(\"eta\", sigma=2)\n", " \n", " # Specify the covariance functions for each Xi\n", " cov_x1 = pm.gp.cov.Matern52(1, ls=ls1) \n", " cov_x2 = eta**2 * pm.gp.cov.Cosine(1, ls=ls2)\n", "\n", " # Set the prior on the variance for the Gaussian noise\n", - " sigma = pm.HalfNormal(\"sigma\", sd=2)\n", + " sigma = pm.HalfNormal(\"sigma\", sigma=2)\n", "\n", " # Specify the GP. 
The default mean function is `Zero`.\n", " gp = pm.gp.LatentKron(cov_funcs=[cov_x1, cov_x2])\n", @@ -303,7 +303,7 @@ " # Place a GP prior over the function f.\n", " f = gp.prior(\"f\", Xs=Xs)\n", " \n", - " y_ = pm.Normal(\"y_\", mu=f, sd=sigma, observed=y)\n", + " y_ = pm.Normal(\"y_\", mu=f, sigma=sigma, observed=y)\n", "\n", "with model:\n", " tr = pm.sample(500, chains=1)" diff --git a/docs/source/notebooks/GP-Latent.ipynb b/docs/source/notebooks/GP-Latent.ipynb index 3e59149191..3929f07415 100644 --- a/docs/source/notebooks/GP-Latent.ipynb +++ b/docs/source/notebooks/GP-Latent.ipynb @@ -475,7 +475,7 @@ " # covariance function\n", " ℓ = pm.Gamma(\"ℓ\", alpha=2, beta=2)\n", " # informative, positive normal prior on the period \n", - " η = pm.HalfNormal(\"η\", sd=5)\n", + " η = pm.HalfNormal(\"η\", sigma=5)\n", " cov = η**2 * pm.gp.cov.ExpQuad(1, ℓ)\n", " \n", " gp = pm.gp.Latent(cov_func=cov)\n", diff --git a/docs/source/notebooks/GP-MaunaLoa.ipynb b/docs/source/notebooks/GP-MaunaLoa.ipynb index c5beb58392..3f0d330ed4 100644 --- a/docs/source/notebooks/GP-MaunaLoa.ipynb +++ b/docs/source/notebooks/GP-MaunaLoa.ipynb @@ -754,7 +754,7 @@ "priors = [\n", " (\"ℓ_pdecay\", pm.Gamma.dist(alpha=10, beta=0.075)),\n", " (\"ℓ_psmooth\", pm.Gamma.dist(alpha=4, beta=3)),\n", - " (\"period\", pm.Normal.dist(mu=1.0, sd=0.05)),\n", + " (\"period\", pm.Normal.dist(mu=1.0, sigma=0.05)),\n", " (\"ℓ_med\", pm.Gamma.dist(alpha=2, beta=0.75)),\n", " (\"α\", pm.Gamma.dist(alpha=5, beta=2)),\n", " (\"ℓ_trend\", pm.Gamma.dist(alpha=4, beta=0.1)),\n", @@ -866,8 +866,8 @@ " (\"η_per\", pm.HalfCauchy.dist(beta=2)),\n", " (\"η_med\", pm.HalfCauchy.dist(beta=1.0)),\n", " (\"η_trend\", pm.HalfCauchy.dist(beta=3)), # will use beta=2, but 2.2 is visible on plot\n", - " (\"σ\", pm.HalfNormal.dist(sd=0.25)),\n", - " (\"η_noise\", pm.HalfNormal.dist(sd=0.5))]\n", + " (\"σ\", pm.HalfNormal.dist(sigma=0.25)),\n", + " (\"η_noise\", pm.HalfNormal.dist(sigma=0.5))]\n", "\n", "colors = brewer['Paired'][5]\n", "\n", @@ -943,7 +943,7 @@ " # yearly periodic component x long term trend\n", " η_per = pm.HalfCauchy(\"η_per\", beta=2, testval=1.0)\n", " ℓ_pdecay = pm.Gamma(\"ℓ_pdecay\", alpha=10, beta=0.075)\n", - " period = pm.Normal(\"period\", mu=1, sd=0.05)\n", + " period = pm.Normal(\"period\", mu=1, sigma=0.05)\n", " ℓ_psmooth = pm.Gamma(\"ℓ_psmooth \", alpha=4, beta=3)\n", " cov_seasonal = η_per**2 * pm.gp.cov.Periodic(1, period, ℓ_psmooth) \\\n", " * pm.gp.cov.Matern52(1, ℓ_pdecay)\n", @@ -963,9 +963,9 @@ " gp_trend = pm.gp.Marginal(cov_func=cov_trend) \n", "\n", " # noise model\n", - " η_noise = pm.HalfNormal(\"η_noise\", sd=0.5, testval=0.05)\n", + " η_noise = pm.HalfNormal(\"η_noise\", sigma=0.5, testval=0.05)\n", " ℓ_noise = pm.Gamma(\"ℓ_noise\", alpha=2, beta=4)\n", - " σ = pm.HalfNormal(\"σ\", sd=0.25, testval=0.05)\n", + " σ = pm.HalfNormal(\"σ\", sigma=0.25, testval=0.05)\n", " cov_noise = η_noise**2 * pm.gp.cov.Matern32(1, ℓ_noise) +\\\n", " pm.gp.cov.WhiteNoise(σ)\n", "\n", diff --git a/docs/source/notebooks/GP-MaunaLoa2.ipynb b/docs/source/notebooks/GP-MaunaLoa2.ipynb index 292ff2b878..639abcf81a 100644 --- a/docs/source/notebooks/GP-MaunaLoa2.ipynb +++ b/docs/source/notebooks/GP-MaunaLoa2.ipynb @@ -155,7 +155,7 @@ "source": [ "fig = plt.figure(figsize=(8,5))\n", "ax = plt.gca()\n", - "ax.hist(100 * pm.Normal.dist(mu=0.0, sd=0.02).random(size=10000), 100)\n", + "ax.hist(100 * pm.Normal.dist(mu=0.0, sigma=0.02).random(size=10000), 100)\n", "ax.set_xlabel(\"$\\Delta$ time (years)\")\n", "ax.set_title(\"time offset 
prior\");" ] @@ -215,7 +215,7 @@ "outputs": [], "source": [ "with pm.Model() as model:\n", - " η = pm.HalfNormal(\"η\", sd=5)\n", + " η = pm.HalfNormal(\"η\", sigma=5)\n", " ℓ = pm.Gamma(\"ℓ\", alpha=4, beta=2)\n", " α = pm.Gamma(\"α\", alpha=3, beta=1)\n", " cov = η**2 * pm.gp.cov.RatQuad(1, α, ℓ)\n", @@ -224,11 +224,11 @@ " \n", " # x location uncertainty\n", " # - sd = 0.02 says the uncertainty on the point is about two years \n", - " t_diff = pm.Normal(\"t_diff\", mu=0.0, sd=0.02, shape=len(t))\n", + " t_diff = pm.Normal(\"t_diff\", mu=0.0, sigma=0.02, shape=len(t))\n", " t_uncert = t_n - t_diff\n", " \n", " # white noise variance\n", - " σ = pm.HalfNormal(\"σ\", sd=5, testval=1)\n", + " σ = pm.HalfNormal(\"σ\", sigma=5, testval=1)\n", " y_ = gp.marginal_likelihood(\"y\", X=t_uncert[:,None], y=y_n, noise=σ)" ] }, @@ -550,15 +550,15 @@ "outputs": [], "source": [ "with pm.Model() as model:\n", - " η = pm.HalfNormal(\"η\", sd=2)\n", + " η = pm.HalfNormal(\"η\", sigma=2)\n", " ℓ = pm.Gamma(\"ℓ\", alpha=4, beta=2)\n", " α = pm.Gamma(\"α\", alpha=3, beta=1)\n", " cov = η**2 * pm.gp.cov.RatQuad(1, α, ℓ)\n", " \n", " # peicewise linear mean function\n", - " k = pm.Normal(\"k\", mu=0, sd=1)\n", - " m = pm.Normal(\"m\", mu=0, sd=1)\n", - " delta = pm.Normal(\"delta\", mu=0, sd=5, shape=len(changepoints_t))\n", + " k = pm.Normal(\"k\", mu=0, sigma=1)\n", + " m = pm.Normal(\"m\", mu=0, sigma=1)\n", + " delta = pm.Normal(\"delta\", mu=0, sigma=5, shape=len(changepoints_t))\n", " mean = PeicewiseLinear(changepoints_t, k, m, delta) \n", "\n", " # include mean function in GP constructor\n", @@ -566,11 +566,11 @@ " \n", " # x location uncertainty\n", " # - sd = 0.02 says the uncertainty on the point is about two years \n", - " t_diff = pm.Normal(\"t_diff\", mu=0.0, sd=0.02, shape=len(t))\n", + " t_diff = pm.Normal(\"t_diff\", mu=0.0, sigma=0.02, shape=len(t))\n", " t_uncert = t_n - t_diff\n", " \n", " # white noise variance\n", - " σ = pm.HalfNormal(\"σ\", sd=5)\n", + " σ = pm.HalfNormal(\"σ\", sigma=5)\n", " y_ = gp.marginal_likelihood(\"y\", X=t_uncert[:,None], y=y_n, noise=σ)" ] }, @@ -1029,17 +1029,17 @@ "outputs": [], "source": [ "with pm.Model() as model:\n", - " η = pm.HalfNormal(\"η\", sd=5)\n", + " η = pm.HalfNormal(\"η\", sigma=5)\n", " ℓ = pm.Gamma(\"ℓ\", alpha=2, beta=0.1)\n", " \n", " # changepoint occurs near the year 1800, sometime between 1760, 1840\n", - " x0 = pm.Normal(\"x0\", mu=18, sd=0.1)\n", + " x0 = pm.Normal(\"x0\", mu=18, sigma=0.1)\n", " # the change happens gradually\n", - " a = pm.HalfNormal(\"a\", sd=2)\n", + " a = pm.HalfNormal(\"a\", sigma=2)\n", " # a constant for the \n", - " c = pm.HalfNormal(\"c\", sd=3)\n", + " c = pm.HalfNormal(\"c\", sigma=3)\n", " # quadratic polynomial scale\n", - " ηq = pm.HalfNormal(\"ηq\", sd=5)\n", + " ηq = pm.HalfNormal(\"ηq\", sigma=5)\n", " \n", " cov1 = η**2 * pm.gp.cov.ExpQuad(1, ℓ)\n", " cov2 = η**2 * pm.gp.cov.ExpQuad(1, ℓ) + ηq**2 * pm.gp.cov.Polynomial(1, x0, 2, c)\n", @@ -1050,17 +1050,17 @@ " cov_c = sc_cov1 + sc_cov2\n", " \n", " # short term variation\n", - " ηs = pm.HalfNormal(\"ηs\", sd=5)\n", + " ηs = pm.HalfNormal(\"ηs\", sigma=5)\n", " ℓs = pm.Gamma(\"ℓs\", alpha=2, beta=1)\n", " cov_s = ηs**2 * pm.gp.cov.Matern52(1, ℓs)\n", " \n", " gp = pm.gp.Marginal(cov_func=cov_s + cov_c)\n", "\n", - " t_diff = pm.Normal(\"t_diff\", mu=0.0, sd=0.02, shape=len(t))\n", + " t_diff = pm.Normal(\"t_diff\", mu=0.0, sigma=0.02, shape=len(t))\n", " t_uncert = t_n - t_diff\n", " \n", " # white noise variance\n", - " σ = pm.HalfNormal(\"σ\", 
sd=5, testval=1)\n", + " σ = pm.HalfNormal(\"σ\", sigma=5, testval=1)\n", " y_ = gp.marginal_likelihood(\"y\", X=t_uncert[:,None], y=y_n, noise=σ)" ] }, @@ -1485,14 +1485,14 @@ " ℓc = pm.Gamma(\"ℓc\", alpha=10, beta=1)\n", " \n", " # changepoint occurs near the year 1800, sometime between 1760, 1840\n", - " x0 = pm.Normal(\"x0\", mu=18, sd=0.1)\n", + " x0 = pm.Normal(\"x0\", mu=18, sigma=0.1)\n", " # the change happens gradually\n", " a = pm.Gamma(\"a\", alpha=3, beta=1)\n", " # constant offset\n", - " c = pm.HalfNormal(\"c\", sd=2)\n", + " c = pm.HalfNormal(\"c\", sigma=2)\n", " \n", " # quadratic polynomial scale\n", - " ηq = pm.HalfNormal(\"ηq\", sd=1)\n", + " ηq = pm.HalfNormal(\"ηq\", sigma=1)\n", " ℓq = 2.0 # 2 century impact, since we only have 2 C of post IR data \n", " \n", " cov1 = ηc**2 * pm.gp.cov.ExpQuad(1, ℓc)\n", @@ -1505,22 +1505,22 @@ " gp_c = pm.gp.Marginal(cov_func=sc_cov1 + sc_cov2)\n", " \n", " # short term variation\n", - " ηs = pm.HalfNormal(\"ηs\", sd=3)\n", + " ηs = pm.HalfNormal(\"ηs\", sigma=3)\n", " ℓs = pm.Gamma(\"ℓs\", alpha=5, beta=100)\n", " α = pm.Gamma(\"α\", alpha=4, beta=1)\n", " cov_s = ηs**2 * pm.gp.cov.RatQuad(1, α, ℓs)\n", " gp_s = pm.gp.Marginal(cov_func=cov_s)\n", " \n", " # medium term variation\n", - " ηm = pm.HalfNormal(\"ηm\", sd=5)\n", + " ηm = pm.HalfNormal(\"ηm\", sigma=5)\n", " ℓm = pm.Gamma(\"ℓm\", alpha=2, beta=3)\n", " cov_m = ηm**2 * pm.gp.cov.ExpQuad(1, ℓm)\n", " gp_m = pm.gp.Marginal(cov_func=cov_m)\n", " \n", " ## periodic\n", - " ηp = pm.HalfNormal(\"ηp\", sd=2)\n", + " ηp = pm.HalfNormal(\"ηp\", sigma=2)\n", " ℓp_decay = pm.Gamma(\"ℓp_decay\", alpha=40, beta=0.1)\n", - " ℓp_smooth = pm.Normal(\"ℓp_smooth \", mu=1.0, sd=0.05)\n", + " ℓp_smooth = pm.Normal(\"ℓp_smooth \", mu=1.0, sigma=0.05)\n", " period = 1 * 0.01 # we know the period is annual\n", " cov_p = ηp**2 * pm.gp.cov.Periodic(1, period, ℓp_smooth) \\\n", " * pm.gp.cov.ExpQuad(1, ℓp_decay)\n", @@ -1531,14 +1531,14 @@ " # - x location uncertainty (sd = 0.01 is a standard deviation of one year)\n", " # - only the first 111 points are the ice core data\n", " t_mu = t_n[:111]\n", - " t_diff = pm.Normal(\"t_diff\", mu=0.0, sd=0.02, shape=len(t_mu))\n", + " t_diff = pm.Normal(\"t_diff\", mu=0.0, sigma=0.02, shape=len(t_mu))\n", " t_uncert = t_mu - t_diff\n", " t_combined = tt.concatenate((t_uncert, t_n[111:]), 0)\n", " \n", " # Noise covariance, using boundary avoiding priors for MAP estimation\n", " σ1 = pm.Gamma(\"σ1\", alpha=3, beta=50)\n", " σ2 = pm.Gamma(\"σ2\", alpha=3, beta=50)\n", - " η_noise = pm.HalfNormal(\"η_noise\", sd=1)\n", + " η_noise = pm.HalfNormal(\"η_noise\", sigma=1)\n", " ℓ_noise = pm.Gamma(\"ℓ_noise\", alpha=2, beta=200)\n", " cov_noise = η_noise**2 * pm.gp.cov.Matern32(1, ℓ_noise) +\\\n", " CustomWhiteNoise(σ1, σ2, 111, 545)\n", diff --git a/docs/source/notebooks/GP-smoothing.ipynb b/docs/source/notebooks/GP-smoothing.ipynb index a637ae31cc..4ad5b4c884 100644 --- a/docs/source/notebooks/GP-smoothing.ipynb +++ b/docs/source/notebooks/GP-smoothing.ipynb @@ -185,7 +185,7 @@ "model = pm.Model()\n", "with model:\n", " smoothing_param = shared(0.9)\n", - " mu = pm.Normal(\"mu\", sd=LARGE_NUMBER)\n", + " mu = pm.Normal(\"mu\", sigma=LARGE_NUMBER)\n", " tau = pm.Exponential(\"tau\", 1.0/LARGE_NUMBER)\n", " z = GaussianRandomWalk(\"z\",\n", " mu=mu,\n", diff --git a/docs/source/notebooks/MvGaussianRandomWalk_demo.ipynb b/docs/source/notebooks/MvGaussianRandomWalk_demo.ipynb index 87f3ae7ed2..8218d80579 100644 --- 
a/docs/source/notebooks/MvGaussianRandomWalk_demo.ipynb +++ b/docs/source/notebooks/MvGaussianRandomWalk_demo.ipynb @@ -145,7 +145,7 @@ " regression = alpha_r+beta_r*t_t\n", "\n", " sd = pm.Uniform('sd', 0, 1)\n", - " likelihood = pm.Normal('y', mu=regression, sd=sd, observed=y_t)\n", + " likelihood = pm.Normal('y', mu=regression, sigma=sd, observed=y_t)\n", " trace = pm.sample(n_samples, cores=4)\n", "\n", " return trace, y_scaler, t_scaler, t_section" diff --git a/docs/source/notebooks/api_quickstart.ipynb b/docs/source/notebooks/api_quickstart.ipynb index 5a0333ec89..4d8131339d 100644 --- a/docs/source/notebooks/api_quickstart.ipynb +++ b/docs/source/notebooks/api_quickstart.ipynb @@ -76,8 +76,8 @@ "outputs": [], "source": [ "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " obs = pm.Normal('obs', mu=mu, sd=1, observed=np.random.randn(100))" + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " obs = pm.Normal('obs', mu=mu, sigma=1, observed=np.random.randn(100))" ] }, { @@ -271,7 +271,7 @@ " | .. code-block:: python\n", " | \n", " | with pm.Model():\n", - " | x = pm.Normal('x', mu=0, sd=10)\n", + " | x = pm.Normal('x', mu=0, sigma=10)\n", " | \n", " | with pm.Model():\n", " | x = pm.Normal('x', mu=0, tau=1/23)\n", @@ -284,7 +284,7 @@ " | \n", " | Methods defined here:\n", " | \n", - " | __init__(self, mu=0, sd=None, tau=None, **kwargs)\n", + " | __init__(self, mu=0, sigma=None, tau=None, **kwargs)\n", " | Initialize self. See help(type(self)) for accurate signature.\n", " | \n", " | logp(self, value)\n", @@ -425,7 +425,7 @@ "outputs": [], "source": [ "with pm.Model():\n", - " x = pm.Normal('x', mu=0, sd=1)" + " x = pm.Normal('x', mu=0, sigma=1)" ] }, { @@ -476,7 +476,7 @@ "outputs": [], "source": [ "with pm.Model():\n", - " obs = pm.Normal('x', mu=0, sd=1, observed=np.random.randn(100))" + " obs = pm.Normal('x', mu=0, sigma=1, observed=np.random.randn(100))" ] }, { @@ -507,7 +507,7 @@ "outputs": [], "source": [ "with pm.Model():\n", - " x = pm.Normal('x', mu=0, sd=1)\n", + " x = pm.Normal('x', mu=0, sigma=1)\n", " y = pm.Gamma('y', alpha=1, beta=1)\n", " plus_2 = x + 2\n", " summed = x + y\n", @@ -529,7 +529,7 @@ "outputs": [], "source": [ "with pm.Model():\n", - " x = pm.Normal('x', mu=0, sd=1)\n", + " x = pm.Normal('x', mu=0, sigma=1)\n", " plus_2 = pm.Deterministic('x plus 2', x + 2)" ] }, @@ -810,7 +810,7 @@ "outputs": [], "source": [ "with pm.Model():\n", - " x = [pm.Normal('x_{}'.format(i), mu=0, sd=1) for i in range(10)] # bad" + " x = [pm.Normal('x_{}'.format(i), mu=0, sigma=1) for i in range(10)] # bad" ] }, { @@ -827,7 +827,7 @@ "outputs": [], "source": [ "with pm.Model() as model:\n", - " x = pm.Normal('x', mu=0, sd=1, shape=10) # good" + " x = pm.Normal('x', mu=0, sigma=1, shape=10) # good" ] }, { @@ -875,7 +875,7 @@ ], "source": [ "with pm.Model():\n", - " x = pm.Normal('x', mu=0, sd=1, shape=5)\n", + " x = pm.Normal('x', mu=0, sigma=1, shape=5)\n", "\n", "x.tag.test_value" ] @@ -898,7 +898,7 @@ ], "source": [ "with pm.Model():\n", - " x = pm.Normal('x', mu=0, sd=1, shape=5, testval=np.random.randn(5))\n", + " x = pm.Normal('x', mu=0, sigma=1, shape=5, testval=np.random.randn(5))\n", "\n", "x.tag.test_value" ] @@ -942,8 +942,8 @@ ], "source": [ "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " obs = pm.Normal('obs', mu=mu, sd=1, observed=np.random.randn(100))\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " obs = pm.Normal('obs', mu=mu, sigma=1, observed=np.random.randn(100))\n", " \n", " trace = pm.sample(1000, tune=500)" ] 
@@ -1003,8 +1003,8 @@ ], "source": [ "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " obs = pm.Normal('obs', mu=mu, sd=1, observed=np.random.randn(100))\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " obs = pm.Normal('obs', mu=mu, sigma=1, observed=np.random.randn(100))\n", " \n", " trace = pm.sample(cores=4)" ] @@ -1148,8 +1148,8 @@ ], "source": [ "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " obs = pm.Normal('obs', mu=mu, sd=1, observed=np.random.randn(100))\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " obs = pm.Normal('obs', mu=mu, sigma=1, observed=np.random.randn(100))\n", " \n", " step = pm.Metropolis()\n", " trace = pm.sample(1000, step=step)" @@ -1182,9 +1182,9 @@ ], "source": [ "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " sd = pm.HalfNormal('sd', sd=1)\n", - " obs = pm.Normal('obs', mu=mu, sd=sd, observed=np.random.randn(100))\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " sd = pm.HalfNormal('sd', sigma=1)\n", + " obs = pm.Normal('obs', mu=mu, sigma=sd, observed=np.random.randn(100))\n", " \n", " step1 = pm.Metropolis(vars=[mu])\n", " step2 = pm.Slice(vars=[sd])\n", @@ -1337,7 +1337,7 @@ ], "source": [ "with pm.Model() as model:\n", - " x = pm.Normal('x', mu=0, sd=1, shape=100) \n", + " x = pm.Normal('x', mu=0, sigma=1, shape=100) \n", " trace = pm.sample(cores=4)\n", " \n", "pm.energyplot(trace);" @@ -1375,9 +1375,9 @@ ], "source": [ "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " sd = pm.HalfNormal('sd', sd=1)\n", - " obs = pm.Normal('obs', mu=mu, sd=sd, observed=np.random.randn(100))\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " sd = pm.HalfNormal('sd', sigma=1)\n", + " obs = pm.Normal('obs', mu=mu, sigma=sd, observed=np.random.randn(100))\n", " \n", " approx = pm.fit()" ] @@ -1512,7 +1512,7 @@ "mu = pm.floatX([-.3, .5])\n", "sd = pm.floatX([.1, .1])\n", "with pm.Model() as model:\n", - " pm.NormalMixture('x', w=w, mu=mu, sd=sd)\n", + " pm.NormalMixture('x', w=w, mu=mu, sigma=sd)\n", " approx = pm.fit(method=pm.SVGD(n_particles=200, jitter=1.))" ] }, @@ -1574,9 +1574,9 @@ "source": [ "data = np.random.randn(100)\n", "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " sd = pm.HalfNormal('sd', sd=1)\n", - " obs = pm.Normal('obs', mu=mu, sd=sd, observed=data)\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " sd = pm.HalfNormal('sd', sigma=1)\n", + " obs = pm.Normal('obs', mu=mu, sigma=sd, observed=data)\n", " \n", " trace = pm.sample()" ] @@ -1687,7 +1687,7 @@ "y_shared = theano.shared(y)\n", "\n", "with pm.Model() as model:\n", - " coeff = pm.Normal('x', mu=0, sd=1)\n", + " coeff = pm.Normal('x', mu=0, sigma=1)\n", " logistic = pm.math.sigmoid(coeff * x_shared)\n", " pm.Bernoulli('obs', p=logistic, observed=y_shared)\n", " trace = pm.sample()" diff --git a/docs/source/notebooks/bayesian_neural_network_advi.ipynb b/docs/source/notebooks/bayesian_neural_network_advi.ipynb index f0248b38e2..a9cfbce7da 100644 --- a/docs/source/notebooks/bayesian_neural_network_advi.ipynb +++ b/docs/source/notebooks/bayesian_neural_network_advi.ipynb @@ -154,17 +154,17 @@ " \n", " with pm.Model() as neural_network:\n", " # Weights from input to hidden layer\n", - " weights_in_1 = pm.Normal('w_in_1', 0, sd=1, \n", + " weights_in_1 = pm.Normal('w_in_1', 0, sigma=1, \n", " shape=(X.shape[1], n_hidden), \n", " testval=init_1)\n", " \n", " # Weights from 1st to 2nd layer\n", - " weights_1_2 = pm.Normal('w_1_2', 0, sd=1, \n", + " weights_1_2 = 
pm.Normal('w_1_2', 0, sigma=1, \n", " shape=(n_hidden, n_hidden), \n", " testval=init_2)\n", " \n", " # Weights from hidden layer to output\n", - " weights_2_out = pm.Normal('w_2_out', 0, sd=1, \n", + " weights_2_out = pm.Normal('w_2_out', 0, sigma=1, \n", " shape=(n_hidden,), \n", " testval=init_out)\n", " \n", diff --git a/docs/source/notebooks/bayesian_neural_network_with_sgfs.ipynb b/docs/source/notebooks/bayesian_neural_network_with_sgfs.ipynb index 13fef60d75..cdd67056e9 100644 --- a/docs/source/notebooks/bayesian_neural_network_with_sgfs.ipynb +++ b/docs/source/notebooks/bayesian_neural_network_with_sgfs.ipynb @@ -161,17 +161,17 @@ "def build_network(ann_input, ann_output):\n", " with pm.Model() as model:\n", " # Weights from input to hidden layer\n", - " weights_in_1 = pm.Normal('w_in_1', 0, sd=1, \n", + " weights_in_1 = pm.Normal('w_in_1', 0, sigma=1, \n", " shape=(X.shape[1], n_hidden), \n", " testval=init_1)\n", "\n", " # Weights from 1st to 2nd layer\n", - " weights_1_2 = pm.Normal('w_1_2', 0, sd=1, \n", + " weights_1_2 = pm.Normal('w_1_2', 0, sigma=1, \n", " shape=(n_hidden, n_hidden), \n", " testval=init_2)\n", "\n", " # Weights from hidden layer to output\n", - " weights_2_out = pm.Normal('w_2_out', 0, sd=1,\n", + " weights_2_out = pm.Normal('w_2_out', 0, sigma=1,\n", " shape=(n_hidden,), \n", " testval=init_out)\n", "\n", diff --git a/docs/source/notebooks/blackbox_external_likelihood.ipynb b/docs/source/notebooks/blackbox_external_likelihood.ipynb index c990a808f9..99705783c3 100644 --- a/docs/source/notebooks/blackbox_external_likelihood.ipynb +++ b/docs/source/notebooks/blackbox_external_likelihood.ipynb @@ -830,7 +830,7 @@ " m = pm.Uniform('m', lower=-10., upper=10.)\n", " c = pm.Uniform('c', lower=-10., upper=10.)\n", "\n", - " pm.Normal('likelihood', mu=(m*x + c), sd=sigma, observed=data)\n", + " pm.Normal('likelihood', mu=(m*x + c), sigma=sigma, observed=data)\n", "\n", " gradfunc = test_model.logp_dlogp_function([m, c], dtype=None)\n", " gradfunc.set_extra_values({'m_interval__': mtrue, 'c_interval__': ctrue})\n", diff --git a/docs/source/notebooks/censored_data.ipynb b/docs/source/notebooks/censored_data.ipynb index f60c8066d7..b408760743 100644 --- a/docs/source/notebooks/censored_data.ipynb +++ b/docs/source/notebooks/censored_data.ipynb @@ -139,9 +139,9 @@ "source": [ "# Uncensored model\n", "with pm.Model() as uncensored_model:\n", - " mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)\n", - " sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)\n", - " observed = pm.Normal('observed', mu=mu, sd=sigma, observed=samples)" + " mu = pm.Normal('mu', mu=0., sigma=(high - low) / 2.)\n", + " sigma = pm.HalfNormal('sigma', sigma=(high - low) / 2.)\n", + " observed = pm.Normal('observed', mu=mu, sigma=sigma, observed=samples)" ] }, { @@ -167,20 +167,20 @@ "n_observed = len(samples) - n_right_censored - n_left_censored\n", "\n", "with pm.Model() as imputed_censored_model:\n", - " mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)\n", - " sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)\n", + " mu = pm.Normal('mu', mu=0., sigma=(high - low) / 2.)\n", + " sigma = pm.HalfNormal('sigma', sigma=(high - low) / 2.)\n", " \n", " right_censored = pm.Bound(pm.Normal, lower=high)(\n", - " 'right_censored', mu=mu, sd=sigma, shape=n_right_censored\n", + " 'right_censored', mu=mu, sigma=sigma, shape=n_right_censored\n", " )\n", " left_censored = pm.Bound(pm.Normal, upper=low)(\n", - " 'left_censored', mu=mu, sd=sigma, shape=n_left_censored\n", + " 'left_censored', mu=mu, 
sigma=sigma, shape=n_left_censored\n", " )\n", " \n", " observed = pm.Normal(\n", " 'observed',\n", " mu=mu,\n", - " sd=sigma,\n", + " sigma=sigma,\n", " observed=censored,\n", " shape=n_observed\n", " )" @@ -227,13 +227,13 @@ "source": [ "# Unimputed censored model\n", "with pm.Model() as unimputed_censored_model:\n", - " mu = pm.Normal('mu', mu=0., sd=(high - low) / 2.)\n", - " sigma = pm.HalfNormal('sigma', sd=(high - low) / 2.)\n", + " mu = pm.Normal('mu', mu=0., sigma=(high - low) / 2.)\n", + " sigma = pm.HalfNormal('sigma', sigma=(high - low) / 2.)\n", " \n", " observed = pm.Normal(\n", " 'observed',\n", " mu=mu,\n", - " sd=sigma,\n", + " sigma=sigma,\n", " observed=censored,\n", " )\n", " \n", diff --git a/docs/source/notebooks/constant_stochastic_gradient.ipynb b/docs/source/notebooks/constant_stochastic_gradient.ipynb index 12168d3187..bcbf5abdd9 100644 --- a/docs/source/notebooks/constant_stochastic_gradient.ipynb +++ b/docs/source/notebooks/constant_stochastic_gradient.ipynb @@ -202,12 +202,12 @@ "model_output = theano.shared(train_Y, name='Y')\n", "\n", "with pm.Model() as model:\n", - " b0 = pm.Normal(\"Intercept\", mu=0.0, sd=1.0)\n", + " b0 = pm.Normal(\"Intercept\", mu=0.0, sigma=1.0)\n", " b1 = pm.Normal(\"Slope\", mu=0.0, shape=(q_size,))\n", - " std = pm.HalfNormal(\"std\", sd=1.0)\n", + " std = pm.HalfNormal(\"std\", sigma=1.0)\n", "\n", " mu = b0 + theano.dot(model_input, b1) \n", - " y_obs = pm.Normal(\"y_obs\", mu=mu, sd=std, observed=model_output) " + " y_obs = pm.Normal(\"y_obs\", mu=mu, sigma=std, observed=model_output) " ] }, { @@ -309,12 +309,12 @@ "model_output = theano.shared(train_Y, name='Y')\n", "\n", "with pm.Model() as model:\n", - " b0 = pm.Normal(\"Intercept\", mu=0.0, sd=1.0)\n", + " b0 = pm.Normal(\"Intercept\", mu=0.0, sigma=1.0)\n", " b1 = pm.Normal(\"Slope\", mu=0.0, shape=(q_size,))\n", - " std = pm.HalfNormal(\"std\", sd=1.0)\n", + " std = pm.HalfNormal(\"std\", sigma=1.0)\n", "\n", " mu = b0 + theano.dot(model_input, b1) \n", - " y_obs = pm.Normal(\"y_obs\", mu=mu, sd=std, observed=model_output) \n", + " y_obs = pm.Normal(\"y_obs\", mu=mu, sigma=std, observed=model_output) \n", " \n", "minibatch_tensors = [model_input, model_output]\n", "\n", @@ -347,12 +347,12 @@ "model_output = theano.shared(train_Y, name='Y')\n", "\n", "with pm.Model() as model:\n", - " b0 = pm.Normal(\"Intercept\", mu=0.0, sd=1.0)\n", + " b0 = pm.Normal(\"Intercept\", mu=0.0, sigma=1.0)\n", " b1 = pm.Normal(\"Slope\", mu=0.0, shape=(q_size,))\n", - " std = pm.HalfNormal(\"std\", sd=1.0)\n", + " std = pm.HalfNormal(\"std\", sigma=1.0)\n", "\n", " mu = b0 + theano.dot(model_input, b1) \n", - " y_obs = pm.Normal(\"y_obs\", mu=mu, sd=std, observed=model_output) \n", + " y_obs = pm.Normal(\"y_obs\", mu=mu, sigma=std, observed=model_output) \n", " \n", "minibatch_tensors = [model_input, model_output]\n", "\n", diff --git a/docs/source/notebooks/convolutional_vae_keras_advi.ipynb b/docs/source/notebooks/convolutional_vae_keras_advi.ipynb index 9495b9be88..32d93a4de9 100644 --- a/docs/source/notebooks/convolutional_vae_keras_advi.ipynb +++ b/docs/source/notebooks/convolutional_vae_keras_advi.ipynb @@ -475,13 +475,13 @@ "source": [ "with pm.Model() as model:\n", " # Hidden variables\n", - " zs = pm.Normal('zs', mu=0, sd=1, shape=(minibatch_size, dim_hidden), dtype='float32', total_size=len(data))\n", + " zs = pm.Normal('zs', mu=0, sigma=1, shape=(minibatch_size, dim_hidden), dtype='float32', total_size=len(data))\n", "\n", " # Decoder and its parameters\n", " dec = Decoder(zs, 
net=cnn_dec)\n", " \n", " # Observation model\n", - " xs_ = pm.Normal('xs_', mu=dec.out, sd=0.1, observed=xs_t, dtype='float32', total_size=len(data))" + " xs_ = pm.Normal('xs_', mu=dec.out, sigma=0.1, observed=xs_t, dtype='float32', total_size=len(data))" ] }, { diff --git a/docs/source/notebooks/empirical-approx-overview.ipynb b/docs/source/notebooks/empirical-approx-overview.ipynb index 5686c712dd..419745e219 100644 --- a/docs/source/notebooks/empirical-approx-overview.ipynb +++ b/docs/source/notebooks/empirical-approx-overview.ipynb @@ -58,7 +58,7 @@ "sd = pm.floatX([.1, .1])\n", "\n", "with pm.Model() as model:\n", - " x = pm.NormalMixture('x', w=w, mu=mu, sd=sd, dtype=theano.config.floatX)\n", + " x = pm.NormalMixture('x', w=w, mu=mu, sigma=sd, dtype=theano.config.floatX)\n", " trace = pm.sample(50000)" ] }, diff --git a/docs/source/notebooks/gaussian_mixture_model.ipynb b/docs/source/notebooks/gaussian_mixture_model.ipynb index 810ce245e0..a217cad001 100644 --- a/docs/source/notebooks/gaussian_mixture_model.ipynb +++ b/docs/source/notebooks/gaussian_mixture_model.ipynb @@ -104,7 +104,7 @@ "\n", "\n", " # cluster centers\n", - " means = pm.Normal('means', mu=[0, 0, 0], sd=15, shape=k)\n", + " means = pm.Normal('means', mu=[0, 0, 0], sigma=15, shape=k)\n", " # break symmetry\n", " order_means_potential = pm.Potential('order_means_potential',\n", " tt.switch(means[1]-means[0] < 0, -np.inf, 0)\n", @@ -121,7 +121,7 @@ " # likelihood for each observed value\n", " points = pm.Normal('obs',\n", " mu=means[category],\n", - " sd=sd,\n", + " sigma=sd,\n", " observed=data)" ] }, diff --git a/docs/source/notebooks/getting_started.ipynb b/docs/source/notebooks/getting_started.ipynb index 749f4f9093..bd212ae5db 100644 --- a/docs/source/notebooks/getting_started.ipynb +++ b/docs/source/notebooks/getting_started.ipynb @@ -192,15 +192,15 @@ "with basic_model:\n", " \n", " # Priors for unknown model parameters\n", - " alpha = pm.Normal('alpha', mu=0, sd=10)\n", - " beta = pm.Normal('beta', mu=0, sd=10, shape=2)\n", - " sigma = pm.HalfNormal('sigma', sd=1)\n", + " alpha = pm.Normal('alpha', mu=0, sigma=10)\n", + " beta = pm.Normal('beta', mu=0, sigma=10, shape=2)\n", + " sigma = pm.HalfNormal('sigma', sigma=1)\n", " \n", " # Expected value of outcome\n", " mu = alpha + beta[0]*X1 + beta[1]*X2\n", " \n", " # Likelihood (sampling distribution) of observations\n", - " Y_obs = pm.Normal('Y_obs', mu=mu, sd=sigma, observed=Y)" + " Y_obs = pm.Normal('Y_obs', mu=mu, sigma=sigma, observed=Y)" ] }, { @@ -225,9 +225,9 @@ "The first three statements in the context manager:\n", "\n", "```python\n", - "alpha = Normal('alpha', mu=0, sd=10)\n", - "beta = Normal('beta', mu=0, sd=10, shape=2)\n", - "sigma = HalfNormal('sigma', sd=1)\n", + "alpha = Normal('alpha', mu=0, sigma=10)\n", + "beta = Normal('beta', mu=0, sigma=10, shape=2)\n", + "sigma = HalfNormal('sigma', sigma=1)\n", "```\n", "create **stochastic** random variables with Normal prior distributions for the regression coefficients with a mean of 0 and standard deviation of 10, and a half-normal distribution for the standard deviation of the observations, $\\sigma$. These are stochastic because their values are partly determined by its parents in the dependency graph of random variables, which for priors are simple constants, and partly random (or stochastic). \n", "\n", @@ -309,7 +309,7 @@ " | .. 
code-block:: python\n", " | \n", " | with pm.Model():\n", - " | x = pm.Normal('x', mu=0, sd=10)\n", + " | x = pm.Normal('x', mu=0, sigma=10)\n", " | \n", " | with pm.Model():\n", " | x = pm.Normal('x', mu=0, tau=1/23)\n", @@ -322,7 +322,7 @@ " | \n", " | Methods defined here:\n", " | \n", - " | __init__(self, mu=0, sd=None, tau=None, **kwargs)\n", + " | __init__(self, mu=0, sigma=None, tau=None, **kwargs)\n", " | Initialize self. See help(type(self)) for accurate signature.\n", " | \n", " | logcdf(self, value)\n", @@ -431,7 +431,7 @@ "The final line of the model, defines `Y_obs`, the sampling distribution of the outcomes in the dataset.\n", "\n", "```python\n", - "Y_obs = Normal('Y_obs', mu=mu, sd=sigma, observed=Y)\n", + "Y_obs = Normal('Y_obs', mu=mu, sigma=sigma, observed=Y)\n", "```\n", "\n", "This is a special case of a stochastic variable that we call an **observed stochastic**, and represents the data likelihood of the model. It is identical to a standard stochastic, except that its `observed` argument, which passes the data to the variable, indicates that the values for this variable were observed, and should not be changed by any fitting algorithm applied to the model. The data can be passed in the form of either a `numpy.ndarray` or `pandas.DataFrame` object.\n", @@ -899,10 +899,10 @@ " nu = pm.Exponential('nu', 1/10., testval=5.)\n", " sigma = pm.Exponential('sigma', 1/0.02, testval=.1)\n", "\n", - " s = pm.GaussianRandomWalk('s', sd=sigma, shape=len(returns))\n", + " s = pm.GaussianRandomWalk('s', sigma=sigma, shape=len(returns))\n", " volatility_process = pm.Deterministic('volatility_process', pm.math.exp(-2*s)**0.5)\n", "\n", - " r = pm.StudentT('r', nu=nu, sd=volatility_process, observed=returns['change'])" + " r = pm.StudentT('r', nu=nu, sigma=volatility_process, observed=returns['change'])" ] }, { @@ -1293,7 +1293,7 @@ " eps = pm.DensityDist('eps', lambda value: -tt.log(tt.abs_(value)), testval=1)\n", " \n", " # Create likelihood\n", - " like = pm.Normal('y_est', mu=alpha + beta * X, sd=eps, observed=Y)\n", + " like = pm.Normal('y_est', mu=alpha + beta * X, sigma=eps, observed=Y)\n", "```" ] }, diff --git a/docs/source/notebooks/howto_debugging.ipynb b/docs/source/notebooks/howto_debugging.ipynb index 99e5b30f8e..7c3c136762 100644 --- a/docs/source/notebooks/howto_debugging.ipynb +++ b/docs/source/notebooks/howto_debugging.ipynb @@ -55,10 +55,10 @@ "x = np.random.randn(100)\n", "\n", "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " sd = pm.Normal('sd', mu=0, sd=1)\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " sd = pm.Normal('sd', mu=0, sigma=1)\n", "\n", - " obs = pm.Normal('obs', mu=mu, sd=sd, observed=x)\n", + " obs = pm.Normal('obs', mu=mu, sigma=sd, observed=x)\n", " step = pm.Metropolis()\n", " trace = pm.sample(5000, step)\n", "pm.traceplot(trace);" @@ -121,13 +121,13 @@ ], "source": [ "with pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " sd = pm.Normal('sd', mu=0, sd=1)\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " sd = pm.Normal('sd', mu=0, sigma=1)\n", " \n", " mu_print = tt.printing.Print('mu')(mu)\n", " sd_print = tt.printing.Print('sd')(sd)\n", " \n", - " obs = pm.Normal('obs', mu=mu_print, sd=sd_print, observed=x)\n", + " obs = pm.Normal('obs', mu=mu_print, sigma=sd_print, observed=x)\n", " step = pm.Metropolis()\n", " trace = pm.sample(3, step, tune=0, chains=1, progressbar=False) # Make sure not to draw too many samples" ] @@ -179,13 +179,13 @@ "mystdout = sys.stdout = StringIO()\n", "\n", "with 
pm.Model() as model:\n", - " mu = pm.Normal('mu', mu=0, sd=1)\n", - " sd = pm.Normal('sd', mu=0, sd=1)\n", + " mu = pm.Normal('mu', mu=0, sigma=1)\n", + " sd = pm.Normal('sd', mu=0, sigma=1)\n", "\n", " mu_print = tt.printing.Print('mu')(mu)\n", " sd_print = tt.printing.Print('sd')(sd)\n", "\n", - " obs = pm.Normal('obs', mu=mu_print, sd=sd_print, observed=x)\n", + " obs = pm.Normal('obs', mu=mu_print, sigma=sd_print, observed=x)\n", " step = pm.Metropolis()\n", " trace = pm.sample(5, step, tune=0, chains=1, progressbar=False) # Make sure not to draw too many samples\n", "\n", diff --git a/docs/source/notebooks/lasso_block_update.ipynb b/docs/source/notebooks/lasso_block_update.ipynb index bc22aea692..8e850cbde3 100644 --- a/docs/source/notebooks/lasso_block_update.ipynb +++ b/docs/source/notebooks/lasso_block_update.ipynb @@ -66,7 +66,7 @@ " \n", " p = d1*m1 + d2*m2\n", " \n", - " y = Normal('y', mu=p, sd=s, observed=yd) " + " y = Normal('y', mu=p, sigma=s, observed=yd) " ] }, { diff --git a/docs/source/notebooks/model_averaging.ipynb b/docs/source/notebooks/model_averaging.ipynb index a56246050a..f2a568aa45 100644 --- a/docs/source/notebooks/model_averaging.ipynb +++ b/docs/source/notebooks/model_averaging.ipynb @@ -177,13 +177,13 @@ ], "source": [ "with pm.Model() as model_0:\n", - " alpha = pm.Normal('alpha', mu=0, sd=10)\n", - " beta = pm.Normal('beta', mu=0, sd=10)\n", + " alpha = pm.Normal('alpha', mu=0, sigma=10)\n", + " beta = pm.Normal('beta', mu=0, sigma=10)\n", " sigma = pm.HalfNormal('sigma', 10)\n", " \n", " mu = alpha + beta * d['neocortex']\n", " \n", - " kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g'])\n", + " kcal = pm.Normal('kcal', mu=mu, sigma=sigma, observed=d['kcal.per.g'])\n", " trace_0 = pm.sample(2000)" ] }, @@ -215,13 +215,13 @@ ], "source": [ "with pm.Model() as model_1:\n", - " alpha = pm.Normal('alpha', mu=0, sd=10)\n", - " beta = pm.Normal('beta', mu=0, sd=1)\n", + " alpha = pm.Normal('alpha', mu=0, sigma=10)\n", + " beta = pm.Normal('beta', mu=0, sigma=1)\n", " sigma = pm.HalfNormal('sigma', 10)\n", " \n", " mu = alpha + beta * d['log_mass']\n", " \n", - " kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g'])\n", + " kcal = pm.Normal('kcal', mu=mu, sigma=sigma, observed=d['kcal.per.g'])\n", " \n", " trace_1 = pm.sample(2000)" ] @@ -254,13 +254,13 @@ ], "source": [ "with pm.Model() as model_2:\n", - " alpha = pm.Normal('alpha', mu=0, sd=10)\n", - " beta = pm.Normal('beta', mu=0, sd=1, shape=2)\n", + " alpha = pm.Normal('alpha', mu=0, sigma=10)\n", + " beta = pm.Normal('beta', mu=0, sigma=1, shape=2)\n", " sigma = pm.HalfNormal('sigma', 10)\n", "\n", " mu = alpha + pm.math.dot(beta, d[['neocortex','log_mass']].T)\n", "\n", - " kcal = pm.Normal('kcal', mu=mu, sd=sigma, observed=d['kcal.per.g'])\n", + " kcal = pm.Normal('kcal', mu=mu, sigma=sigma, observed=d['kcal.per.g'])\n", "\n", " trace_2 = pm.sample(2000)" ] diff --git a/docs/source/notebooks/model_comparison.ipynb b/docs/source/notebooks/model_comparison.ipynb index 2ed2cc796c..9142940ca0 100644 --- a/docs/source/notebooks/model_comparison.ipynb +++ b/docs/source/notebooks/model_comparison.ipynb @@ -66,9 +66,9 @@ ], "source": [ "with pm.Model() as pooled:\n", - " mu = pm.Normal('mu', 0, sd=1e6)\n", + " mu = pm.Normal('mu', 0, sigma=1e6)\n", " \n", - " obs = pm.Normal('obs', mu, sd=sigma, observed=y)\n", + " obs = pm.Normal('obs', mu, sigma=sigma, observed=y)\n", " \n", " trace_p = pm.sample(1000)" ] @@ -121,12 +121,12 @@ "with pm.Model() as hierarchical:\n", " \n", " eta = 
pm.Normal('eta', 0, 1, shape=J)\n", - " mu = pm.Normal('mu', 0, sd=1e6)\n", + " mu = pm.Normal('mu', 0, sigma=1e6)\n", " tau = pm.HalfCauchy('tau', 5)\n", " \n", " theta = pm.Deterministic('theta', mu + tau*eta)\n", " \n", - " obs = pm.Normal('obs', theta, sd=sigma, observed=y)\n", + " obs = pm.Normal('obs', theta, sigma=sigma, observed=y)\n", " \n", " trace_h = pm.sample(1000)" ] diff --git a/docs/source/notebooks/multilevel_modeling.ipynb b/docs/source/notebooks/multilevel_modeling.ipynb index 95be91a84e..7b02c2195c 100644 --- a/docs/source/notebooks/multilevel_modeling.ipynb +++ b/docs/source/notebooks/multilevel_modeling.ipynb @@ -508,12 +508,12 @@ "\n", "with Model() as pooled_model:\n", " \n", - " beta = Normal('beta', 0, sd=1e5, shape=2)\n", + " beta = Normal('beta', 0, sigma=1e5, shape=2)\n", " sigma = HalfCauchy('sigma', 5)\n", " \n", " theta = beta[0] + beta[1]*floor\n", " \n", - " y = Normal('y', theta, sd=sigma, observed=log_radon)\n", + " y = Normal('y', theta, sigma=sigma, observed=log_radon)\n", "model_to_graphviz(pooled_model)" ] }, @@ -654,13 +654,13 @@ "source": [ "with Model() as unpooled_model:\n", " \n", - " beta0 = Normal('beta0', 0, sd=1e5, shape=counties)\n", - " beta1 = Normal('beta1', 0, sd=1e5)\n", + " beta0 = Normal('beta0', 0, sigma=1e5, shape=counties)\n", + " beta1 = Normal('beta1', 0, sigma=1e5)\n", " sigma = HalfCauchy('sigma', 5)\n", " \n", " theta = beta0[county] + beta1*floor\n", " \n", - " y = Normal('y', theta, sd=sigma, observed=log_radon)\n", + " y = Normal('y', theta, sigma=sigma, observed=log_radon)\n", "model_to_graphviz(unpooled_model)" ] }, @@ -950,11 +950,11 @@ "with Model() as partial_pooling:\n", " \n", " # Priors\n", - " mu_a = Normal('mu_a', mu=0., sd=1e5)\n", + " mu_a = Normal('mu_a', mu=0., sigma=1e5)\n", " sigma_a = HalfCauchy('sigma_a', 5)\n", " \n", " # Random intercepts\n", - " a = Normal('a', mu=mu_a, sd=sigma_a, shape=counties)\n", + " a = Normal('a', mu=mu_a, sigma=sigma_a, shape=counties)\n", " \n", " # Model error\n", " sigma_y = HalfCauchy('sigma_y',5)\n", @@ -963,7 +963,7 @@ " y_hat = a[county]\n", " \n", " # Data likelihood\n", - " y_like = Normal('y_like', mu=y_hat, sd=sigma_y, observed=log_radon)\n", + " y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed=log_radon)\n", " \n", "model_to_graphviz(partial_pooling)" ] @@ -1171,9 +1171,9 @@ " \n", " \n", " # Random intercepts\n", - " a = Normal('a', mu=mu_a, sd=sigma_a, shape=counties)\n", + " a = Normal('a', mu=mu_a, sigma=sigma_a, shape=counties)\n", " # Common slope\n", - " b = Normal('b', mu=0., sd=1e5)\n", + " b = Normal('b', mu=0., sigma=1e5)\n", " \n", " # Model error\n", " sd_y = HalfCauchy('sd_y', 5)\n", @@ -1182,7 +1182,7 @@ " y_hat = a[county] + b * floor_measure\n", " \n", " # Data likelihood\n", - " y_like = Normal('y_like', mu=y_hat, sd=sd_y, observed=log_radon)\n", + " y_like = Normal('y_like', mu=y_hat, sigma=sd_y, observed=log_radon)\n", "\n", "model_to_graphviz(varying_intercept)" ] @@ -1530,13 +1530,13 @@ "with Model() as varying_slope:\n", " \n", " # Priors\n", - " mu_b = Normal('mu_b', mu=0., sd=1e5)\n", + " mu_b = Normal('mu_b', mu=0., sigma=1e5)\n", " sigma_b = HalfCauchy('sigma_b', 5)\n", " \n", " # Common intercepts\n", - " a = Normal('a', mu=0., sd=1e5)\n", + " a = Normal('a', mu=0., sigma=1e5)\n", " # Random slopes\n", - " b = Normal('b', mu=mu_b, sd=sigma_b, shape=counties)\n", + " b = Normal('b', mu=mu_b, sigma=sigma_b, shape=counties)\n", " \n", " # Model error\n", " sigma_y = HalfCauchy('sigma_y',5)\n", @@ -1545,7 +1545,7 @@ " y_hat = a + 
b[county] * floor_measure\n", " \n", " # Data likelihood\n", - " y_like = Normal('y_like', mu=y_hat, sd=sigma_y, observed=log_radon)\n", + " y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed=log_radon)\n", "\n", "model_to_graphviz(varying_slope)" ] @@ -1753,15 +1753,15 @@ "with Model() as varying_intercept_slope:\n", " \n", " # Priors\n", - " mu_a = Normal('mu_a', mu=0., sd=1e5)\n", + " mu_a = Normal('mu_a', mu=0., sigma=1e5)\n", " sigma_a = HalfCauchy('sigma_a', 5)\n", - " mu_b = Normal('mu_b', mu=0., sd=1e5)\n", + " mu_b = Normal('mu_b', mu=0., sigma=1e5)\n", " sigma_b = HalfCauchy('sigma_b', 5)\n", " \n", " # Random intercepts\n", - " a = Normal('a', mu=mu_a, sd=sigma_a, shape=counties)\n", + " a = Normal('a', mu=mu_a, sigma=sigma_a, shape=counties)\n", " # Random slopes\n", - " b = Normal('b', mu=mu_b, sd=sigma_b, shape=counties)\n", + " b = Normal('b', mu=mu_b, sigma=sigma_b, shape=counties)\n", " \n", " # Model error\n", " sigma_y = Uniform('sigma_y', lower=0, upper=100)\n", @@ -1770,7 +1770,7 @@ " y_hat = a[county] + b[county] * floor_measure\n", " \n", " # Data likelihood\n", - " y_like = Normal('y_like', mu=y_hat, sd=sigma_y, observed=log_radon)\n", + " y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed=log_radon)\n", " \n", "model_to_graphviz(varying_intercept_slope)" ] @@ -1994,18 +1994,18 @@ " sigma_a = HalfCauchy('sigma_a', 5)\n", " \n", " # County uranium model for slope\n", - " gamma_0 = Normal('gamma_0', mu=0., sd=1e5)\n", - " gamma_1 = Normal('gamma_1', mu=0., sd=1e5)\n", + " gamma_0 = Normal('gamma_0', mu=0., sigma=1e5)\n", + " gamma_1 = Normal('gamma_1', mu=0., sigma=1e5)\n", " \n", " \n", " # Uranium model for intercept\n", " mu_a = gamma_0 + gamma_1*u\n", " # County variation not explained by uranium\n", - " eps_a = Normal('eps_a', mu=0, sd=sigma_a, shape=counties)\n", + " eps_a = Normal('eps_a', mu=0, sigma=sigma_a, shape=counties)\n", " a = Deterministic('a', mu_a + eps_a[county])\n", " \n", " # Common slope\n", - " b = Normal('b', mu=0., sd=1e5)\n", + " b = Normal('b', mu=0., sigma=1e5)\n", " \n", " # Model error\n", " sigma_y = Uniform('sigma_y', lower=0, upper=100)\n", @@ -2014,7 +2014,7 @@ " y_hat = a + b * floor_measure\n", " \n", " # Data likelihood\n", - " y_like = Normal('y_like', mu=y_hat, sd=sigma_y, observed=log_radon)\n", + " y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed=log_radon)\n", "\n", "model_to_graphviz(hierarchical_intercept)" ] @@ -2230,17 +2230,17 @@ " sigma_a = HalfCauchy('sigma_a', 5)\n", " \n", " # County uranium model for slope\n", - " gamma = Normal('gamma', mu=0., sd=1e5, shape=3)\n", + " gamma = Normal('gamma', mu=0., sigma=1e5, shape=3)\n", " \n", " # Uranium model for intercept\n", " mu_a = Deterministic('mu_a', gamma[0] + gamma[1]*u.values + gamma[2]*xbar[county])\n", "\n", " # County variation not explained by uranium\n", - " eps_a = Normal('eps_a', mu=0, sd=sigma_a, shape=counties)\n", + " eps_a = Normal('eps_a', mu=0, sigma=sigma_a, shape=counties)\n", " a = Deterministic('a', mu_a + eps_a[county])\n", "\n", " # Common slope\n", - " b = Normal('b', mu=0., sd=1e15)\n", + " b = Normal('b', mu=0., sigma=1e15)\n", " \n", " # Model error\n", " sigma_y = Uniform('sigma_y', lower=0, upper=100)\n", @@ -2249,7 +2249,7 @@ " y_hat = a + b * floor_measure\n", " \n", " # Data likelihood\n", - " y_like = Normal('y_like', mu=y_hat, sd=sigma_y, observed=log_radon)\n", + " y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed=log_radon)\n", "\n", "model_to_graphviz(contextual_effect)" ] @@ -2576,17 +2576,17 @@ 
" sigma_a = HalfCauchy('sigma_a', 5)\n", " \n", " # County uranium model for slope\n", - " gamma = Normal('gamma', mu=0., sd=1e5, shape=3)\n", + " gamma = Normal('gamma', mu=0., sigma=1e5, shape=3)\n", " \n", " # Uranium model for intercept\n", " mu_a = Deterministic('mu_a', gamma[0] + gamma[1]*u.values + gamma[2]*xbar[county])\n", "\n", " # County variation not explained by uranium\n", - " eps_a = Normal('eps_a', mu=0, sd=sigma_a, shape=counties)\n", + " eps_a = Normal('eps_a', mu=0, sigma=sigma_a, shape=counties)\n", " a = Deterministic('a', mu_a + eps_a[county])\n", "\n", " # Common slope\n", - " b = Normal('b', mu=0., sd=1e15)\n", + " b = Normal('b', mu=0., sigma=1e15)\n", " \n", " # Model error\n", " sigma_y = Uniform('sigma_y', lower=0, upper=100)\n", @@ -2595,10 +2595,10 @@ " y_hat = a + b * floor_measure\n", " \n", " # Data likelihood\n", - " y_like = Normal('y_like', mu=y_hat, sd=sigma_y, observed=log_radon)\n", + " y_like = Normal('y_like', mu=y_hat, sigma=sigma_y, observed=log_radon)\n", " \n", " # St Louis county prediction\n", - " stl_pred = Normal('stl_pred', mu=a[69] + b, sd=sigma_y)\n", + " stl_pred = Normal('stl_pred', mu=a[69] + b, sigma=sigma_y)\n", "\n", "model_to_graphviz(contextual_pred)" ] diff --git a/docs/source/notebooks/posterior_predictive.ipynb b/docs/source/notebooks/posterior_predictive.ipynb index 114efc1b85..2c93878819 100644 --- a/docs/source/notebooks/posterior_predictive.ipynb +++ b/docs/source/notebooks/posterior_predictive.ipynb @@ -75,9 +75,9 @@ "data = np.random.randn(100)\n", "\n", "with pm.Model() as model: \n", - " mu = pm.Normal('mu', mu=0, sd=1, testval=0)\n", - " sd = pm.HalfNormal('sd', sd=1)\n", - " n = pm.Normal('n', mu=mu, sd=sd, observed=data)\n", + " mu = pm.Normal('mu', mu=0, sigma=1, testval=0)\n", + " sd = pm.HalfNormal('sd', sigma=1)\n", + " n = pm.Normal('n', mu=mu, sigma=sd, observed=data)\n", " \n", " trace = pm.sample(5000)" ] @@ -283,7 +283,7 @@ ], "source": [ "with pm.Model() as model:\n", - " coeff = pm.Normal('coeff', mu=0, sd=1)\n", + " coeff = pm.Normal('coeff', mu=0, sigma=1)\n", " outcome = pm.Bernoulli('outcome', logit_p=coeff*predictors_shared, observed=outcomes) \n", " trace = pm.sample(5000)" ] diff --git a/docs/source/notebooks/rugby_analytics.ipynb b/docs/source/notebooks/rugby_analytics.ipynb index 87dcab0d45..b0f2ce5a65 100644 --- a/docs/source/notebooks/rugby_analytics.ipynb +++ b/docs/source/notebooks/rugby_analytics.ipynb @@ -757,13 +757,13 @@ "with pm.Model() as model:\n", " # global model parameters\n", " home = pm.Flat('home')\n", - " sd_att = pm.HalfStudentT('sd_att', nu=3, sd=2.5)\n", - " sd_def = pm.HalfStudentT('sd_def', nu=3, sd=2.5)\n", + " sd_att = pm.HalfStudentT('sd_att', nu=3, sigma=2.5)\n", + " sd_def = pm.HalfStudentT('sd_def', nu=3, sigma=2.5)\n", " intercept = pm.Flat('intercept')\n", " \n", " # team-specific model parameters\n", - " atts_star = pm.Normal(\"atts_star\", mu=0, sd=sd_att, shape=num_teams)\n", - " defs_star = pm.Normal(\"defs_star\", mu=0, sd=sd_def, shape=num_teams)\n", + " atts_star = pm.Normal(\"atts_star\", mu=0, sigma=sd_att, shape=num_teams)\n", + " defs_star = pm.Normal(\"defs_star\", mu=0, sigma=sd_def, shape=num_teams)\n", " \n", " atts = pm.Deterministic('atts', atts_star - tt.mean(atts_star))\n", " defs = pm.Deterministic('defs', defs_star - tt.mean(defs_star))\n", diff --git a/docs/source/notebooks/sampler-stats.ipynb b/docs/source/notebooks/sampler-stats.ipynb index 53f5488c5d..c58c3b9a34 100644 --- a/docs/source/notebooks/sampler-stats.ipynb +++ 
b/docs/source/notebooks/sampler-stats.ipynb @@ -56,7 +56,7 @@ "source": [ "model = pm.Model()\n", "with model:\n", - " mu1 = pm.Normal(\"mu1\", mu=0, sd=1, shape=10)" + " mu1 = pm.Normal(\"mu1\", mu=0, sigma=1, shape=10)" ] }, { @@ -418,7 +418,7 @@ "model = pm.Model()\n", "with model:\n", " mu1 = pm.Bernoulli(\"mu1\", p=0.8)\n", - " mu2 = pm.Normal(\"mu2\", mu=0, sd=1, shape=10)" + " mu2 = pm.Normal(\"mu2\", mu=0, sigma=1, shape=10)" ] }, { diff --git a/docs/source/notebooks/sgfs_simple_optimization.ipynb b/docs/source/notebooks/sgfs_simple_optimization.ipynb index 62cde11c12..9543496268 100644 --- a/docs/source/notebooks/sgfs_simple_optimization.ipynb +++ b/docs/source/notebooks/sgfs_simple_optimization.ipynb @@ -141,7 +141,7 @@ "draws = 1000\n", "\n", "with pm.Model() as model:\n", - " abc = pm.Normal('abc', sd=1, mu=1, shape=(3,))\n", + " abc = pm.Normal('abc', sigma=1, mu=1, shape=(3,))\n", " x = x_obs\n", " x2 = x**2\n", " o = tt.ones_like(x)\n", diff --git a/docs/source/notebooks/stochastic_volatility.ipynb b/docs/source/notebooks/stochastic_volatility.ipynb index 1f15e0b062..6d57a26919 100644 --- a/docs/source/notebooks/stochastic_volatility.ipynb +++ b/docs/source/notebooks/stochastic_volatility.ipynb @@ -130,7 +130,7 @@ "source": [ "with pm.Model() as model:\n", " step_size = pm.Exponential('sigma', 50.)\n", - " s = GaussianRandomWalk('s', sd=step_size, \n", + " s = GaussianRandomWalk('s', sigma=step_size, \n", " shape=len(returns))\n", " \n", " nu = pm.Exponential('nu', .1)\n", diff --git a/docs/source/notebooks/survival_analysis.ipynb b/docs/source/notebooks/survival_analysis.ipynb index d8a3b5a67b..c2cf2a96d9 100644 --- a/docs/source/notebooks/survival_analysis.ipynb +++ b/docs/source/notebooks/survival_analysis.ipynb @@ -435,7 +435,7 @@ " \n", " lambda0 = pm.Gamma('lambda0', 0.01, 0.01, shape=n_intervals)\n", " \n", - " beta = pm.Normal('beta', 0, sd=1000)\n", + " beta = pm.Normal('beta', 0, sigma=1000)\n", " \n", " lambda_ = pm.Deterministic('lambda_', T.outer(T.exp(beta * df.metastized), lambda0))\n", " mu = pm.Deterministic('mu', exposure * lambda_)\n", diff --git a/docs/source/notebooks/updating_priors.ipynb b/docs/source/notebooks/updating_priors.ipynb index 1aa4c96f7a..624b139fd2 100644 --- a/docs/source/notebooks/updating_priors.ipynb +++ b/docs/source/notebooks/updating_priors.ipynb @@ -88,7 +88,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Our initial beliefs about the parameters are quite informative (sd=1) and a bit off the true values." + "Our initial beliefs about the parameters are quite informative (sigma=1) and a bit off the true values." 
] }, { @@ -114,15 +114,15 @@ "with basic_model:\n", " \n", " # Priors for unknown model parameters\n", - " alpha = Normal('alpha', mu=0, sd=1)\n", - " beta0 = Normal('beta0', mu=12, sd=1)\n", - " beta1 = Normal('beta1', mu=18, sd=1)\n", + " alpha = Normal('alpha', mu=0, sigma=1)\n", + " beta0 = Normal('beta0', mu=12, sigma=1)\n", + " beta1 = Normal('beta1', mu=18, sigma=1)\n", " \n", " # Expected value of outcome\n", " mu = alpha + beta0 * X1 + beta1 * X2\n", " \n", " # Likelihood (sampling distribution) of observations\n", - " Y_obs = Normal('Y_obs', mu=mu, sd=1, observed=Y)\n", + " Y_obs = Normal('Y_obs', mu=mu, sigma=1, observed=Y)\n", " \n", " # draw 1000 posterior samples\n", " trace = sample(1000)" @@ -278,7 +278,7 @@ " mu = alpha + beta0 * X1 + beta1 * X2\n", "\n", " # Likelihood (sampling distribution) of observations\n", - " Y_obs = Normal('Y_obs', mu=mu, sd=1, observed=Y)\n", + " Y_obs = Normal('Y_obs', mu=mu, sigma=1, observed=Y)\n", " \n", " # draw 10000 posterior samples\n", " trace = sample(1000)\n", diff --git a/docs/source/notebooks/variational_api_quickstart.ipynb b/docs/source/notebooks/variational_api_quickstart.ipynb index 78ec7a37c0..319ed5c811 100644 --- a/docs/source/notebooks/variational_api_quickstart.ipynb +++ b/docs/source/notebooks/variational_api_quickstart.ipynb @@ -56,7 +56,7 @@ "sd = pm.floatX([.1, .1])\n", "\n", "with pm.Model() as model:\n", - " x = pm.NormalMixture('x', w=w, mu=mu, sd=sd, dtype=theano.config.floatX)\n", + " x = pm.NormalMixture('x', w=w, mu=mu, sigma=sd, dtype=theano.config.floatX)\n", " x2 = x ** 2\n", " sin_x = pm.math.sin(x)" ] @@ -144,7 +144,7 @@ "source": [ "with pm.Model() as model:\n", " \n", - " x = pm.NormalMixture('x', w=w, mu=mu, sd=sd, dtype=theano.config.floatX)\n", + " x = pm.NormalMixture('x', w=w, mu=mu, sigma=sd, dtype=theano.config.floatX)\n", " x2 = x ** 2\n", " sin_x = pm.math.sin(x)" ] @@ -1125,7 +1125,7 @@ "with pm.Model() as iris_model:\n", " \n", " # Coefficients for features\n", - " β = pm.Normal('β', 0, sd=1e2, shape=(4, 3))\n", + " β = pm.Normal('β', 0, sigma=1e2, shape=(4, 3))\n", " # Transoform to unit interval\n", " a = pm.Flat('a', shape=(3,))\n", " p = tt.nnet.softmax(Xt.dot(β) + a)\n", diff --git a/docs/source/notebooks/weibull_aft.ipynb b/docs/source/notebooks/weibull_aft.ipynb index 2ce779eb58..a5934d2b58 100644 --- a/docs/source/notebooks/weibull_aft.ipynb +++ b/docs/source/notebooks/weibull_aft.ipynb @@ -151,8 +151,8 @@ "with pm.Model() as model_1:\n", " alpha_sd = 10.0\n", "\n", - " mu = pm.Normal('mu', mu=0, sd=100)\n", - " alpha_raw = pm.Normal('a0', mu=0, sd=0.1)\n", + " mu = pm.Normal('mu', mu=0, sigma=100)\n", + " alpha_raw = pm.Normal('a0', mu=0, sigma=0.1)\n", " alpha = pm.Deterministic('alpha', tt.exp(alpha_sd * alpha_raw))\n", " beta = pm.Deterministic('beta', tt.exp(mu / alpha))\n", " \n", @@ -305,7 +305,7 @@ "outputs": [], "source": [ "with pm.Model() as model_2:\n", - " alpha = pm.Normal('alpha', mu=0, sd=10)\n", + " alpha = pm.Normal('alpha', mu=0, sigma=10)\n", " r = pm.Gamma('r', alpha=1, beta=0.001, testval=0.25)\n", " beta = pm.Deterministic('beta', tt.exp(-alpha / r))\n", "\n", @@ -467,7 +467,7 @@ "source": [ "with pm.Model() as model_3:\n", " s = pm.HalfNormal('s', tau=5.0)\n", - " gamma = pm.Normal('gamma', mu=0, sd=5)\n", + " gamma = pm.Normal('gamma', mu=0, sigma=5)\n", "\n", " y_obs = pm.Gumbel('y_obs', mu=gamma, beta=s, observed=logtime[~censored])\n", " y_cens = pm.Potential('y_cens', gumbel_sf(y=logtime[censored], mu=gamma, sigma=s))" diff --git a/pymc3/distributions/bound.py 
b/pymc3/distributions/bound.py index 921480c037..b1724e7eba 100644 --- a/pymc3/distributions/bound.py +++ b/pymc3/distributions/bound.py @@ -182,15 +182,15 @@ class Bound: with pm.Model(): NegativeNormal = pm.Bound(pm.Normal, upper=0.0) - par1 = NegativeNormal('par`', mu=0.0, sd=1.0, testval=-0.5) + par1 = NegativeNormal('par`', mu=0.0, sigma=1.0, testval=-0.5) # you can use the Bound object multiple times to # create multiple bounded random variables - par1_1 = NegativeNormal('par1_1', mu=-1.0, sd=1.0, testval=-1.5) + par1_1 = NegativeNormal('par1_1', mu=-1.0, sigma=1.0, testval=-1.5) # you can also define a Bound implicitly, while applying # it to a random variable par2 = pm.Bound(pm.Normal, lower=-1.0, upper=1.0)( - 'par2', mu=0.0, sd=1.0, testval=1.0) + 'par2', mu=0.0, sigma=1.0, testval=1.0) """ def __init__(self, distribution, lower=None, upper=None): diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index c5184849f4..52e8669f34 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -89,7 +89,7 @@ def assert_negative_support(var, label, distname, value=-1e-6): warnings.warn(msg) -def get_tau_sd(tau=None, sd=None): +def get_tau_sigma(tau=None, sigma=None): """ Find precision and standard deviation. The link between the two parameterizations is given by the inverse relationship: @@ -100,35 +100,35 @@ def get_tau_sd(tau=None, sd=None): Parameters ---------- tau : array-like, optional - sd : array-like, optional + sigma : array-like, optional Results ------- - Returns tuple (tau, sd) + Returns tuple (tau, sigma) Notes ----- - If neither tau nor sd is provided, returns (1., 1.) + If neither tau nor sigma is provided, returns (1., 1.) """ if tau is None: - if sd is None: - sd = 1. + if sigma is None: + sigma = 1. tau = 1. else: - tau = sd**-2. + tau = sigma**-2. else: - if sd is not None: - raise ValueError("Can't pass both tau and sd") + if sigma is not None: + raise ValueError("Can't pass both tau and sigma") else: - sd = tau**-.5 + sigma = tau**-.5 - # cast tau and sd to float in a way that works for both np.arrays + # cast tau and sigma to float in a way that works for both np.arrays # and pure python tau = 1. * tau - sd = 1. * sd + sigma = 1. * sigma - return floatX(tau), floatX(sd) + return floatX(tau), floatX(sigma) class Uniform(BoundedContinuous): @@ -383,10 +383,10 @@ class Normal(Continuous): plt.style.use('seaborn-darkgrid') x = np.linspace(-5, 5, 1000) mus = [0., 0., 0., -2.] - sds = [0.4, 1., 2., 0.4] - for mu, sd in zip(mus, sds): - pdf = st.norm.pdf(x, mu, sd) - plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}'.format(mu, sd)) + sigmas = [0.4, 1., 2., 0.4] + for mu, sigma in zip(mus, sigmas): + pdf = st.norm.pdf(x, mu, sigma) + plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}'.format(mu, sigma)) plt.xlabel('x', fontsize=12) plt.ylabel('f(x)', fontsize=12) plt.legend(loc=1) @@ -402,31 +402,33 @@ class Normal(Continuous): ---------- mu : float Mean. - sd : float - Standard deviation (sd > 0) (only required if tau is not specified). + sigma : float + Standard deviation (sigma > 0) (only required if tau is not specified). tau : float - Precision (tau > 0) (only required if sd is not specified). + Precision (tau > 0) (only required if sigma is not specified). Examples -------- .. 
code-block:: python with pm.Model(): - x = pm.Normal('x', mu=0, sd=10) + x = pm.Normal('x', mu=0, sigma=10) with pm.Model(): x = pm.Normal('x', mu=0, tau=1/23) """ - def __init__(self, mu=0, sd=None, tau=None, **kwargs): - tau, sd = get_tau_sd(tau=tau, sd=sd) - self.sd = tt.as_tensor_variable(sd) + def __init__(self, mu=0, sigma=None, tau=None, sd=None, **kwargs): + if sd is not None: + sigma = sd + tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) + self.sigma = self.sd = tt.as_tensor_variable(sigma) self.tau = tt.as_tensor_variable(tau) self.mean = self.median = self.mode = self.mu = mu = tt.as_tensor_variable(mu) self.variance = 1. / self.tau - assert_negative_support(sd, 'sd', 'Normal') + assert_negative_support(sigma, 'sigma', 'Normal') assert_negative_support(tau, 'tau', 'Normal') super().__init__(**kwargs) @@ -448,7 +450,7 @@ def random(self, point=None, size=None): ------- array """ - mu, tau, _ = draw_values([self.mu, self.tau, self.sd], + mu, tau, _ = draw_values([self.mu, self.tau, self.sigma], point=point, size=size) return generate_samples(stats.norm.rvs, loc=mu, scale=tau**-0.5, dist_shape=self.shape, @@ -468,25 +470,25 @@ def logp(self, value): ------- TensorVariable """ - sd = self.sd + sigma = self.sigma tau = self.tau mu = self.mu return bound((-tau * (value - mu)**2 + tt.log(tau / np.pi / 2.)) / 2., - sd > 0) + sigma > 0) def _repr_latex_(self, name=None, dist=None): if dist is None: dist = self - sd = dist.sd + sigma = dist.sigma mu = dist.mu name = r'\text{%s}' % name - return r'${} \sim \text{{Normal}}(\mathit{{mu}}={},~\mathit{{sd}}={})$'.format(name, + return r'${} \sim \text{{Normal}}(\mathit{{mu}}={},~\mathit{{sigma}}={})$'.format(name, get_variable_name(mu), - get_variable_name(sd)) + get_variable_name(sigma)) def logcdf(self, value): - return normal_lcdf(self.mu, self.sd, value) + return normal_lcdf(self.mu, self.sigma, value) class TruncatedNormal(BoundedContinuous): @@ -517,14 +519,14 @@ class TruncatedNormal(BoundedContinuous): plt.style.use('seaborn-darkgrid') x = np.linspace(-10, 10, 1000) mus = [0., 0., 0.] - sds = [3.,5.,7.] + sigmas = [3.,5.,7.] a1 = [-3, -5, -5] b1 = [7, 5, 4] - for mu, sd, a, b in zip(mus, sds,a1,b1): - print mu, sd, a, b - an, bn = (a - mu) / sd, (b - mu) / sd - pdf = st.truncnorm.pdf(x, an,bn, loc=mu, scale=sd) - plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}, a={}, b={}'.format(mu, sd, a, b)) + for mu, sigma, a, b in zip(mus, sigmas,a1,b1): + print mu, sigma, a, b + an, bn = (a - mu) / sigma, (b - mu) / sigma + pdf = st.truncnorm.pdf(x, an,bn, loc=mu, scale=sigma) + plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}, a={}, b={}'.format(mu, sigma, a, b)) plt.xlabel('x', fontsize=12) plt.ylabel('f(x)', fontsize=12) plt.legend(loc=1) @@ -541,8 +543,8 @@ class TruncatedNormal(BoundedContinuous): ---------- mu : float Mean. - sd : float - Standard deviation (sd > 0). + sigma : float + Standard deviation (sigma > 0). lower : float (optional) Left bound. upper : float (optional) @@ -553,20 +555,22 @@ class TruncatedNormal(BoundedContinuous): .. 
code-block:: python with pm.Model(): - x = pm.TruncatedNormal('x', mu=0, sd=10, lower=0) + x = pm.TruncatedNormal('x', mu=0, sigma=10, lower=0) with pm.Model(): - x = pm.TruncatedNormal('x', mu=0, sd=10, upper=1) + x = pm.TruncatedNormal('x', mu=0, sigma=10, upper=1) with pm.Model(): - x = pm.TruncatedNormal('x', mu=0, sd=10, lower=0, upper=1) + x = pm.TruncatedNormal('x', mu=0, sigma=10, lower=0, upper=1) """ - def __init__(self, mu=0, sd=None, tau=None, lower=None, upper=None, - transform='auto', *args, **kwargs): - tau, sd = get_tau_sd(tau=tau, sd=sd) - self.sd = tt.as_tensor_variable(sd) + def __init__(self, mu=0, sigma=None, tau=None, lower=None, upper=None, + transform='auto', sd=None, *args, **kwargs): + if sd is not None: + sigma = sd + tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) + self.sigma = self.sd = tt.as_tensor_variable(sigma) self.tau = tt.as_tensor_variable(tau) self.lower = tt.as_tensor_variable(lower) if lower is not None else lower self.upper = tt.as_tensor_variable(upper) if upper is not None else upper @@ -581,7 +585,7 @@ def __init__(self, mu=0, sd=None, tau=None, lower=None, upper=None, else: self._defaultval = (self.lower + self.upper) / 2 - assert_negative_support(sd, 'sd', 'TruncatedNormal') + assert_negative_support(sigma, 'sigma', 'TruncatedNormal') assert_negative_support(tau, 'tau', 'TruncatedNormal') super().__init__(defaults=('_defaultval',), transform=transform, @@ -605,7 +609,7 @@ def random(self, point=None, size=None): array """ mu_v, std_v, a_v, b_v = draw_values( - [self.mu, self.sd, self.lower, self.upper], point=point, size=size) + [self.mu, self.sigma, self.lower, self.upper], point=point, size=size) return generate_samples(stats.truncnorm.rvs, a=(a_v - mu_v)/std_v, b=(b_v - mu_v) / std_v, @@ -630,12 +634,12 @@ def logp(self, value): TensorVariable """ mu = self.mu - sd = self.sd + sigma = self.sigma norm = self._normalization() - logp = Normal.dist(mu=mu, sd=sd).logp(value) - norm + logp = Normal.dist(mu=mu, sigma=sigma).logp(value) - norm - bounds = [sd > 0] + bounds = [sigma > 0] if self.lower is not None: bounds.append(value >= self.lower) if self.upper is not None: @@ -643,16 +647,16 @@ def logp(self, value): return bound(logp, *bounds) def _normalization(self): - mu, sd = self.mu, self.sd + mu, sigma = self.mu, self.sigma if self.lower is None and self.upper is None: return 0. 
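        # The value returned below is the log of the probability mass that the
        # untruncated Normal(mu, sigma) places between `lower` and `upper`;
        # logp() subtracts it from the unbounded Normal log-density. With both
        # bounds, the difference is assembled from normal_lcdf / normal_lccdf
        # terms and the switch on `self.lower > 0` chooses between the
        # CDF-based and survival-function-based forms; with a single bound one
        # term suffices, and with no bounds the constant is 0.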
if self.lower is not None and self.upper is not None: - lcdf_a = normal_lcdf(mu, sd, self.lower) - lcdf_b = normal_lcdf(mu, sd, self.upper) - lsf_a = normal_lccdf(mu, sd, self.lower) - lsf_b = normal_lccdf(mu, sd, self.upper) + lcdf_a = normal_lcdf(mu, sigma, self.lower) + lcdf_b = normal_lcdf(mu, sigma, self.upper) + lsf_a = normal_lccdf(mu, sigma, self.lower) + lsf_b = normal_lccdf(mu, sigma, self.upper) return tt.switch( self.lower > 0, @@ -661,9 +665,9 @@ def _normalization(self): ) if self.lower is not None: - return normal_lccdf(mu, sd, self.lower) + return normal_lccdf(mu, sigma, self.lower) else: - return normal_lcdf(mu, sd, self.upper) + return normal_lcdf(mu, sigma, self.upper) def _repr_latex_(self, name=None, dist=None): if dist is None: @@ -671,11 +675,11 @@ def _repr_latex_(self, name=None, dist=None): name = r'\text{%s}' % name return ( r'${} \sim \text{{TruncatedNormal}}(' - '\mathit{{mu}}={},~\mathit{{sd}}={},a={},b={})$' + '\mathit{{mu}}={},~\mathit{{sigma}}={},a={},b={})$' .format( name, get_variable_name(self.mu), - get_variable_name(self.sd), + get_variable_name(self.sigma), get_variable_name(self.lower), get_variable_name(self.upper), ) @@ -713,9 +717,9 @@ class HalfNormal(PositiveContinuous): import scipy.stats as st plt.style.use('seaborn-darkgrid') x = np.linspace(0, 5, 200) - for sd in [0.4, 1., 2.]: - pdf = st.halfnorm.pdf(x, scale=sd) - plt.plot(x, pdf, label=r'$\sigma$ = {}'.format(sd)) + for sigma in [0.4, 1., 2.]: + pdf = st.halfnorm.pdf(x, scale=sigma) + plt.plot(x, pdf, label=r'$\sigma$ = {}'.format(sigma)) plt.xlabel('x', fontsize=12) plt.ylabel('f(x)', fontsize=12) plt.legend(loc=1) @@ -729,34 +733,37 @@ class HalfNormal(PositiveContinuous): Parameters ---------- - sd : float - Scale parameter :math:`sigma` (``sd`` > 0) (only required if ``tau`` is not specified). + sigma : float + Scale parameter :math:`sigma` (``sigma`` > 0) (only required if ``tau`` is not specified). tau : float - Precision :math:`tau` (tau > 0) (only required if sd is not specified). + Precision :math:`tau` (tau > 0) (only required if sigma is not specified). Examples -------- .. code-block:: python with pm.Model(): - x = pm.HalfNormal('x', sd=10) + x = pm.HalfNormal('x', sigma=10) with pm.Model(): x = pm.HalfNormal('x', tau=1/15) """ - def __init__(self, sd=None, tau=None, *args, **kwargs): + def __init__(self, sigma=None, tau=None, sd=None, *args, **kwargs): + if sd is not None: + sigma = sd + super().__init__(*args, **kwargs) - tau, sd = get_tau_sd(tau=tau, sd=sd) + tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.sd = sd = tt.as_tensor_variable(sd) + self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) self.tau = tau = tt.as_tensor_variable(tau) self.mean = tt.sqrt(2 / (np.pi * self.tau)) self.variance = (1. - 2 / np.pi) / self.tau assert_negative_support(tau, 'tau', 'HalfNormal') - assert_negative_support(sd, 'sd', 'HalfNormal') + assert_negative_support(sigma, 'sigma', 'HalfNormal') def random(self, point=None, size=None): """ @@ -775,8 +782,8 @@ def random(self, point=None, size=None): ------- array """ - sd = draw_values([self.sd], point=point)[0] - return generate_samples(stats.halfnorm.rvs, loc=0., scale=sd, + sigma = draw_values([self.sigma], point=point)[0] + return generate_samples(stats.halfnorm.rvs, loc=0., scale=sigma, dist_shape=self.shape, size=size) @@ -795,22 +802,22 @@ def logp(self, value): TensorVariable """ tau = self.tau - sd = self.sd + sigma = self.sigma return bound(-0.5 * tau * value**2 + 0.5 * tt.log(tau * 2. 
/ np.pi), value >= 0, - tau > 0, sd > 0) + tau > 0, sigma > 0) def _repr_latex_(self, name=None, dist=None): if dist is None: dist = self - sd = dist.sd + sigma = dist.sigma name = r'\text{%s}' % name - return r'${} \sim \text{{HalfNormal}}(\mathit{{sd}}={})$'.format(name, - get_variable_name(sd)) + return r'${} \sim \text{{HalfNormal}}(\mathit{{sigma}}={})$'.format(name, + get_variable_name(sigma)) def logcdf(self, value): - sd = self.sd - z = zvalue(value, mu=0, sd=sd) + sigma = self.sigma + z = zvalue(value, mu=0, sigma=sigma) return tt.switch( tt.lt(z, -1.0), tt.log(tt.erfcx(-z / tt.sqrt(2.))) - tt.sqr(z), @@ -1098,8 +1105,8 @@ class Beta(UnitContinuous): beta > 0. mu : float Alternative mean (0 < mu < 1). - sd : float - Alternative standard deviation (0 < sd < sqrt(mu * (1 - mu))). + sigma : float + Alternative standard deviation (0 < sigma < sqrt(mu * (1 - mu))). Notes ----- @@ -1107,11 +1114,12 @@ class Beta(UnitContinuous): the binomial distribution. """ - def __init__(self, alpha=None, beta=None, mu=None, sd=None, - *args, **kwargs): + def __init__(self, alpha=None, beta=None, mu=None, sigma=None, + sd=None, *args, **kwargs): super().__init__(*args, **kwargs) - - alpha, beta = self.get_alpha_beta(alpha, beta, mu, sd) + if sd is not None: + sigma = sd + alpha, beta = self.get_alpha_beta(alpha, beta, mu, sigma) self.alpha = alpha = tt.as_tensor_variable(alpha) self.beta = beta = tt.as_tensor_variable(beta) @@ -1122,16 +1130,16 @@ def __init__(self, alpha=None, beta=None, mu=None, sd=None, assert_negative_support(alpha, 'alpha', 'Beta') assert_negative_support(beta, 'beta', 'Beta') - def get_alpha_beta(self, alpha=None, beta=None, mu=None, sd=None): + def get_alpha_beta(self, alpha=None, beta=None, mu=None, sigma=None): if (alpha is not None) and (beta is not None): pass - elif (mu is not None) and (sd is not None): - kappa = mu * (1 - mu) / sd**2 - 1 + elif (mu is not None) and (sigma is not None): + kappa = mu * (1 - mu) / sigma**2 - 1 alpha = mu * kappa beta = (1 - mu) * kappa else: raise ValueError('Incompatible parameterization. Either use alpha ' - 'and beta, or mu and sd to specify distribution.') + 'and beta, or mu and sigma to specify distribution.') return alpha, beta @@ -1588,10 +1596,10 @@ class Lognormal(PositiveContinuous): plt.style.use('seaborn-darkgrid') x = np.linspace(0, 3, 100) mus = [0., 0., 0.] - sds = [.25, .5, 1.] - for mu, sd in zip(mus, sds): - pdf = st.lognorm.pdf(x, sd, scale=np.exp(mu)) - plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}'.format(mu, sd)) + sigmas = [.25, .5, 1.] + for mu, sigma in zip(mus, sigmas): + pdf = st.lognorm.pdf(x, sigma, scale=np.exp(mu)) + plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}'.format(mu, sigma)) plt.xlabel('x', fontsize=12) plt.ylabel('f(x)', fontsize=12) plt.legend(loc=1) @@ -1607,30 +1615,33 @@ class Lognormal(PositiveContinuous): ---------- mu : float Location parameter. - sd : float - Standard deviation. (sd > 0). (only required if tau is not specified). + sigma : float + Standard deviation. (sigma > 0). (only required if tau is not specified). tau : float - Scale parameter (tau > 0). (only required if sd is not specified). + Scale parameter (tau > 0). (only required if sigma is not specified). Example ------- .. code-block:: python - # Example to show that we pass in only `sd` or `tau` but not both. + # Example to show that we pass in only `sigma` or `tau` but not both. 
with pm.Model(): - x = pm.Lognormal('x', mu=2, sd=30) + x = pm.Lognormal('x', mu=2, sigma=30) with pm.Model(): x = pm.Lognormal('x', mu=2, tau=1/100) """ - def __init__(self, mu=0, sd=None, tau=None, *args, **kwargs): + def __init__(self, mu=0, sigma=None, tau=None, sd=None, *args, **kwargs): super().__init__(*args, **kwargs) - tau, sd = get_tau_sd(tau=tau, sd=sd) + if sd is not None: + sigma = sd + + tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) self.mu = mu = tt.as_tensor_variable(mu) self.tau = tau = tt.as_tensor_variable(tau) - self.sd = sd = tt.as_tensor_variable(sd) + self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) self.mean = tt.exp(self.mu + 1. / (2 * self.tau)) self.median = tt.exp(self.mu) @@ -1638,7 +1649,7 @@ def __init__(self, mu=0, sd=None, tau=None, *args, **kwargs): self.variance = (tt.exp(1. / self.tau) - 1) * tt.exp(2 * self.mu + 1. / self.tau) assert_negative_support(tau, 'tau', 'Lognormal') - assert_negative_support(sd, 'sd', 'Lognormal') + assert_negative_support(sigma, 'sigma', 'Lognormal') def _random(self, mu, tau, size=None): samples = np.random.normal(size=size) @@ -1700,8 +1711,8 @@ def _repr_latex_(self, name=None, dist=None): def logcdf(self, value): mu = self.mu - sd = self.sd - z = zvalue(tt.log(value), mu=mu, sd=sd) + sigma = self.sigma + z = zvalue(tt.log(value), mu=mu, sigma=sigma) return tt.switch( tt.le(value, 0), @@ -1740,11 +1751,11 @@ class StudentT(Continuous): plt.style.use('seaborn-darkgrid') x = np.linspace(-8, 8, 200) mus = [0., 0., -2., -2.] - sds = [1., 1., 1., 2.] + sigmas = [1., 1., 1., 2.] dfs = [1., 5., 5., 5.] - for mu, sd, df in zip(mus, sds, dfs): - pdf = st.t.pdf(x, df, loc=mu, scale=sd) - plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}, $\nu$ = {}'.format(mu, sd, df)) + for mu, sigma, df in zip(mus, sigmas, dfs): + pdf = st.t.pdf(x, df, loc=mu, scale=sigma) + plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}, $\nu$ = {}'.format(mu, sigma, df)) plt.xlabel('x', fontsize=12) plt.ylabel('f(x)', fontsize=12) plt.legend(loc=1) @@ -1760,37 +1771,40 @@ class StudentT(Continuous): Degrees of freedom, also known as normality parameter (nu > 0). mu : float Location parameter. - sd : float - Scale parameter (sd > 0). Converges to the standard deviation as nu + sigma : float + Scale parameter (sigma > 0). Converges to the standard deviation as nu increases. (only required if lam is not specified) lam : float Scale parameter (lam > 0). Converges to the precision as nu - increases. (only required if sd is not specified) + increases. (only required if sigma is not specified) Examples -------- .. 
code-block:: python with pm.Model(): - x = pm.StudentT('x', nu=15, mu=0, sd=10) + x = pm.StudentT('x', nu=15, mu=0, sigma=10) with pm.Model(): x = pm.StudentT('x', nu=15, mu=0, lam=1/23) """ - def __init__(self, nu, mu=0, lam=None, sd=None, *args, **kwargs): + def __init__(self, nu, mu=0, lam=None, sigma=None, sd=None, *args, **kwargs): super().__init__(*args, **kwargs) + if sd is not None: + sigma = sd + self.nu = nu = tt.as_tensor_variable(nu) - lam, sd = get_tau_sd(tau=lam, sd=sd) + lam, sigma = get_tau_sigma(tau=lam, sigma=sigma) self.lam = lam = tt.as_tensor_variable(lam) - self.sd = sd = tt.as_tensor_variable(sd) + self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) self.mean = self.median = self.mode = self.mu = mu = tt.as_tensor_variable(mu) self.variance = tt.switch((nu > 2) * 1, (1 / self.lam) * (nu / (nu - 2)), np.inf) - assert_negative_support(lam, 'lam (sd)', 'StudentT') + assert_negative_support(lam, 'lam (sigma)', 'StudentT') assert_negative_support(nu, 'nu', 'StudentT') def random(self, point=None, size=None): @@ -1833,13 +1847,13 @@ def logp(self, value): nu = self.nu mu = self.mu lam = self.lam - sd = self.sd + sigma = self.sigma return bound(gammaln((nu + 1.0) / 2.0) + .5 * tt.log(lam / (nu * np.pi)) - gammaln(nu / 2.0) - (nu + 1.0) / 2.0 * tt.log1p(lam * (value - mu)**2 / nu), - lam > 0, nu > 0, sd > 0) + lam > 0, nu > 0, sigma > 0) def _repr_latex_(self, name=None, dist=None): if dist is None: @@ -1856,8 +1870,8 @@ def _repr_latex_(self, name=None, dist=None): def logcdf(self, value): nu = self.nu mu = self.mu - sd = self.sd - t = (value - mu)/sd + sigma = self.sigma + t = (value - mu)/sigma sqrt_t2_nu = tt.sqrt(t**2 + nu) x = (t + sqrt_t2_nu)/(2.0 * sqrt_t2_nu) return tt.log(incomplete_beta(nu/2., nu/2., x)) @@ -2278,14 +2292,17 @@ class Gamma(PositiveContinuous): Rate parameter (beta > 0). mu : float Alternative shape parameter (mu > 0). - sd : float - Alternative scale parameter (sd > 0). + sigma : float + Alternative scale parameter (sigma > 0). """ - def __init__(self, alpha=None, beta=None, mu=None, sd=None, - *args, **kwargs): + def __init__(self, alpha=None, beta=None, mu=None, sigma=None, + sd=None, *args, **kwargs): super().__init__(*args, **kwargs) - alpha, beta = self.get_alpha_beta(alpha, beta, mu, sd) + if sd is not None: + sigma = sd + + alpha, beta = self.get_alpha_beta(alpha, beta, mu, sigma) self.alpha = alpha = tt.as_tensor_variable(alpha) self.beta = beta = tt.as_tensor_variable(beta) self.mean = alpha / beta @@ -2295,15 +2312,15 @@ def __init__(self, alpha=None, beta=None, mu=None, sd=None, assert_negative_support(alpha, 'alpha', 'Gamma') assert_negative_support(beta, 'beta', 'Gamma') - def get_alpha_beta(self, alpha=None, beta=None, mu=None, sd=None): + def get_alpha_beta(self, alpha=None, beta=None, mu=None, sigma=None): if (alpha is not None) and (beta is not None): pass - elif (mu is not None) and (sd is not None): - alpha = mu**2 / sd**2 - beta = mu / sd**2 + elif (mu is not None) and (sigma is not None): + alpha = mu**2 / sigma**2 + beta = mu / sigma**2 else: raise ValueError('Incompatible parameterization. Either use ' - 'alpha and beta, or mu and sd to specify ' + 'alpha and beta, or mu and sigma to specify ' 'distribution.') return alpha, beta @@ -2409,14 +2426,18 @@ class InverseGamma(PositiveContinuous): Scale parameter (beta > 0). mu : float Alternative shape parameter (mu > 0). - sd : float - Alternative scale parameter (sd > 0). + sigma : float + Alternative scale parameter (sigma > 0). 
""" - def __init__(self, alpha=None, beta=None, mu=None, sd=None, *args, **kwargs): + def __init__(self, alpha=None, beta=None, mu=None, sigma=None, sd=None, + *args, **kwargs): super().__init__(*args, defaults=('mode',), **kwargs) - alpha, beta = InverseGamma._get_alpha_beta(alpha, beta, mu, sd) + if sd is not None: + sigma = sd + + alpha, beta = InverseGamma._get_alpha_beta(alpha, beta, mu, sigma) self.alpha = alpha = tt.as_tensor_variable(alpha) self.beta = beta = tt.as_tensor_variable(beta) @@ -2437,18 +2458,18 @@ def _calculate_mean(self): return m @staticmethod - def _get_alpha_beta(alpha, beta, mu, sd): + def _get_alpha_beta(alpha, beta, mu, sigma): if (alpha is not None): if (beta is not None): pass else: beta = 1 - elif (mu is not None) and (sd is not None): - alpha = (2 * sd**2 + mu**2)/sd**2 - beta = mu * (mu**2 + sd**2) / sd**2 + elif (mu is not None) and (sigma is not None): + alpha = (2 * sigma**2 + mu**2)/sigma**2 + beta = mu * (mu**2 + sigma**2) / sigma**2 else: raise ValueError('Incompatible parameterization. Either use ' - 'alpha and (optionally) beta, or mu and sd to specify ' + 'alpha and (optionally) beta, or mu and sigma to specify ' 'distribution.') return alpha, beta @@ -2737,35 +2758,39 @@ class HalfStudentT(PositiveContinuous): ---------- nu : float Degrees of freedom, also known as normality parameter (nu > 0). - sd : float - Scale parameter (sd > 0). Converges to the standard deviation as nu + sigma : float + Scale parameter (sigma > 0). Converges to the standard deviation as nu increases. (only required if lam is not specified) lam : float Scale parameter (lam > 0). Converges to the precision as nu - increases. (only required if sd is not specified) + increases. (only required if sigma is not specified) Examples -------- .. code-block:: python - # Only pass in one of lam or sd, but not both. + # Only pass in one of lam or sigma, but not both. 
with pm.Model(): - x = pm.HalfStudentT('x', sd=10, nu=10) + x = pm.HalfStudentT('x', sigma=10, nu=10) with pm.Model(): x = pm.HalfStudentT('x', lam=4, nu=10) """ - def __init__(self, nu=1, sd=None, lam=None, *args, **kwargs): + def __init__(self, nu=1, sigma=None, lam=None, sd=None, + *args, **kwargs): super().__init__(*args, **kwargs) + if sd is not None: + sigma = sd + self.mode = tt.as_tensor_variable(0) - lam, sd = get_tau_sd(lam, sd) - self.median = tt.as_tensor_variable(sd) - self.sd = tt.as_tensor_variable(sd) + lam, sigma = get_tau_sigma(lam, sigma) + self.median = tt.as_tensor_variable(sigma) + self.sigma = self.sd = tt.as_tensor_variable(sigma) self.lam = tt.as_tensor_variable(lam) self.nu = nu = tt.as_tensor_variable(nu) - assert_negative_support(sd, 'sd', 'HalfStudentT') + assert_negative_support(sigma, 'sigma', 'HalfStudentT') assert_negative_support(lam, 'lam', 'HalfStudentT') assert_negative_support(nu, 'nu', 'HalfStudentT') @@ -2786,8 +2811,8 @@ def random(self, point=None, size=None): ------- array """ - nu, sd = draw_values([self.nu, self.sd], point=point, size=size) - return np.abs(generate_samples(stats.t.rvs, nu, loc=0, scale=sd, + nu, sigma = draw_values([self.nu, self.sigma], point=point, size=size) + return np.abs(generate_samples(stats.t.rvs, nu, loc=0, scale=sigma, dist_shape=self.shape, size=size)) @@ -2806,24 +2831,24 @@ def logp(self, value): TensorVariable """ nu = self.nu - sd = self.sd + sigma = self.sigma lam = self.lam return bound(tt.log(2) + gammaln((nu + 1.0) / 2.0) - gammaln(nu / 2.0) - - .5 * tt.log(nu * np.pi * sd**2) - - (nu + 1.0) / 2.0 * tt.log1p(value ** 2 / (nu * sd**2)), - sd > 0, lam > 0, nu > 0, value >= 0) + - .5 * tt.log(nu * np.pi * sigma**2) + - (nu + 1.0) / 2.0 * tt.log1p(value ** 2 / (nu * sigma**2)), + sigma > 0, lam > 0, nu > 0, value >= 0) def _repr_latex_(self, name=None, dist=None): if dist is None: dist = self nu = dist.nu - sd = dist.sd + sigma = dist.sigma name = r'\text{%s}' % name - return r'${} \sim \text{{HalfStudentT}}(\mathit{{nu}}={},~\mathit{{sd}}={})$'.format(name, + return r'${} \sim \text{{HalfStudentT}}(\mathit{{nu}}={},~\mathit{{sigma}}={})$'.format(name, get_variable_name(nu), - get_variable_name(sd)) + get_variable_name(sigma)) class ExGaussian(Continuous): @@ -2853,11 +2878,11 @@ class ExGaussian(Continuous): plt.style.use('seaborn-darkgrid') x = np.linspace(-6, 9, 200) mus = [0., -2., 0., -3.] - sds = [1., 1., 3., 1.] + sigmas = [1., 1., 3., 1.] nus = [1., 1., 1., 4.] - for mu, sd, nu in zip(mus, sds, nus): - pdf = st.exponnorm.pdf(x, nu/sd, loc=mu, scale=sd) - plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}, $\nu$ = {}'.format(mu, sd, nu)) + for mu, sigma, nu in zip(mus, sigmas, nus): + pdf = st.exponnorm.pdf(x, nu/sigma, loc=mu, scale=sigma) + plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}, $\nu$ = {}'.format(mu, sigma, nu)) plt.xlabel('x', fontsize=12) plt.ylabel('f(x)', fontsize=12) plt.legend(loc=1) @@ -2891,10 +2916,15 @@ class ExGaussian(Continuous): Vol. 4, No. 1, pp 35-45. 
""" - def __init__(self, mu, sigma, nu, *args, **kwargs): + def __init__(self, mu=0., sigma=None, nu=None, sd=None, + *args, **kwargs): super().__init__(*args, **kwargs) + + if sd is not None: + sigma = sd + self.mu = mu = tt.as_tensor_variable(mu) - self.sigma = sigma = tt.as_tensor_variable(sigma) + self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) self.nu = nu = tt.as_tensor_variable(nu) self.mean = mu + nu self.variance = (sigma**2) + (nu**2) @@ -3146,8 +3176,8 @@ class SkewNormal(Continuous): ---------- mu : float Location parameter. - sd : float - Scale parameter (sd > 0). + sigma : float + Scale parameter (sigma > 0). tau : float Alternative scale parameter (tau > 0). alpha : float @@ -3156,25 +3186,30 @@ class SkewNormal(Continuous): Notes ----- When alpha=0 we recover the Normal distribution and mu becomes the mean, - tau the precision and sd the standard deviation. In the limit of alpha + tau the precision and sigma the standard deviation. In the limit of alpha approaching plus/minus infinite we get a half-normal distribution. """ - def __init__(self, mu=0.0, sd=None, tau=None, alpha=1, *args, **kwargs): + def __init__(self, mu=0.0, sigma=None, tau=None, alpha=1, sd=None, + *args, **kwargs): super().__init__(*args, **kwargs) - tau, sd = get_tau_sd(tau=tau, sd=sd) + + if sd is not None: + sigma = sd + + tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) self.mu = mu = tt.as_tensor_variable(mu) self.tau = tt.as_tensor_variable(tau) - self.sd = tt.as_tensor_variable(sd) + self.sigma = self.sd = tt.as_tensor_variable(sigma) self.alpha = alpha = tt.as_tensor_variable(alpha) - self.mean = mu + self.sd * (2 / np.pi)**0.5 * alpha / (1 + alpha**2)**0.5 - self.variance = self.sd**2 * (1 - (2 * alpha**2) / ((1 + alpha**2) * np.pi)) + self.mean = mu + self.sigma * (2 / np.pi)**0.5 * alpha / (1 + alpha**2)**0.5 + self.variance = self.sigma**2 * (1 - (2 * alpha**2) / ((1 + alpha**2) * np.pi)) assert_negative_support(tau, 'tau', 'SkewNormal') - assert_negative_support(sd, 'sd', 'SkewNormal') + assert_negative_support(sigma, 'sigma', 'SkewNormal') def random(self, point=None, size=None): """ @@ -3194,7 +3229,7 @@ def random(self, point=None, size=None): array """ mu, tau, _, alpha = draw_values( - [self.mu, self.tau, self.sd, self.alpha], point=point, size=size) + [self.mu, self.tau, self.sigma, self.alpha], point=point, size=size) return generate_samples(stats.skewnorm.rvs, a=alpha, loc=mu, scale=tau**-0.5, dist_shape=self.shape, @@ -3215,7 +3250,7 @@ def logp(self, value): TensorVariable """ tau = self.tau - sd = self.sd + sigma = self.sigma mu = self.mu alpha = self.alpha return bound( @@ -3223,18 +3258,18 @@ def logp(self, value): tt.erf(((value - mu) * tt.sqrt(tau) * alpha) / tt.sqrt(2))) + (-tau * (value - mu)**2 + tt.log(tau / np.pi / 2.)) / 2., - tau > 0, sd > 0) + tau > 0, sigma > 0) def _repr_latex_(self, name=None, dist=None): if dist is None: dist = self - sd = dist.sd + sigma = dist.sigma mu = dist.mu alpha = dist.alpha name = r'\text{%s}' % name - return r'${} \sim \text{{Skew-Normal}}(\mathit{{mu}}={},~\mathit{{sd}}={},~\mathit{{alpha}}={})$'.format(name, + return r'${} \sim \text{{Skew-Normal}}(\mathit{{mu}}={},~\mathit{{sigma}}={},~\mathit{{alpha}}={})$'.format(name, get_variable_name(mu), - get_variable_name(sd), + get_variable_name(sigma), get_variable_name(alpha)) @@ -3458,8 +3493,8 @@ def random(self, point=None, size=None): ------- array """ - mu, sd = draw_values([self.mu, self.beta], point=point, size=size) - return generate_samples(stats.gumbel_r.rvs, loc=mu, 
scale=sd, + mu, sigma = draw_values([self.mu, self.beta], point=point, size=size) + return generate_samples(stats.gumbel_r.rvs, loc=mu, scale=sigma, dist_shape=self.shape, size=size) @@ -3518,7 +3553,7 @@ class Rice(PositiveContinuous): ---------- nu : float noncentrality parameter. - sd : float + sigma : float scale parameter. b : float shape parameter (alternative to nu). @@ -3539,26 +3574,29 @@ class Rice(PositiveContinuous): """ - def __init__(self, nu=None, sd=None, b=None, *args, **kwargs): + def __init__(self, nu=None, sigma=None, b=None, sd=None, *args, **kwargs): super().__init__(*args, **kwargs) - nu, b, sd = self.get_nu_b(nu, b, sd) + if sd is not None: + sigma = sd + + nu, b, sigma = self.get_nu_b(nu, b, sigma) self.nu = nu = tt.as_tensor_variable(nu) - self.sd = sd = tt.as_tensor_variable(sd) + self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) self.b = b = tt.as_tensor_variable(b) - self.mean = sd * np.sqrt(np.pi / 2) * tt.exp((-nu**2 / (2 * sd**2)) / 2) * ((1 - (-nu**2 / (2 * sd**2))) - * tt.i0(-(-nu**2 / (2 * sd**2)) / 2) - (-nu**2 / (2 * sd**2)) * tt.i1(-(-nu**2 / (2 * sd**2)) / 2)) - self.variance = 2 * sd**2 + nu**2 - (np.pi * sd**2 / 2) * (tt.exp((-nu**2 / (2 * sd**2)) / 2) * ((1 - (-nu**2 / ( - 2 * sd**2))) * tt.i0(-(-nu**2 / (2 * sd**2)) / 2) - (-nu**2 / (2 * sd**2)) * tt.i1(-(-nu**2 / (2 * sd**2)) / 2)))**2 - - def get_nu_b(self, nu, b, sd): - if sd is None: - sd = 1. + self.mean = sigma * np.sqrt(np.pi / 2) * tt.exp((-nu**2 / (2 * sigma**2)) / 2) * ((1 - (-nu**2 / (2 * sigma**2))) + * tt.i0(-(-nu**2 / (2 * sigma**2)) / 2) - (-nu**2 / (2 * sigma**2)) * tt.i1(-(-nu**2 / (2 * sigma**2)) / 2)) + self.variance = 2 * sigma**2 + nu**2 - (np.pi * sigma**2 / 2) * (tt.exp((-nu**2 / (2 * sigma**2)) / 2) * ((1 - (-nu**2 / ( + 2 * sigma**2))) * tt.i0(-(-nu**2 / (2 * sigma**2)) / 2) - (-nu**2 / (2 * sigma**2)) * tt.i1(-(-nu**2 / (2 * sigma**2)) / 2)))**2 + + def get_nu_b(self, nu, b, sigma): + if sigma is None: + sigma = 1. if nu is None and b is not None: - nu = b * sd - return nu, b, sd + nu = b * sigma + return nu, b, sigma elif nu is not None and b is None: - b = nu / sd - return nu, b, sd + b = nu / sigma + return nu, b, sigma raise ValueError('Rice distribution must specify either nu' ' or b.') @@ -3579,9 +3617,9 @@ def random(self, point=None, size=None): ------- array """ - nu, sd = draw_values([self.nu, self.sd], + nu, sigma = draw_values([self.nu, self.sigma], point=point, size=size) - return generate_samples(stats.rice.rvs, b=nu / sd, scale=sd, loc=0, + return generate_samples(stats.rice.rvs, b=nu / sigma, scale=sigma, loc=0, dist_shape=self.shape, size=size) def logp(self, value): @@ -3599,11 +3637,11 @@ def logp(self, value): TensorVariable """ nu = self.nu - sd = self.sd + sigma = self.sigma b = self.b - x = value / sd - return bound(tt.log(x * tt.exp((-(x - b) * (x - b)) / 2) * i0e(x * b) / sd), - sd >= 0, + x = value / sigma + return bound(tt.log(x * tt.exp((-(x - b) * (x - b)) / 2) * i0e(x * b) / sigma), + sigma >= 0, nu >= 0, value > 0, ) @@ -3762,10 +3800,10 @@ class LogitNormal(UnitContinuous): plt.style.use('seaborn-darkgrid') x = np.linspace(0.0001, 0.9999, 500) mus = [0., 0., 0., 1.] - sds = [0.3, 1., 2., 1.] - for mu, sd in zip(mus, sds): - pdf = st.norm.pdf(logit(x), loc=mu, scale=sd) * 1/(x * (1-x)) - plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}'.format(mu, sd)) + sigmas = [0.3, 1., 2., 1.] 
+ for mu, sigma in zip(mus, sigmas): + pdf = st.norm.pdf(logit(x), loc=mu, scale=sigma) * 1/(x * (1-x)) + plt.plot(x, pdf, label=r'$\mu$ = {}, $\sigma$ = {}'.format(mu, sigma)) plt.legend(loc=1) plt.show() @@ -3779,20 +3817,22 @@ class LogitNormal(UnitContinuous): ---------- mu : float Location parameter. - sd : float - Scale parameter (sd > 0). + sigma : float + Scale parameter (sigma > 0). tau : float Scale parameter (tau > 0). """ - def __init__(self, mu=0, sd=None, tau=None, **kwargs): + def __init__(self, mu=0, sigma=None, tau=None, sd=None, **kwargs): + if sd is not None: + sigma = sd self.mu = mu = tt.as_tensor_variable(mu) - tau, sd = get_tau_sd(tau=tau, sd=sd) - self.sd = tt.as_tensor_variable(sd) + tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) + self.sigma = self.sd = tt.as_tensor_variable(sigma) self.tau = tau = tt.as_tensor_variable(tau) self.median = invlogit(mu) - assert_negative_support(sd, 'sd', 'LogitNormal') + assert_negative_support(sigma, 'sigma', 'LogitNormal') assert_negative_support(tau, 'tau', 'LogitNormal') super().__init__(**kwargs) @@ -3814,9 +3854,9 @@ def random(self, point=None, size=None): ------- array """ - mu, _, sd = draw_values( - [self.mu, self.tau, self.sd], point=point, size=size) - return expit(generate_samples(stats.norm.rvs, loc=mu, scale=sd, dist_shape=self.shape, + mu, _, sigma = draw_values( + [self.mu, self.tau, self.sigma], point=point, size=size) + return expit(generate_samples(stats.norm.rvs, loc=mu, scale=sigma, dist_shape=self.shape, size=size)) def logp(self, value): @@ -3833,7 +3873,7 @@ def logp(self, value): ------- TensorVariable """ - sd = self.sd + sigma = self.sigma mu = self.mu tau = self.tau return bound(-0.5 * tau * (logit(value) - mu) ** 2 @@ -3843,12 +3883,12 @@ def logp(self, value): def _repr_latex_(self, name=None, dist=None): if dist is None: dist = self - sd = dist.sd + sigma = dist.sigma mu = dist.mu name = r'\text{%s}' % name - return r'${} \sim \text{{LogitNormal}}(\mathit{{mu}}={},~\mathit{{sd}}={})$'.format(name, + return r'${} \sim \text{{LogitNormal}}(\mathit{{mu}}={},~\mathit{{sigma}}={})$'.format(name, get_variable_name(mu), - get_variable_name(sd)) + get_variable_name(sigma)) class Interpolated(BoundedContinuous): diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index b2adc6f069..f4f0b99280 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -1153,7 +1153,7 @@ class OrderedLogistic(Categorical): # Ordered logistic regression with pm.Model() as model: - cutpoints = pm.Normal("cutpoints", mu=[-1,1], sd=10, shape=2, + cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2, transform=pm.distributions.transforms.ordered) y_ = pm.OrderedLogistic("y", cutpoints=cutpoints, eta=x, observed=y) tr = pm.sample(1000) diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py index 957e8b25a9..e2d540affc 100644 --- a/pymc3/distributions/dist_math.py +++ b/pymc3/distributions/dist_math.py @@ -106,19 +106,21 @@ def normal_lccdf(mu, sigma, x): ) -def sd2rho(sd): +def sigma2rho(sigma): """ - `sd -> rho` theano converter - :math:`mu + sd*e = mu + log(1+exp(rho))*e`""" - return tt.log(tt.exp(tt.abs_(sd)) - 1.) + `sigma -> rho` theano converter + :math:`mu + sigma*e = mu + log(1+exp(rho))*e`""" + return tt.log(tt.exp(tt.abs_(sigma)) - 1.) 
-def rho2sd(rho): +def rho2sigma(rho): """ - `rho -> sd` theano converter - :math:`mu + sd*e = mu + log(1+exp(rho))*e`""" + `rho -> sigma` theano converter + :math:`mu + sigma*e = mu + log(1+exp(rho))*e`""" return tt.nnet.softplus(rho) +rho2sd = rho2sigma +sd2rho = sigma2rho def log_normal(x, mean, **kwargs): """ @@ -131,7 +133,7 @@ def log_normal(x, mean, **kwargs): point of evaluation mean : Tensor mean of normal distribution - kwargs : one of parameters `{sd, tau, w, rho}` + kwargs : one of parameters `{sigma, tau, w, rho}` Notes ----- @@ -143,22 +145,22 @@ def log_normal(x, mean, **kwargs): 4) `tau` that follows this equation :math:`tau = std^{-1}` ---- """ - sd = kwargs.get('sd') + sigma = kwargs.get('sigma') w = kwargs.get('w') rho = kwargs.get('rho') tau = kwargs.get('tau') eps = kwargs.get('eps', 0.) - check = sum(map(lambda a: a is not None, [sd, w, rho, tau])) + check = sum(map(lambda a: a is not None, [sigma, w, rho, tau])) if check > 1: raise ValueError('more than one required kwarg is passed') if check == 0: raise ValueError('none of required kwarg is passed') - if sd is not None: - std = sd + if sigma is not None: + std = sigma elif w is not None: std = tt.exp(w) elif rho is not None: - std = rho2sd(rho) + std = rho2sigma(rho) else: std = tau**(-1) std += f(eps) @@ -328,11 +330,11 @@ def random_choice(*args, **kwargs): return samples -def zvalue(value, sd, mu): +def zvalue(value, sigma, mu): """ Calculate the z-value for a normal distribution. """ - return (value - mu) / sd + return (value - mu) / sigma def incomplete_beta_cfe(a, b, x, small): diff --git a/pymc3/distributions/mixture.py b/pymc3/distributions/mixture.py index b71af8a3ef..817a6343d6 100644 --- a/pymc3/distributions/mixture.py +++ b/pymc3/distributions/mixture.py @@ -6,7 +6,7 @@ from .dist_math import bound, random_choice from .distribution import (Discrete, Distribution, draw_values, generate_samples, _DrawValuesContext) -from .continuous import get_tau_sd, Normal +from .continuous import get_tau_sigma, Normal def all_discrete(comp_dists): @@ -207,7 +207,7 @@ class NormalMixture(Mixture): the mixture weights mu : array of floats the component means - sd : array of floats + sigma : array of floats the component standard deviations tau : array of floats the component precisions @@ -216,26 +216,30 @@ class NormalMixture(Mixture): of the mixture distribution, with one axis being the number of components. - Note: You only have to pass in sd or tau, but not both. + Note: You only have to pass in sigma or tau, but not both. 
""" def __init__(self, w, mu, comp_shape=(), *args, **kwargs): - _, sd = get_tau_sd(tau=kwargs.pop('tau', None), - sd=kwargs.pop('sd', None)) + if 'sd' in kwargs.keys(): + kwargs['sigma'] = kwargs.pop('sd') + + _, sigma = get_tau_sigma(tau=kwargs.pop('tau', None), + sigma=kwargs.pop('sigma', None)) self.mu = mu = tt.as_tensor_variable(mu) - self.sd = sd = tt.as_tensor_variable(sd) + self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) - super().__init__(w, Normal.dist(mu, sd=sd, shape=comp_shape), *args, **kwargs) + super().__init__(w, Normal.dist(mu, sigma=sigma, shape=comp_shape), + *args, **kwargs) def _repr_latex_(self, name=None, dist=None): if dist is None: dist = self mu = dist.mu w = dist.w - sd = dist.sd + sigma = dist.sigma name = r'\text{%s}' % name return r'${} \sim \text{{NormalMixture}}(\mathit{{w}}={},~\mathit{{mu}}={},~\mathit{{sigma}}={})$'.format(name, get_variable_name(w), get_variable_name(mu), - get_variable_name(sd)) + get_variable_name(sigma)) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 217c4309dc..04ae00e56f 100755 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -215,7 +215,7 @@ class MvNormal(_QuadFormBase): chol_packed = pm.LKJCholeskyCov('chol_packed', n=3, eta=2, sd_dist=sd_dist) chol = pm.expand_packed_triangular(3, chol_packed) - vals_raw = pm.Normal('vals_raw', mu=0, sd=1, shape=(5, 3)) + vals_raw = pm.Normal('vals_raw', mu=0, sigma=1, shape=(5, 3)) vals = pm.Deterministic('vals', tt.dot(chol, vals_raw.T).T) """ @@ -938,7 +938,7 @@ class LKJCholeskyCov(Continuous): vals = pm.MvNormal('vals', mu=np.zeros(10), chol=chol, shape=10) # Or transform an uncorrelated normal: - vals_raw = pm.Normal('vals_raw', mu=0, sd=1, shape=10) + vals_raw = pm.Normal('vals_raw', mu=0, sigma=1, shape=10) vals = tt.dot(chol, vals_raw) # Or compute the covariance matrix @@ -1252,7 +1252,7 @@ class MatrixNormal(Continuous): colchol = pm.expand_packed_triangular(3, colchol_packed) # Setup left covariance matrix - scale = pm.Lognormal('scale', mu=np.log(true_scale), sd=0.5) + scale = pm.Lognormal('scale', mu=np.log(true_scale), sigma=0.5) rowcov = tt.nlinalg.diag([scale**(2*i) for i in range(m)]) vals = pm.MatrixNormal('vals', mu=mu, colchol=colchol, rowcov=rowcov, diff --git a/pymc3/distributions/timeseries.py b/pymc3/distributions/timeseries.py index ec87f426cc..947a3bbdd8 100644 --- a/pymc3/distributions/timeseries.py +++ b/pymc3/distributions/timeseries.py @@ -2,7 +2,7 @@ from theano import scan from pymc3.util import get_variable_name -from .continuous import get_tau_sd, Normal, Flat +from .continuous import get_tau_sigma, Normal, Flat from . import multivariate from . import distribution @@ -79,23 +79,25 @@ class AR(distribution.Continuous): ---------- rho : tensor Tensor of autoregressive coefficients. The first dimension is the p lag. - sd : float - Standard deviation of innovation (sd > 0). (only required if tau is not specified) + sigma : float + Standard deviation of innovation (sigma > 0). (only required if tau is not specified) tau : float - Precision of innovation (tau > 0). (only required if sd is not specified) + Precision of innovation (tau > 0). (only required if sigma is not specified) constant: bool (optional, default = False) Whether to include a constant. 
init : distribution distribution for initial values (Defaults to Flat()) """ - def __init__(self, rho, sd=None, tau=None, + def __init__(self, rho, sigma=None, tau=None, constant=False, init=Flat.dist(), - *args, **kwargs): - + sd=None, *args, **kwargs): super().__init__(*args, **kwargs) - tau, sd = get_tau_sd(tau=tau, sd=sd) - self.sd = tt.as_tensor_variable(sd) + if sd is not None: + sigma = sd + + tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) + self.sigma = self.sd = tt.as_tensor_variable(sigma) self.tau = tt.as_tensor_variable(tau) self.mean = tt.as_tensor_variable(0.) @@ -147,45 +149,47 @@ class GaussianRandomWalk(distribution.Continuous): ---------- mu: tensor innovation drift, defaults to 0.0 - sd : tensor - sd > 0, innovation standard deviation (only required if tau is not specified) + sigma : tensor + sigma > 0, innovation standard deviation (only required if tau is not specified) tau : tensor - tau > 0, innovation precision (only required if sd is not specified) + tau > 0, innovation precision (only required if sigma is not specified) init : distribution distribution for initial value (Defaults to Flat()) """ - def __init__(self, tau=None, init=Flat.dist(), sd=None, mu=0., - *args, **kwargs): + def __init__(self, tau=None, init=Flat.dist(), sigma=None, mu=0., + sd=None, *args, **kwargs): super().__init__(*args, **kwargs) - tau, sd = get_tau_sd(tau=tau, sd=sd) + if sd is not None: + sigma = sd + tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) self.tau = tau = tt.as_tensor_variable(tau) - self.sd = sd = tt.as_tensor_variable(sd) + self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) self.mu = mu = tt.as_tensor_variable(mu) self.init = init self.mean = tt.as_tensor_variable(0.) def logp(self, x): tau = self.tau - sd = self.sd + sigma = self.sigma mu = self.mu init = self.init x_im1 = x[:-1] x_i = x[1:] - innov_like = Normal.dist(mu=x_im1 + mu, sd=sd).logp(x_i) + innov_like = Normal.dist(mu=x_im1 + mu, sigma=sigma).logp(x_i) return init.logp(x[0]) + tt.sum(innov_like) def _repr_latex_(self, name=None, dist=None): if dist is None: dist = self mu = dist.mu - sd = dist.sd + sigma = dist.sigma name = r'\text{%s}' % name - return r'${} \sim \text{{GaussianRandomWalk}}(\mathit{{mu}}={},~\mathit{{sd}}={})$'.format(name, + return r'${} \sim \text{{GaussianRandomWalk}}(\mathit{{mu}}={},~\mathit{{sigma}}={})$'.format(name, get_variable_name(mu), - get_variable_name(sd)) + get_variable_name(sigma)) class GARCH11(distribution.Continuous): @@ -237,7 +241,7 @@ def volatility_update(x, vol, w, a, b): def logp(self, x): vol = self.get_volatility(x) - return tt.sum(Normal.dist(0., sd=vol).logp(x)) + return tt.sum(Normal.dist(0., sigma=vol).logp(x)) def _repr_latex_(self, name=None, dist=None): if dist is None: @@ -277,7 +281,7 @@ def logp(self, x): f, g = self.sde_fn(x[:-1], *self.sde_pars) mu = xt + self.dt * f sd = tt.sqrt(self.dt) * g - return tt.sum(Normal.dist(mu=mu, sd=sd).logp(x[1:])) + return tt.sum(Normal.dist(mu=mu, sigma=sd).logp(x[1:])) def _repr_latex_(self, name=None, dist=None): if dist is None: diff --git a/pymc3/examples/GHME_2013.py b/pymc3/examples/GHME_2013.py index bb1e57c7e0..253da9eb19 100644 --- a/pymc3/examples/GHME_2013.py +++ b/pymc3/examples/GHME_2013.py @@ -45,13 +45,13 @@ def interpolate(x0, y0, x, group): with Model() as model: coeff_sd = HalfCauchy('coeff_sd', 5) - y = GaussianRandomWalk('y', sd=coeff_sd, shape=(nknots, ncountries)) + y = GaussianRandomWalk('y', sigma=coeff_sd, shape=(nknots, ncountries)) p = interpolate(knots, y, age, group) sd = 
HalfCauchy('sd', 5) - vals = Normal('vals', p, sd=sd, observed=rate) + vals = Normal('vals', p, sigma=sd, observed=rate) def run(n=3000): diff --git a/pymc3/examples/LKJ_correlation.py b/pymc3/examples/LKJ_correlation.py index b2dc0bb1ff..f2accfab0f 100644 --- a/pymc3/examples/LKJ_correlation.py +++ b/pymc3/examples/LKJ_correlation.py @@ -24,7 +24,7 @@ with pm.Model() as model: - mu = pm.Normal('mu', mu=0, sd=1, shape=n_var) + mu = pm.Normal('mu', mu=0, sigma=1, shape=n_var) # Note that we access the distribution for the standard # deviations, and do not create a new random variable. diff --git a/pymc3/examples/arma_example.py b/pymc3/examples/arma_example.py index 55889bbf97..7a75b8e650 100644 --- a/pymc3/examples/arma_example.py +++ b/pymc3/examples/arma_example.py @@ -54,9 +54,9 @@ def build_model(): y = shared(np.array([15, 10, 16, 11, 9, 11, 10, 18], dtype=np.float32)) with pm.Model() as arma_model: sigma = pm.HalfNormal('sigma', 5.) - theta = pm.Normal('theta', 0., sd=1.) - phi = pm.Normal('phi', 0., sd=2.) - mu = pm.Normal('mu', 0., sd=10.) + theta = pm.Normal('theta', 0., sigma=1.) + phi = pm.Normal('phi', 0., sigma=2.) + mu = pm.Normal('mu', 0., sigma=10.) err0 = y[0] - (mu + phi * mu) @@ -69,7 +69,7 @@ def calc_next(last_y, this_y, err, mu, phi, theta): outputs_info=[err0], non_sequences=[mu, phi, theta]) - pm.Potential('like', pm.Normal.dist(0, sd=sigma).logp(err)) + pm.Potential('like', pm.Normal.dist(0, sigma=sigma).logp(err)) return arma_model diff --git a/pymc3/examples/custom_dists.py b/pymc3/examples/custom_dists.py index a535fd4fff..ba920a8e51 100644 --- a/pymc3/examples/custom_dists.py +++ b/pymc3/examples/custom_dists.py @@ -32,13 +32,13 @@ def loglike2(value): return -tt.log(tt.abs_(value)) with pm.Model() as model: - alpha = pm.Normal('intercept', mu=0, sd=100) + alpha = pm.Normal('intercept', mu=0, sigma=100) # Create custom densities beta = pm.DensityDist('slope', loglike1, testval=0) sigma = pm.DensityDist('sigma', loglike2, testval=1) # Create likelihood like = pm.Normal('y_est', mu=alpha + beta * - xdata, sd=sigma, observed=ydata) + xdata, sigma=sigma, observed=ydata) trace = pm.sample(2000, cores=2) diff --git a/pymc3/examples/garch_example.py b/pymc3/examples/garch_example.py index d530e76011..ad6e95fc8a 100644 --- a/pymc3/examples/garch_example.py +++ b/pymc3/examples/garch_example.py @@ -40,10 +40,10 @@ def get_garch_model(): with Model() as garch: alpha1 = Uniform('alpha1', 0., 1., shape=shape) beta1 = Uniform('beta1', 0., 1 - alpha1, shape=shape) - mu = Normal('mu', mu=0., sd=100., shape=shape) + mu = Normal('mu', mu=0., sigma=100., shape=shape) theta = tt.sqrt(alpha0 + alpha1 * tt.pow(r - mu, 2) + beta1 * tt.pow(sigma1, 2)) - Normal('obs', mu, sd=theta, observed=r) + Normal('obs', mu, sigma=theta, observed=r) return garch diff --git a/pymc3/examples/gelman_bioassay.py b/pymc3/examples/gelman_bioassay.py index 69d7ca1118..611e29ff76 100644 --- a/pymc3/examples/gelman_bioassay.py +++ b/pymc3/examples/gelman_bioassay.py @@ -9,8 +9,8 @@ with pm.Model() as model: # Logit-linear model parameters - alpha = pm.Normal('alpha', 0, sd=100.) - beta = pm.Normal('beta', 0, sd=1.) + alpha = pm.Normal('alpha', 0, sigma=100.) + beta = pm.Normal('beta', 0, sigma=1.) 
# Calculate probabilities of death theta = pm.Deterministic('theta', pm.math.invlogit(alpha + beta * dose)) diff --git a/pymc3/examples/gelman_schools.py b/pymc3/examples/gelman_schools.py index 0331f81682..e7276f53a9 100644 --- a/pymc3/examples/gelman_schools.py +++ b/pymc3/examples/gelman_schools.py @@ -31,12 +31,12 @@ with Model() as schools: eta = Normal('eta', 0, 1, shape=J) - mu = Normal('mu', 0, sd=1e6) + mu = Normal('mu', 0, sigma=1e6) tau = HalfCauchy('tau', 25) theta = mu + tau * eta - obs = Normal('obs', theta, sd=sigma, observed=y) + obs = Normal('obs', theta, sigma=sigma, observed=y) def run(n=1000): diff --git a/pymc3/examples/lightspeed_example.py b/pymc3/examples/lightspeed_example.py index 23c09f728b..e12d88f2b9 100644 --- a/pymc3/examples/lightspeed_example.py +++ b/pymc3/examples/lightspeed_example.py @@ -21,7 +21,7 @@ 1000.0, upper=light_speed.std() * 1000.0) # define likelihood - y_obs = pm.Normal('Y_obs', mu=mu, sd=sigma, observed=light_speed) + y_obs = pm.Normal('Y_obs', mu=mu, sigma=sigma, observed=light_speed) def run(n=5000): diff --git a/pymc3/examples/rankdata_ordered.py b/pymc3/examples/rankdata_ordered.py index b7fd0000b4..e8981ca0c0 100644 --- a/pymc3/examples/rankdata_ordered.py +++ b/pymc3/examples/rankdata_ordered.py @@ -28,7 +28,7 @@ # sd = pm.HalfCauchy('sigma', 1.) latent = pm.Normal('latent', mu=mu[y_argsort], - sd=1., # using sd does not work yet + sigma=1., # using sd does not work yet transform=pm.distributions.transforms.ordered, shape=y_argsort.shape, testval=np.repeat(np.arange(K)[:,None], J, axis=1).T) diff --git a/pymc3/examples/simpletest.py b/pymc3/examples/simpletest.py index d67f63176b..63d3a859e3 100644 --- a/pymc3/examples/simpletest.py +++ b/pymc3/examples/simpletest.py @@ -9,9 +9,9 @@ with pm.Model() as model: - x = pm.Normal('x', mu=.5, sd=2., shape=(2, 1)) + x = pm.Normal('x', mu=.5, sigma=2., shape=(2, 1)) z = pm.Beta('z', alpha=10, beta=5.5) - d = pm.Normal('data', mu=x, sd=.75, observed=data) + d = pm.Normal('data', mu=x, sigma=.75, observed=data) def run(n=1000): diff --git a/pymc3/gp/gp.py b/pymc3/gp/gp.py index 44b3a72ac6..73a25f647f 100644 --- a/pymc3/gp/gp.py +++ b/pymc3/gp/gp.py @@ -111,7 +111,7 @@ def _build_prior(self, name, X, reparameterize=True, **kwargs): cov = stabilize(self.cov_func(X)) shape = infer_shape(X, kwargs.pop("shape", None)) if reparameterize: - v = pm.Normal(name + "_rotated_", mu=0.0, sd=1.0, shape=shape, **kwargs) + v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, shape=shape, **kwargs) f = pm.Deterministic(name, mu + cholesky(cov).dot(v)) else: f = pm.MvNormal(name, mu=mu, cov=cov, shape=shape, **kwargs) @@ -253,7 +253,7 @@ def _build_prior(self, name, X, reparameterize=True, **kwargs): shape = infer_shape(X, kwargs.pop("shape", None)) if reparameterize: chi2 = pm.ChiSquared("chi2_", self.nu) - v = pm.Normal(name + "_rotated_", mu=0.0, sd=1.0, shape=shape, **kwargs) + v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, shape=shape, **kwargs) f = pm.Deterministic(name, (tt.sqrt(self.nu) / chi2) * (mu + cholesky(cov).dot(v))) else: f = pm.MvStudentT(name, nu=self.nu, mu=mu, cov=cov, shape=shape, **kwargs) @@ -868,7 +868,7 @@ def _build_prior(self, name, Xs, **kwargs): mu = self.mean_func(cartesian(*Xs)) chols = [cholesky(stabilize(cov(X))) for cov, X in zip(self.cov_funcs, Xs)] # remove reparameterization option - v = pm.Normal(name + "_rotated_", mu=0.0, sd=1.0, shape=self.N, **kwargs) + v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, shape=self.N, **kwargs) f = pm.Deterministic(name, mu + 
tt.flatten(kron_dot(chols, v))) return f diff --git a/pymc3/model.py b/pymc3/model.py index 121faf2133..67ab20f7b9 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -564,7 +564,7 @@ class Model(Context, Factor, WithMemoization, metaclass=InitContextMeta): class CustomModel(Model): # 1) override init - def __init__(self, mean=0, sd=1, name='', model=None): + def __init__(self, mean=0, sigma=1, name='', model=None): # 2) call super's init first, passing model and name # to it name will be prefix for all variables here if # no name specified for model there will be no prefix @@ -575,18 +575,18 @@ def __init__(self, mean=0, sd=1, name='', model=None): # will get model's name prefix # 3) you can create variables with Var method - self.Var('v1', Normal.dist(mu=mean, sd=sd)) + self.Var('v1', Normal.dist(mu=mean, sigma=sd)) # this will create variable named like '{prefix_}v1' # and assign attribute 'v1' to instance created # variable can be accessed with self.v1 or self['v1'] # 4) this syntax will also work as we are in the # context of instance itself, names are given as usual - Normal('v2', mu=mean, sd=sd) + Normal('v2', mu=mean, sigma=sd) # something more complex is allowed, too half_cauchy = HalfCauchy('sd', beta=10, testval=1.) - Normal('v3', mu=mean, sd=half_cauchy) + Normal('v3', mu=mean, sigma=half_cauchy) # Deterministic variables can be used in usual way Deterministic('v3_sq', self.v3 ** 2) @@ -606,7 +606,7 @@ def __init__(self, mean=0, sd=1, name='', model=None): # II: # use new class as entering point in context with CustomModel() as model: - Normal('new_normal_var', mu=1, sd=0) + Normal('new_normal_var', mu=1, sigma=0) # III: # just get model instance with all that was defined in it diff --git a/pymc3/tests/models.py b/pymc3/tests/models.py index 117d847c3c..f048dd1737 100644 --- a/pymc3/tests/models.py +++ b/pymc3/tests/models.py @@ -170,6 +170,6 @@ def simple_normal(bounded_prior=False): mu_i = pm.Uniform("mu_i", a, b) else: mu_i = pm.Flat("mu_i") - pm.Normal("X_obs", mu=mu_i, sd=sd, observed=x0) + pm.Normal("X_obs", mu=mu_i, sigma=sd, observed=x0) return model.test_point, model, None diff --git a/pymc3/tests/sampler_fixtures.py b/pymc3/tests/sampler_fixtures.py index 45ecfefd91..21f4d1b442 100644 --- a/pymc3/tests/sampler_fixtures.py +++ b/pymc3/tests/sampler_fixtures.py @@ -64,7 +64,7 @@ class NormalFixture(KnownMean, KnownVariance, KnownCDF): @classmethod def make_model(cls): with pm.Model() as model: - a = pm.Normal("a", mu=2, sd=np.sqrt(3), shape=10) + a = pm.Normal("a", mu=2, sigma=np.sqrt(3), shape=10) return model @@ -88,7 +88,7 @@ class StudentTFixture(KnownMean, KnownCDF): @classmethod def make_model(cls): with pm.Model() as model: - a = pm.StudentT("a", nu=4, mu=0, sd=1) + a = pm.StudentT("a", nu=4, mu=0, sigma=1) return model @@ -109,7 +109,7 @@ class LKJCholeskyCovFixture(KnownCDF): def make_model(cls): with pm.Model() as model: sd_mu = np.array([1, 2, 3, 4, 5]) - sd_dist = pm.Lognormal.dist(mu=sd_mu, sd=sd_mu / 10., shape=5) + sd_dist = pm.Lognormal.dist(mu=sd_mu, sigma=sd_mu / 10., shape=5) chol_packed = pm.LKJCholeskyCov('chol_packed', eta=3, n=5, sd_dist=sd_dist) chol = pm.expand_packed_triangular(5, chol_packed, lower=True) cov = tt.dot(chol, chol.T) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 0d17f6846e..41bbdf435a 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -194,8 +194,8 @@ def multinomial_logpdf(value, n, p): return -inf -def beta_mu_sd(value, mu, sd): - kappa = mu * (1 
- mu) / sd**2 - 1 +def beta_mu_sigma(value, mu, sigma): + kappa = mu * (1 - mu) / sigma**2 - 1 if kappa > 0: return sp.beta.logpdf(value, mu * kappa, (1 - mu) * kappa) else: @@ -521,10 +521,10 @@ def test_triangular(self): def test_bound_normal(self): PositiveNormal = Bound(Normal, lower=0.) - self.pymc3_matches_scipy(PositiveNormal, Rplus, {'mu': Rplus, 'sd': Rplus}, - lambda value, mu, sd: sp.norm.logpdf(value, mu, sd), + self.pymc3_matches_scipy(PositiveNormal, Rplus, {'mu': Rplus, 'sigma': Rplus}, + lambda value, mu, sigma: sp.norm.logpdf(value, mu, sigma), decimal=select_by_precision(float64=6, float32=-1)) - with Model(): x = PositiveNormal('x', mu=0, sd=1, transform=None) + with Model(): x = PositiveNormal('x', mu=0, sigma=1, transform=None) assert np.isinf(x.logp({'x':-1})) def test_discrete_unif(self): @@ -554,37 +554,37 @@ def test_half_flat(self): assert -np.inf == HalfFlat.dist().logcdf(-np.inf).tag.test_value def test_normal(self): - self.pymc3_matches_scipy(Normal, R, {'mu': R, 'sd': Rplus}, - lambda value, mu, sd: sp.norm.logpdf(value, mu, sd), + self.pymc3_matches_scipy(Normal, R, {'mu': R, 'sigma': Rplus}, + lambda value, mu, sigma: sp.norm.logpdf(value, mu, sigma), decimal=select_by_precision(float64=6, float32=1) ) - self.check_logcdf(Normal, R, {'mu': R, 'sd': Rplus}, - lambda value, mu, sd: sp.norm.logcdf(value, mu, sd)) + self.check_logcdf(Normal, R, {'mu': R, 'sigma': Rplus}, + lambda value, mu, sigma: sp.norm.logcdf(value, mu, sigma)) def test_truncated_normal(self): - def scipy_logp(value, mu, sd, lower, upper): + def scipy_logp(value, mu, sigma, lower, upper): return sp.truncnorm.logpdf( - value, (lower-mu)/sd, (upper-mu)/sd, loc=mu, scale=sd) + value, (lower-mu)/sigma, (upper-mu)/sigma, loc=mu, scale=sigma) args = {'mu': array(-2.1), 'lower': array(-100.), 'upper': array(0.01), - 'sd': array(0.01)} + 'sigma': array(0.01)} val = TruncatedNormal.dist(**args).logp(0.) 
assert_allclose(val.eval(), scipy_logp(value=0, **args)) self.pymc3_matches_scipy( TruncatedNormal, R, - {'mu': R, 'sd': Rplusbig, 'lower': -Rplusbig, 'upper': Rplusbig}, + {'mu': R, 'sigma': Rplusbig, 'lower': -Rplusbig, 'upper': Rplusbig}, scipy_logp, decimal=select_by_precision(float64=6, float32=1) ) def test_half_normal(self): - self.pymc3_matches_scipy(HalfNormal, Rplus, {'sd': Rplus}, - lambda value, sd: sp.halfnorm.logpdf(value, scale=sd), + self.pymc3_matches_scipy(HalfNormal, Rplus, {'sigma': Rplus}, + lambda value, sigma: sp.halfnorm.logpdf(value, scale=sigma), decimal=select_by_precision(float64=6, float32=-1) ) - self.check_logcdf(HalfNormal, Rplus, {'sd': Rplus}, - lambda value, sd: sp.halfnorm.logcdf(value, scale=sd)) + self.check_logcdf(HalfNormal, Rplus, {'sigma': Rplus}, + lambda value, sigma: sp.halfnorm.logcdf(value, scale=sigma)) def test_chi_squared(self): self.pymc3_matches_scipy(ChiSquared, Rplus, {'nu': Rplusdunif}, @@ -628,7 +628,7 @@ def test_wald(self, value, mu, lam, phi, alpha, logp): def test_beta(self): self.pymc3_matches_scipy(Beta, Unit, {'alpha': Rplus, 'beta': Rplus}, lambda value, alpha, beta: sp.beta.logpdf(value, alpha, beta)) - self.pymc3_matches_scipy(Beta, Unit, {'mu': Unit, 'sd': Rplus}, beta_mu_sd) + self.pymc3_matches_scipy(Beta, Unit, {'mu': Unit, 'sigma': Rplus}, beta_mu_sigma) self.check_logcdf(Beta, Unit, {'alpha': Rplus, 'beta': Rplus}, lambda value, alpha, beta: sp.beta.logcdf(value, alpha, beta)) @@ -690,10 +690,10 @@ def test_gamma(self): Gamma, Rplus, {'alpha': Rplusbig, 'beta': Rplusbig}, lambda value, alpha, beta: sp.gamma.logpdf(value, alpha, scale=1.0 / beta)) - def test_fun(value, mu, sd): - return sp.gamma.logpdf(value, mu**2 / sd**2, scale=1.0 / (mu / sd**2)) + def test_fun(value, mu, sigma): + return sp.gamma.logpdf(value, mu**2 / sigma**2, scale=1.0 / (mu / sigma**2)) self.pymc3_matches_scipy( - Gamma, Rplus, {'mu': Rplusbig, 'sd': Rplusbig}, test_fun) + Gamma, Rplus, {'mu': Rplusbig, 'sigma': Rplusbig}, test_fun) def test_inverse_gamma(self): self.pymc3_matches_scipy( @@ -703,11 +703,11 @@ def test_inverse_gamma(self): @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32 due to scaling issues") def test_inverse_gamma_alt_params(self): - def test_fun(value, mu, sd): - alpha, beta = InverseGamma._get_alpha_beta(None, None, mu, sd) + def test_fun(value, mu, sigma): + alpha, beta = InverseGamma._get_alpha_beta(None, None, mu, sigma) return sp.invgamma.logpdf(value, alpha, scale=beta) self.pymc3_matches_scipy( - InverseGamma, Rplus, {'mu': Rplus, 'sd': Rplus}, test_fun) + InverseGamma, Rplus, {'mu': Rplus, 'sigma': Rplus}, test_fun) def test_pareto(self): self.pymc3_matches_scipy(Pareto, Rplus, {'alpha': Rplusbig, 'm': Rplusbig}, @@ -726,12 +726,12 @@ def test_weibull(self): def test_half_studentt(self): # this is only testing for nu=1 (halfcauchy) - self.pymc3_matches_scipy(HalfStudentT, Rplus, {'sd': Rplus}, - lambda value, sd: sp.halfcauchy.logpdf(value, 0, sd)) + self.pymc3_matches_scipy(HalfStudentT, Rplus, {'sigma': Rplus}, + lambda value, sigma: sp.halfcauchy.logpdf(value, 0, sigma)) def test_skew_normal(self): - self.pymc3_matches_scipy(SkewNormal, R, {'mu': R, 'sd': Rplusbig, 'alpha': R}, - lambda value, alpha, mu, sd: sp.skewnorm.logpdf(value, alpha, mu, sd)) + self.pymc3_matches_scipy(SkewNormal, R, {'mu': R, 'sigma': Rplusbig, 'alpha': R}, + lambda value, alpha, mu, sigma: sp.skewnorm.logpdf(value, alpha, mu, sigma)) def test_binomial(self): self.pymc3_matches_scipy(Binomial, Nat, 
{'n': NatSmall, 'p': Unit}, @@ -1100,9 +1100,9 @@ def test_addpotential(self): Potential('value_squared', -value ** 2) self.check_dlogp(model, value, R, {}) - def test_get_tau_sd(self): - sd = np.array([2]) - assert_almost_equal(continuous.get_tau_sd(sd=sd), [1. / sd**2, sd]) + def test_get_tau_sigma(self): + sigma = np.array([2]) + assert_almost_equal(continuous.get_tau_sigma(sigma=sigma), [1. / sigma**2, sigma]) @pytest.mark.parametrize('value,mu,sigma,nu,logp', [ (0.5, -50.000, 0.500, 0.500, -99.8068528), @@ -1165,8 +1165,8 @@ def test_logistic(self): decimal=select_by_precision(float64=6, float32=1)) def test_logitnormal(self): - self.pymc3_matches_scipy(LogitNormal, Unit, {'mu': R, 'sd': Rplus}, - lambda value, mu, sd: (sp.norm.logpdf(logit(value), mu, sd) + self.pymc3_matches_scipy(LogitNormal, Unit, {'mu': R, 'sigma': Rplus}, + lambda value, mu, sigma: (sp.norm.logpdf(logit(value), mu, sigma) - (np.log(value) + np.log1p(-value))), decimal=select_by_precision(float64=6, float32=1)) @@ -1175,29 +1175,29 @@ def test_multidimensional_beta_construction(self): Beta('beta', alpha=1., beta=1., shape=(10, 20)) def test_rice(self): - self.pymc3_matches_scipy(Rice, Rplus, {'nu': Rplus, 'sd': Rplusbig}, - lambda value, nu, sd: sp.rice.logpdf(value, b=nu / sd, loc=0, scale=sd)) - self.pymc3_matches_scipy(Rice, Rplus, {'b': Rplus, 'sd': Rplusbig}, - lambda value, b, sd: sp.rice.logpdf(value, b=b, loc=0, scale=sd)) + self.pymc3_matches_scipy(Rice, Rplus, {'nu': Rplus, 'sigma': Rplusbig}, + lambda value, nu, sigma: sp.rice.logpdf(value, b=nu / sigma, loc=0, scale=sigma)) + self.pymc3_matches_scipy(Rice, Rplus, {'b': Rplus, 'sigma': Rplusbig}, + lambda value, b, sigma: sp.rice.logpdf(value, b=b, loc=0, scale=sigma)) @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") def test_interpolated(self): for mu in R.vals: - for sd in Rplus.vals: + for sigma in Rplus.vals: #pylint: disable=cell-var-from-loop - xmin = mu - 5 * sd - xmax = mu + 5 * sd + xmin = mu - 5 * sigma + xmax = mu + 5 * sigma class TestedInterpolated (Interpolated): def __init__(self, **kwargs): x_points = np.linspace(xmin, xmax, 100000) - pdf_points = sp.norm.pdf(x_points, loc=mu, scale=sd) + pdf_points = sp.norm.pdf(x_points, loc=mu, scale=sigma) super().__init__(x_points=x_points, pdf_points=pdf_points, **kwargs) def ref_pdf(value): return np.where( np.logical_and(value >= xmin, value <= xmax), - sp.norm.logpdf(value, mu, sd), + sp.norm.logpdf(value, mu, sigma), -np.inf * np.ones(value.shape) ) @@ -1207,27 +1207,27 @@ def ref_pdf(value): def test_bound(): np.random.seed(42) UnboundNormal = Bound(Normal) - dist = UnboundNormal.dist(mu=0, sd=1) + dist = UnboundNormal.dist(mu=0, sigma=1) assert dist.transform is None assert dist.default() == 0. 
assert isinstance(dist.random(), np.ndarray) LowerNormal = Bound(Normal, lower=1) - dist = LowerNormal.dist(mu=0, sd=1) + dist = LowerNormal.dist(mu=0, sigma=1) assert dist.logp(0).eval() == -np.inf assert dist.default() > 1 assert dist.transform is not None assert np.all(dist.random() > 1) UpperNormal = Bound(Normal, upper=-1) - dist = UpperNormal.dist(mu=0, sd=1) + dist = UpperNormal.dist(mu=0, sigma=1) assert dist.logp(-0.5).eval() == -np.inf assert dist.default() < -1 assert dist.transform is not None assert np.all(dist.random() < -1) ArrayNormal = Bound(Normal, lower=[1, 2], upper=[2, 3]) - dist = ArrayNormal.dist(mu=0, sd=1, shape=2) + dist = ArrayNormal.dist(mu=0, sigma=1, shape=2) assert_equal(dist.logp([0.5, 3.5]).eval(), -np.array([np.inf, np.inf])) assert_equal(dist.default(), np.array([1.5, 2.5])) assert dist.transform is not None @@ -1243,7 +1243,7 @@ def test_bound(): lower.tag.test_value = np.array([1, 2]).astype(theano.config.floatX) upper = 3 ArrayNormal = Bound(Normal, lower=lower, upper=upper) - dist = ArrayNormal.dist(mu=0, sd=1, shape=2) + dist = ArrayNormal.dist(mu=0, sigma=1, shape=2) logp = dist.logp([0.5, 3.5]).eval({lower: lower.tag.test_value}) assert_equal(logp, -np.array([np.inf, np.inf])) assert_equal(dist.default(), np.array([2, 2.5])) @@ -1287,22 +1287,22 @@ def setup_class(self): Y = alpha + X.dot(beta) + np.random.randn(size)*sigma with Model() as self.model: # Priors for unknown model parameters - alpha = Normal('alpha', mu=0, sd=10) - b = Normal('beta', mu=0, sd=10, shape=(2,), observed=beta) - sigma = HalfNormal('sigma', sd=1) + alpha = Normal('alpha', mu=0, sigma=10) + b = Normal('beta', mu=0, sigma=10, shape=(2,), observed=beta) + sigma = HalfNormal('sigma', sigma=1) # Expected value of outcome mu = Deterministic('mu', alpha + tt.dot(X, b)) # Likelihood (sampling distribution) of observations - Y_obs = Normal('Y_obs', mu=mu, sd=sigma, observed=Y) + Y_obs = Normal('Y_obs', mu=mu, sigma=sigma, observed=Y) self.distributions = [alpha, sigma, mu, b, Y_obs] self.expected = ( - r'$\text{alpha} \sim \text{Normal}(\mathit{mu}=0,~\mathit{sd}=10.0)$', - r'$\text{sigma} \sim \text{HalfNormal}(\mathit{sd}=1.0)$', + r'$\text{alpha} \sim \text{Normal}(\mathit{mu}=0,~\mathit{sigma}=10.0)$', + r'$\text{sigma} \sim \text{HalfNormal}(\mathit{sigma}=1.0)$', r'$\text{mu} \sim \text{Deterministic}(\text{alpha},~\text{Constant},~\text{beta})$', - r'$\text{beta} \sim \text{Normal}(\mathit{mu}=0,~\mathit{sd}=10.0)$', - r'$\text{Y_obs} \sim \text{Normal}(\mathit{mu}=\text{mu},~\mathit{sd}=f(\text{sigma}))$' + r'$\text{beta} \sim \text{Normal}(\mathit{mu}=0,~\mathit{sigma}=10.0)$', + r'$\text{Y_obs} \sim \text{Normal}(\mathit{mu}=\text{mu},~\mathit{sigma}=f(\text{sigma}))$' ) def test__repr_latex_(self): diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index b5460ca96e..b5c0593806 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -72,14 +72,14 @@ def pymc3_random_discrete(dist, paramdomains, class TestDrawValues(SeededTest): def test_draw_scalar_parameters(self): with pm.Model(): - y = pm.Normal('y1', mu=0., sd=1.) + y = pm.Normal('y1', mu=0., sigma=1.) mu, tau = draw_values([y.distribution.mu, y.distribution.tau]) npt.assert_almost_equal(mu, 0) npt.assert_almost_equal(tau, 1) def test_draw_dependencies(self): with pm.Model(): - x = pm.Normal('x', mu=0., sd=1.) + x = pm.Normal('x', mu=0., sigma=1.) 
exp_x = pm.Deterministic('exp_x', pm.math.exp(x)) x, exp_x = draw_values([x, exp_x]) @@ -87,7 +87,7 @@ def test_draw_dependencies(self): def test_draw_order(self): with pm.Model(): - x = pm.Normal('x', mu=0., sd=1.) + x = pm.Normal('x', mu=0., sigma=1.) exp_x = pm.Deterministic('exp_x', pm.math.exp(x)) # Need to draw x before drawing log_x @@ -98,7 +98,7 @@ def test_draw_point_replacement(self): with pm.Model(): mu = pm.Normal('mu', mu=0., tau=1e-3) sigma = pm.Gamma('sigma', alpha=1., beta=1., transform=None) - y = pm.Normal('y', mu=mu, sd=sigma) + y = pm.Normal('y', mu=mu, sigma=sigma) mu2, tau2 = draw_values([y.distribution.mu, y.distribution.tau], point={'mu': 5., 'sigma': 2.}) npt.assert_almost_equal(mu2, 5) @@ -108,7 +108,7 @@ def test_random_sample_returns_nd_array(self): with pm.Model(): mu = pm.Normal('mu', mu=0., tau=1e-3) sigma = pm.Gamma('sigma', alpha=1., beta=1., transform=None) - y = pm.Normal('y', mu=mu, sd=sigma) + y = pm.Normal('y', mu=mu, sigma=sigma) mu, tau = draw_values([y.distribution.mu, y.distribution.tau]) assert isinstance(mu, np.ndarray) assert isinstance(tau, np.ndarray) @@ -221,7 +221,7 @@ class TestTruncatedNormal(BaseTestCases.BaseTestCase): class TestSkewNormal(BaseTestCases.BaseTestCase): distribution = pm.SkewNormal - params = {'mu': 0., 'sd': 1., 'alpha': 5.} + params = {'mu': 0., 'sigma': 1., 'alpha': 5.} class TestHalfNormal(BaseTestCases.BaseTestCase): @@ -331,7 +331,7 @@ class TestLogistic(BaseTestCases.BaseTestCase): class TestLogitNormal(BaseTestCases.BaseTestCase): distribution = pm.LogitNormal - params = {'mu': 0., 'sd': 1.} + params = {'mu': 0., 'sigma': 1.} class TestBinomial(BaseTestCases.BaseTestCase): @@ -421,20 +421,20 @@ def ref_rand(size, lower, upper): pymc3_random(pm.Uniform, {'lower': -Rplus, 'upper': Rplus}, ref_rand=ref_rand) def test_normal(self): - def ref_rand(size, mu, sd): - return st.norm.rvs(size=size, loc=mu, scale=sd) - pymc3_random(pm.Normal, {'mu': R, 'sd': Rplus}, ref_rand=ref_rand) + def ref_rand(size, mu, sigma): + return st.norm.rvs(size=size, loc=mu, scale=sigma) + pymc3_random(pm.Normal, {'mu': R, 'sigma': Rplus}, ref_rand=ref_rand) def test_truncated_normal(self): - def ref_rand(size, mu, sd, lower, upper): - return st.truncnorm.rvs((lower-mu)/sd, (upper-mu)/sd, size=size, loc=mu, scale=sd) - pymc3_random(pm.TruncatedNormal, {'mu': R, 'sd': Rplusbig, 'lower':-Rplusbig, 'upper':Rplusbig}, + def ref_rand(size, mu, sigma, lower, upper): + return st.truncnorm.rvs((lower-mu)/sigma, (upper-mu)/sigma, size=size, loc=mu, scale=sigma) + pymc3_random(pm.TruncatedNormal, {'mu': R, 'sigma': Rplusbig, 'lower':-Rplusbig, 'upper':Rplusbig}, ref_rand=ref_rand) def test_skew_normal(self): - def ref_rand(size, alpha, mu, sd): - return st.skewnorm.rvs(size=size, a=alpha, loc=mu, scale=sd) - pymc3_random(pm.SkewNormal, {'mu': R, 'sd': Rplus, 'alpha': R}, ref_rand=ref_rand) + def ref_rand(size, alpha, mu, sigma): + return st.skewnorm.rvs(size=size, a=alpha, loc=mu, scale=sigma) + pymc3_random(pm.SkewNormal, {'mu': R, 'sigma': Rplus, 'alpha': R}, ref_rand=ref_rand) def test_half_normal(self): def ref_rand(size, tau): @@ -491,10 +491,10 @@ def ref_rand(size, alpha, beta): return st.gamma.rvs(alpha, scale=1. 
/ beta, size=size) pymc3_random(pm.Gamma, {'alpha': Rplusbig, 'beta': Rplusbig}, ref_rand=ref_rand) - def test_gamma_mu_sd(self): - def ref_rand(size, mu, sd): - return st.gamma.rvs(mu**2 / sd**2, scale=sd ** 2 / mu, size=size) - pymc3_random(pm.Gamma, {'mu': Rplusbig, 'sd': Rplusbig}, ref_rand=ref_rand) + def test_gamma_mu_sigma(self): + def ref_rand(size, mu, sigma): + return st.gamma.rvs(mu**2 / sigma**2, scale=sigma ** 2 / mu, size=size) + pymc3_random(pm.Gamma, {'mu': Rplusbig, 'sigma': Rplusbig}, ref_rand=ref_rand) def test_inverse_gamma(self): def ref_rand(size, alpha, beta): @@ -727,24 +727,28 @@ def ref_rand(size, mu, s): pymc3_random(pm.Logistic, {'mu': R, 's': Rplus}, ref_rand=ref_rand) def test_logitnormal(self): - def ref_rand(size, mu, sd): - return expit(st.norm.rvs(loc=mu, scale=sd, size=size)) - pymc3_random(pm.LogitNormal, {'mu': R, 'sd': Rplus}, ref_rand=ref_rand) + def ref_rand(size, mu, sigma): + return expit(st.norm.rvs(loc=mu, scale=sigma, size=size)) + pymc3_random(pm.LogitNormal, {'mu': R, 'sigma': Rplus}, ref_rand=ref_rand) @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") def test_interpolated(self): for mu in R.vals: - for sd in Rplus.vals: + for sigma in Rplus.vals: #pylint: disable=cell-var-from-loop def ref_rand(size): - return st.norm.rvs(loc=mu, scale=sd, size=size) + return st.norm.rvs(loc=mu, scale=sigma, size=size) class TestedInterpolated (pm.Interpolated): def __init__(self, **kwargs): - x_points = np.linspace(mu - 5 * sd, mu + 5 * sd, 100) - pdf_points = st.norm.pdf(x_points, loc=mu, scale=sd) - super().__init__(x_points=x_points, pdf_points=pdf_points, **kwargs) + x_points = np.linspace(mu - 5 * sigma, mu + 5 * sigma, 100) + pdf_points = st.norm.pdf(x_points, loc=mu, scale=sigma) + super().__init__( + x_points=x_points, + pdf_points=pdf_points, + **kwargs + ) pymc3_random(TestedInterpolated, {}, ref_rand=ref_rand) @@ -782,19 +786,19 @@ def __init__(self, **kwargs): ref_rand=ref_rand) def test_normalmixture(self): - def ref_rand(size, w, mu, sd): + def ref_rand(size, w, mu, sigma): component = np.random.choice(w.size, size=size, p=w) - return np.random.normal(mu[component], sd[component], size=size) + return np.random.normal(mu[component], sigma[component], size=size) pymc3_random(pm.NormalMixture, {'w': Simplex(2), 'mu': Domain([[.05, 2.5], [-5., 1.]], edges=(None, None)), - 'sd': Domain([[1, 1], [1.5, 2.]], edges=(None, None))}, + 'sigma': Domain([[1, 1], [1.5, 2.]], edges=(None, None))}, extra_args={'comp_shape': 2}, size=1000, ref_rand=ref_rand) pymc3_random(pm.NormalMixture, {'w': Simplex(3), 'mu': Domain([[-5., 1., 2.5]], edges=(None, None)), - 'sd': Domain([[1.5, 2., 3.]], edges=(None, None))}, + 'sigma': Domain([[1.5, 2., 3.]], edges=(None, None))}, extra_args={'comp_shape': 3}, size=1000, ref_rand=ref_rand) diff --git a/pymc3/tests/test_distributions_timeseries.py b/pymc3/tests/test_distributions_timeseries.py index 906f456862..f8679d7f1b 100644 --- a/pymc3/tests/test_distributions_timeseries.py +++ b/pymc3/tests/test_distributions_timeseries.py @@ -11,8 +11,8 @@ def test_AR(): data = np.array([0.3,1,2,3,4]) phi = np.array([0.99]) with Model() as t: - y = AR('y', phi, sd=1, shape=len(data)) - z = Normal('z', mu=phi*data[:-1], sd=1, shape=len(data)-1) + y = AR('y', phi, sigma=1, shape=len(data)) + z = Normal('z', mu=phi*data[:-1], sigma=1, shape=len(data)-1) ar_like = t['y'].logp({'z':data[1:], 'y': data}) reg_like = t['z'].logp({'z':data[1:], 'y': data}) np.testing.assert_allclose(ar_like, reg_like) 
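The ``sd`` keyword handled by the backwards-compatibility shim in ``AR.__init__`` above is simply mapped onto ``sigma``, so both spellings should produce identical log-probabilities. A minimal sketch of that equivalence, mirroring the test style above (variable names and data values are chosen only for illustration)::

    import numpy as np
    import pymc3 as pm

    data = np.array([0.3, 1., 2., 3., 4.])
    phi = np.array([0.99])
    with pm.Model() as t:
        y_sigma = pm.AR('y_sigma', phi, sigma=1., shape=len(data))  # new keyword
        y_sd = pm.AR('y_sd', phi, sd=1., shape=len(data))           # legacy keyword

    point = {'y_sigma': data, 'y_sd': data}
    np.testing.assert_allclose(t['y_sigma'].logp(point), t['y_sd'].logp(point))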
@@ -27,8 +27,8 @@ def test_AR(): # AR1 + constant with Model() as t: - y = AR('y', [0.3, phi], sd=1, shape=len(data), constant=True) - z = Normal('z', mu=0.3 + phi*data[:-1], sd=1, shape=len(data)-1) + y = AR('y', [0.3, phi], sigma=1, shape=len(data), constant=True) + z = Normal('z', mu=0.3 + phi*data[:-1], sigma=1, shape=len(data)-1) ar_like = t['y'].logp({'z':data[1:], 'y': data}) reg_like = t['z'].logp({'z':data[1:], 'y': data}) np.testing.assert_allclose(ar_like, reg_like) @@ -36,8 +36,8 @@ def test_AR(): # AR2 phi = np.array([0.84, 0.10]) with Model() as t: - y = AR('y', phi, sd=1, shape=len(data)) - z = Normal('z', mu=phi[0]*data[1:-1]+phi[1]*data[:-2], sd=1, shape=len(data)-2) + y = AR('y', phi, sigma=1, shape=len(data)) + z = Normal('z', mu=phi[0]*data[1:-1]+phi[1]*data[:-2], sigma=1, shape=len(data)-2) ar_like = t['y'].logp({'z':data[2:], 'y': data}) reg_like = t['z'].logp({'z':data[2:], 'y': data}) np.testing.assert_allclose(ar_like, reg_like) @@ -52,7 +52,7 @@ def test_AR_nd(): beta = Normal('beta', 0., 1., shape=(p, n), testval=beta_tp) - AR('y', beta, sd=1.0, + AR('y', beta, sigma=1.0, shape=(T, n), testval=y_tp) with Model() as t1: @@ -60,7 +60,7 @@ def test_AR_nd(): shape=(p, n), testval=beta_tp) for i in range(n): - AR('y_%d' % i, beta[:, i], sd=1.0, + AR('y_%d' % i, beta[:, i], sigma=1.0, shape=T, testval=y_tp[:, i]) np.testing.assert_allclose(t0.logp(t0.test_point), @@ -85,7 +85,7 @@ def test_GARCH11(): with Model() as t: y = GARCH11('y', omega=omega, alpha_1=alpha_1, beta_1=beta_1, initial_vol=initial_vol, shape=data.shape) - z = Normal('z', mu=0, sd=vol, shape=data.shape) + z = Normal('z', mu=0, sigma=vol, shape=data.shape) garch_like = t['y'].logp({'z':data, 'y': data}) reg_like = t['z'].logp({'z':data, 'y': data}) np.testing.assert_allclose(garch_like, reg_like) @@ -115,7 +115,7 @@ def test_linear(): with Model() as model: lamh = Flat('lamh') xh = EulerMaruyama('xh', dt, sde, (lamh,), shape=N + 1, testval=x) - Normal('zh', mu=xh, sd=sig2, observed=z) + Normal('zh', mu=xh, sigma=sig2, observed=z) # invert with model: trace = sample(init='advi+adapt_diag', chains=1) diff --git a/pymc3/tests/test_examples.py b/pymc3/tests/test_examples.py index 7595b0964a..29c8a8645c 100644 --- a/pymc3/tests/test_examples.py +++ b/pymc3/tests/test_examples.py @@ -43,7 +43,7 @@ def build_model(self): P['1'] = 1 with pm.Model() as model: - effects = pm.Normal('effects', mu=0, sd=100, shape=len(P.columns)) + effects = pm.Normal('effects', mu=0, sigma=100, shape=len(P.columns)) logit_p = tt.dot(floatX(np.array(P)), effects) pm.Bernoulli('s', logit_p=logit_p, observed=floatX(data.switch.values)) return model diff --git a/pymc3/tests/test_glm.py b/pymc3/tests/test_glm.py index 7d8dfa0806..da1a8ef611 100644 --- a/pymc3/tests/test_glm.py +++ b/pymc3/tests/test_glm.py @@ -37,7 +37,7 @@ def test_linear_component(self): with Model() as model: lm = LinearComponent.from_formula('y ~ x', self.data_linear) sigma = Uniform('sigma', 0, 20) - Normal('y_obs', mu=lm.y_est, sd=sigma, observed=self.y_linear) + Normal('y_obs', mu=lm.y_est, sigma=sigma, observed=self.y_linear) start = find_MAP(vars=[sigma]) step = Slice(model.vars) trace = sample(500, tune=0, step=step, start=start, diff --git a/pymc3/tests/test_hmc.py b/pymc3/tests/test_hmc.py index 475b657d20..95d2c58b3b 100644 --- a/pymc3/tests/test_hmc.py +++ b/pymc3/tests/test_hmc.py @@ -35,7 +35,7 @@ def test_leapfrog_reversible(): def test_nuts_tuning(): model = pymc3.Model() with model: - pymc3.Normal("mu", mu=0, sd=1) + pymc3.Normal("mu", mu=0, 
sigma=1) step = pymc3.NUTS() trace = pymc3.sample(10, step=step, tune=5, progressbar=False, chains=1) @@ -46,8 +46,8 @@ def test_nuts_error_reporting(caplog): model = pymc3.Model() with caplog.at_level(logging.CRITICAL) and pytest.raises(SamplingError): with model: - pymc3.HalfNormal('a', sd=1, transform=None, testval=-1) - pymc3.HalfNormal('b', sd=1, transform=None) + pymc3.HalfNormal('a', sigma=1, transform=None, testval=-1) + pymc3.HalfNormal('b', sigma=1, transform=None) trace = pymc3.sample(init='adapt_diag', chains=1) assert "Bad initial energy, check any log probabilities that are inf or -inf: a -inf\nb" in caplog.text diff --git a/pymc3/tests/test_mixture.py b/pymc3/tests/test_mixture.py index d725b58aa9..54be0e6179 100644 --- a/pymc3/tests/test_mixture.py +++ b/pymc3/tests/test_mixture.py @@ -170,11 +170,11 @@ def test_mixture_of_mixture(self): # mixtures components g_comp = Normal.dist( mu=Exponential('mu_g', lam=1.0, shape=nbr, transform=None), - sd=1, + sigma=1, shape=nbr) l_comp = Lognormal.dist( mu=Exponential('mu_l', lam=1.0, shape=nbr, transform=None), - sd=1, + sigma=1, shape=nbr) # weight vector for the mixtures g_w = Dirichlet('g_w', a=floatX(np.ones(nbr)*0.0000001), transform=None) diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index 422f3322df..a0d9d3d228 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -17,7 +17,7 @@ def __init__(self, name='', model=None): assert pm.modelcontext(None) is self # 1) init variables with Var method self.Var('v1', pm.Normal.dist()) - self.v2 = pm.Normal('v2', mu=0, sd=1) + self.v2 = pm.Normal('v2', mu=0, sigma=1) # 2) Potentials and Deterministic variables with method too # be sure that names will not overlap with other same models pm.Deterministic('d', tt.constant(1)) @@ -25,11 +25,11 @@ def __init__(self, name='', model=None): class DocstringModel(pm.Model): - def __init__(self, mean=0, sd=1, name='', model=None): + def __init__(self, mean=0, sigma=1, name='', model=None): super().__init__(name, model) - self.Var('v1', Normal.dist(mu=mean, sd=sd)) - Normal('v2', mu=mean, sd=sd) - Normal('v3', mu=mean, sd=HalfCauchy('sd', beta=10, testval=1.)) + self.Var('v1', Normal.dist(mu=mean, sigma=sigma)) + Normal('v2', mu=mean, sigma=sigma) + Normal('v3', mu=mean, sigma=HalfCauchy('sd', beta=10, testval=1.)) Deterministic('v3_sq', self.v3 ** 2) Potential('p1', tt.constant(1)) diff --git a/pymc3/tests/test_model_graph.py b/pymc3/tests/test_model_graph.py index c261410cc9..df4c4c0bba 100644 --- a/pymc3/tests/test_model_graph.py +++ b/pymc3/tests/test_model_graph.py @@ -21,14 +21,14 @@ def radon_model(): )) with pm.Model() as model: sigma_a = pm.HalfCauchy('sigma_a', 5) - gamma = pm.Normal('gamma', mu=0., sd=1e5, shape=3) + gamma = pm.Normal('gamma', mu=0., sigma=1e5, shape=3) mu_a = pm.Deterministic('mu_a', gamma[0] + gamma[1]*uranium + gamma[2]*xbar) - eps_a = pm.Normal('eps_a', mu=0, sd=sigma_a, shape=counties) + eps_a = pm.Normal('eps_a', mu=0, sigma=sigma_a, shape=counties) a = pm.Deterministic('a', mu_a + eps_a[county]) - b = pm.Normal('b', mu=0., sd=1e15) + b = pm.Normal('b', mu=0., sigma=1e15) sigma_y = pm.Uniform('sigma_y', lower=0, upper=100) y_hat = a + b * floor_measure - y_like = pm.Normal('y_like', mu=y_hat, sd=sigma_y, observed=log_radon) + y_like = pm.Normal('y_like', mu=y_hat, sigma=sigma_y, observed=log_radon) compute_graph = { 'sigma_a': set(), diff --git a/pymc3/tests/test_model_helpers.py b/pymc3/tests/test_model_helpers.py index 9383f6723f..75e0a5313f 100644 --- 
a/pymc3/tests/test_model_helpers.py +++ b/pymc3/tests/test_model_helpers.py @@ -98,7 +98,7 @@ def test_as_tensor(self): # Create a fake model and fake distribution to be used for the test fake_model = pm.Model() with fake_model: - fake_distribution = pm.Normal.dist(mu=0, sd=1) + fake_distribution = pm.Normal.dist(mu=0, sigma=1) # Create the testval attribute simply for the sake of model testing fake_distribution.testval = None diff --git a/pymc3/tests/test_models_linear.py b/pymc3/tests/test_models_linear.py index d6799d7188..54242bc33f 100644 --- a/pymc3/tests/test_models_linear.py +++ b/pymc3/tests/test_models_linear.py @@ -43,7 +43,7 @@ def test_linear_component(self): name='lm' ) # yields lm_x0, lm_Intercept sigma = Uniform('sigma', 0, 20) # yields sigma_interval__ - Normal('y_obs', mu=lm.y_est, sd=sigma, observed=self.y_linear) # yields y_obs + Normal('y_obs', mu=lm.y_est, sigma=sigma, observed=self.y_linear) # yields y_obs start = find_MAP(vars=[sigma]) step = Slice(model.vars) trace = sample(500, tune=0, step=step, start=start, @@ -58,7 +58,7 @@ def test_linear_component_from_formula(self): with Model() as model: lm = LinearComponent.from_formula('y ~ x', self.data_linear) sigma = Uniform('sigma', 0, 20) - Normal('y_obs', mu=lm.y_est, sd=sigma, observed=self.y_linear) + Normal('y_obs', mu=lm.y_est, sigma=sigma, observed=self.y_linear) start = find_MAP(vars=[sigma]) step = Slice(model.vars) trace = sample(500, tune=0, step=step, start=start, diff --git a/pymc3/tests/test_quadpotential.py b/pymc3/tests/test_quadpotential.py index 505cdccf67..b7cc9e1777 100644 --- a/pymc3/tests/test_quadpotential.py +++ b/pymc3/tests/test_quadpotential.py @@ -122,7 +122,7 @@ def test_random_dense(): def test_user_potential(): model = pymc3.Model() with model: - pymc3.Normal("a", mu=0, sd=1) + pymc3.Normal("a", mu=0, sigma=1) # Work around missing nonlocal in python2 called = [] diff --git a/pymc3/tests/test_random.py b/pymc3/tests/test_random.py index 8c23edb55a..2c005193af 100644 --- a/pymc3/tests/test_random.py +++ b/pymc3/tests/test_random.py @@ -33,7 +33,7 @@ def test_draw_value(): with pm.Model(): mu = 2 * tt.constant(np.array([5., 6.])) + theano.shared(np.array(5)) - a = pm.Normal('a', mu=mu, sd=5, shape=2) + a = pm.Normal('a', mu=mu, sigma=5, shape=2) val1 = _draw_value(a) val2 = _draw_value(a) @@ -63,7 +63,7 @@ def test_vals(self): def test_simple_model(self): with pm.Model(): mu = 2 * tt.constant(np.array([5., 6.])) + theano.shared(np.array(5)) - a = pm.Normal('a', mu=mu, sd=5, shape=2) + a = pm.Normal('a', mu=mu, sigma=5, shape=2) val1 = draw_values([a]) val2 = draw_values([a]) @@ -95,9 +95,9 @@ def test_dep_vars(self): class TestJointDistributionDrawValues(SeededTest): def test_joint_distribution(self): with pm.Model() as model: - a = pm.Normal('a', mu=0, sd=100) - b = pm.Normal('b', mu=a, sd=1e-8) - c = pm.Normal('c', mu=a, sd=1e-8) + a = pm.Normal('a', mu=0, sigma=100) + b = pm.Normal('b', mu=a, sigma=1e-8) + c = pm.Normal('c', mu=a, sigma=1e-8) d = pm.Deterministic('d', b + c) # Expected RVs diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 02340b8bd2..85e479657e 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -143,8 +143,8 @@ def test_empty_model(): def test_partial_trace_sample(): with pm.Model() as model: - a = pm.Normal('a', mu=0, sd=1) - b = pm.Normal('b', mu=0, sd=1) + a = pm.Normal('a', mu=0, sigma=1) + b = pm.Normal('b', mu=0, sigma=1) trace = pm.sample(trace=[a]) @@ -214,7 +214,7 @@ class TestSamplePPC(SeededTest): def 
test_normal_scalar(self): with pm.Model() as model: mu = pm.Normal('mu', 0., 1.) - a = pm.Normal('a', mu=mu, sd=1, observed=0.) + a = pm.Normal('a', mu=mu, sigma=1, observed=0.) trace = pm.sample() with model: @@ -236,7 +236,7 @@ def test_normal_scalar(self): def test_normal_vector(self): with pm.Model() as model: mu = pm.Normal('mu', 0., 1.) - a = pm.Normal('a', mu=mu, sd=1, + a = pm.Normal('a', mu=mu, sigma=1, observed=np.array([.5, .2])) trace = pm.sample() @@ -255,8 +255,8 @@ def test_normal_vector(self): def test_vector_observed(self): with pm.Model() as model: - mu = pm.Normal('mu', mu=0, sd=1) - a = pm.Normal('a', mu=mu, sd=1, + mu = pm.Normal('mu', mu=0, sigma=1) + a = pm.Normal('a', mu=mu, sigma=1, observed=np.array([0., 1.])) trace = pm.sample() @@ -275,7 +275,7 @@ def test_vector_observed(self): def test_sum_normal(self): with pm.Model() as model: - a = pm.Normal('a', sd=0.2) + a = pm.Normal('a', sigma=0.2) b = pm.Normal('b', mu=a) trace = pm.sample() @@ -310,13 +310,13 @@ def test_sample_posterior_predictive_w(self): data0 = np.random.normal(0, 1, size=500) with pm.Model() as model_0: - mu = pm.Normal('mu', mu=0, sd=1) - y = pm.Normal('y', mu=mu, sd=1, observed=data0) + mu = pm.Normal('mu', mu=0, sigma=1) + y = pm.Normal('y', mu=mu, sigma=1, observed=data0) trace_0 = pm.sample() with pm.Model() as model_1: - mu = pm.Normal('mu', mu=0, sd=1, shape=len(data0)) - y = pm.Normal('y', mu=mu, sd=1, observed=data0) + mu = pm.Normal('mu', mu=0, sigma=1, shape=len(data0)) + y = pm.Normal('y', mu=mu, sigma=1, observed=data0) trace_1 = pm.sample() traces = [trace_0, trace_0] @@ -336,8 +336,8 @@ def test_sample_posterior_predictive_w(self): ]) def test_exec_nuts_init(method): with pm.Model() as model: - pm.Normal('a', mu=0, sd=1, shape=2) - pm.HalfNormal('b', sd=1) + pm.Normal('a', mu=0, sigma=1, shape=2) + pm.HalfNormal('b', sigma=1) with model: start, _ = pm.init_nuts(init=method, n_init=10) assert isinstance(start, list) @@ -355,10 +355,10 @@ def test_ignores_observed(self): observed = np.random.normal(10, 1, size=200) with pm.Model(): # Use a prior that's way off to show we're ignoring the observed variables - mu = pm.Normal('mu', mu=-100, sd=1) + mu = pm.Normal('mu', mu=-100, sigma=1) positive_mu = pm.Deterministic('positive_mu', np.abs(mu)) z = -1 - positive_mu - pm.Normal('x_obs', mu=z, sd=1, observed=observed) + pm.Normal('x_obs', mu=z, sigma=1, observed=observed) prior = pm.sample_prior_predictive() assert (prior['mu'] < 90).all() @@ -464,7 +464,7 @@ def test_shape_edgecase(self): with pm.Model(): mu = pm.Normal('mu', shape=5) sd = pm.Uniform('sd', lower=2, upper=3) - x = pm.Normal('x', mu=mu, sd=sd, shape=5) + x = pm.Normal('x', mu=mu, sigma=sd, shape=5) prior = pm.sample_prior_predictive(10) assert prior['mu'].shape == (10, 5) diff --git a/pymc3/tests/test_sgfs.py b/pymc3/tests/test_sgfs.py index 06d8749443..e1120e60fc 100644 --- a/pymc3/tests/test_sgfs.py +++ b/pymc3/tests/test_sgfs.py @@ -22,7 +22,7 @@ def f(x, a, b, c): y_obs = pm.data.Minibatch(y_train, batch_size=batch_size) with Model(): - abc = Normal('abc', mu=mu0, sd=sd0, shape=(3,)) + abc = Normal('abc', mu=mu0, sigma=sd0, shape=(3,)) x = x_obs x2 = x**2 o = tt.ones_like(x) diff --git a/pymc3/tests/test_stats.py b/pymc3/tests/test_stats.py index dd71a0208e..a3d1148e42 100644 --- a/pymc3/tests/test_stats.py +++ b/pymc3/tests/test_stats.py @@ -55,12 +55,12 @@ def test_compare(): with pm.Model() as model0: mu = pm.Normal('mu', 0, 1) - x = pm.Normal('x', mu=mu, sd=1, observed=x_obs) + x = pm.Normal('x', mu=mu, sigma=1, 
observed=x_obs) trace0 = pm.sample(1000) with pm.Model() as model1: mu = pm.Normal('mu', 0, 1) - x = pm.Normal('x', mu=mu, sd=0.8, observed=x_obs) + x = pm.Normal('x', mu=mu, sigma=0.8, observed=x_obs) trace1 = pm.sample(1000) with pm.Model() as model2: diff --git a/pymc3/tests/test_step.py b/pymc3/tests/test_step.py index 8fc5ca3153..974acf3418 100644 --- a/pymc3/tests/test_step.py +++ b/pymc3/tests/test_step.py @@ -702,8 +702,8 @@ def check_trace(self, step_method): """ n_steps = 100 with Model() as model: - x = Normal("x", mu=0, sd=1) - y = Normal("y", mu=x, sd=1, observed=1) + x = Normal("x", mu=0, sigma=1) + y = Normal("y", mu=x, sigma=1, observed=1) if step_method.__name__ == "SMC": trace = sample( draws=200, random_seed=1, progressbar=False, step=step_method() @@ -925,7 +925,7 @@ def kill_grad(x): data = np.random.normal(size=(100,)) Normal( - "y", mu=kill_grad(x), sd=1, observed=data.astype(theano.config.floatX) + "y", mu=kill_grad(x), sigma=1, observed=data.astype(theano.config.floatX) ) steps = assign_step_methods(model, []) @@ -939,7 +939,7 @@ class TestPopulationSamplers: def test_checks_population_size(self): """Test that population samplers check the population size.""" with Model() as model: - n = Normal("n", mu=0, sd=1) + n = Normal("n", mu=0, sigma=1) for stepper in TestPopulationSamplers.steppers: step = stepper() with pytest.raises(ValueError): @@ -978,7 +978,7 @@ def test_multiple_samplers(self, caplog): def test_bad_init_nonparallel(self): with Model(): - HalfNormal("a", sd=1, testval=-1, transform=None) + HalfNormal("a", sigma=1, testval=-1, transform=None) with pytest.raises(SamplingError) as error: sample(init=None, chains=1, random_seed=1) error.match("Bad initial") @@ -987,7 +987,7 @@ def test_bad_init_nonparallel(self): reason="requires python3.6 or higher") def test_bad_init_parallel(self): with Model(): - HalfNormal("a", sd=1, testval=-1, transform=None) + HalfNormal("a", sigma=1, testval=-1, transform=None) with pytest.raises(ParallelSamplingError) as error: sample(init=None, cores=2, random_seed=1) error.match("Bad initial") @@ -1016,7 +1016,7 @@ def test_linalg(self, caplog): def test_sampler_stats(self): with Model() as model: - x = Normal("x", mu=0, sd=1) + x = Normal("x", mu=0, sigma=1) trace = sample(draws=10, tune=1, chains=1) # Assert stats exist and have the correct shape. 
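The renamed keyword keeps the usual relationship between the scale and precision parameterizations (``get_tau_sigma`` returns ``tau = 1 / sigma**2``, as tested above), so specifying ``sigma`` or the equivalent ``tau`` should give the same density. A minimal sketch for ``Normal`` (values chosen only for illustration)::

    import numpy as np
    import pymc3 as pm

    with pm.Model():
        a = pm.Normal('a', mu=0., sigma=2.)        # scale parameterization
        b = pm.Normal('b', mu=0., tau=1. / 2.**2)  # equivalent precision parameterization

    np.testing.assert_allclose(a.distribution.logp(0.5).eval(),
                               b.distribution.logp(0.5).eval())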
diff --git a/pymc3/tests/test_text_backend.py b/pymc3/tests/test_text_backend.py index e0e1d6a3d8..550e2ec42e 100644 --- a/pymc3/tests/test_text_backend.py +++ b/pymc3/tests/test_text_backend.py @@ -10,7 +10,7 @@ class TestTextSampling: def test_supports_sampler_stats(self): with pm.Model(): - pm.Normal("mu", mu=0, sd=1, shape=2) + pm.Normal("mu", mu=0, sigma=1, shape=2) db = text.Text(self.name) pm.sample(20, tune=10, init=None, trace=db, cores=2) diff --git a/pymc3/tests/test_types.py b/pymc3/tests/test_types.py index 2b21a81e8f..6a3967711f 100644 --- a/pymc3/tests/test_types.py +++ b/pymc3/tests/test_types.py @@ -26,7 +26,7 @@ def teardown_method(self): def test_float64(self): with Model() as model: x = Normal('x', testval=np.array(1., dtype='float64')) - obs = Normal('obs', mu=x, sd=1., observed=np.random.randn(5)) + obs = Normal('obs', mu=x, sigma=1., observed=np.random.randn(5)) assert x.dtype == 'float64' assert obs.dtype == 'float64' @@ -39,7 +39,7 @@ def test_float64(self): def test_float32(self): with Model() as model: x = Normal('x', testval=np.array(1., dtype='float32')) - obs = Normal('obs', mu=x, sd=1., observed=np.random.randn(5).astype('float32')) + obs = Normal('obs', mu=x, sigma=1., observed=np.random.randn(5).astype('float32')) assert x.dtype == 'float32' assert obs.dtype == 'float32' diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index d93d0ea24b..c6a199c0c0 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -253,10 +253,10 @@ def test_vae(): with pm.Model(): # Hidden variables - zs = pm.Normal('zs', mu=0, sd=1, shape=minibatch_size) + zs = pm.Normal('zs', mu=0, sigma=1, shape=minibatch_size) dec = zs * ad + bd # Observation model - pm.Normal('xs_', mu=dec, sd=0.1, observed=x_inp) + pm.Normal('xs_', mu=dec, sigma=0.1, observed=x_inp) pm.fit(1, local_rv={zs: dict(mu=mu, rho=rho)}, more_replacements={x_inp: x_mini}, more_obj_params=[ae, be, ad, bd]) @@ -434,11 +434,11 @@ def test_elbo(): y_obs = np.array([1.6, 1.4]) post_mu = np.array([1.88], dtype=theano.config.floatX) - post_sd = np.array([1], dtype=theano.config.floatX) + post_sigma = np.array([1], dtype=theano.config.floatX) # Create a model for test with pm.Model() as model: - mu = pm.Normal('mu', mu=mu0, sd=sigma) - pm.Normal('y', mu=mu, sd=1, observed=y_obs) + mu = pm.Normal('mu', mu=mu0, sigma=sigma) + pm.Normal('y', mu=mu, sigma=1, observed=y_obs) # Create variational gradient tensor mean_field = MeanField(model=model) @@ -446,7 +446,7 @@ def test_elbo(): elbo = -pm.operators.KL(mean_field)()(10000) mean_field.shared_params['mu'].set_value(post_mu) - mean_field.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1)) + mean_field.shared_params['rho'].set_value(np.log(np.exp(post_sigma) - 1)) f = theano.function([], elbo) elbo_mc = f() @@ -469,7 +469,7 @@ def test_scale_cost_to_minibatch_works(aux_total_size): y_obs = np.array([1.6, 1.4]) beta = len(y_obs)/float(aux_total_size) post_mu = np.array([1.88], dtype=theano.config.floatX) - post_sd = np.array([1], dtype=theano.config.floatX) + post_sigma = np.array([1], dtype=theano.config.floatX) # TODO: theano_config # with pm.Model(theano_config=dict(floatX='float64')): @@ -479,27 +479,27 @@ def test_scale_cost_to_minibatch_works(aux_total_size): with pm.Model(): assert theano.config.floatX == 'float64' assert theano.config.warn_float64 == 'ignore' - mu = pm.Normal('mu', mu=mu0, sd=sigma) - pm.Normal('y', mu=mu, sd=1, observed=y_obs, total_size=aux_total_size) 
+ mu = pm.Normal('mu', mu=mu0, sigma=sigma) + pm.Normal('y', mu=mu, sigma=1, observed=y_obs, total_size=aux_total_size) # Create variational gradient tensor mean_field_1 = MeanField() assert mean_field_1.scale_cost_to_minibatch mean_field_1.shared_params['mu'].set_value(post_mu) - mean_field_1.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1)) + mean_field_1.shared_params['rho'].set_value(np.log(np.exp(post_sigma) - 1)) with pm.theanof.change_flags(compute_test_value='off'): elbo_via_total_size_scaled = -pm.operators.KL(mean_field_1)()(10000) with pm.Model(): - mu = pm.Normal('mu', mu=mu0, sd=sigma) - pm.Normal('y', mu=mu, sd=1, observed=y_obs, total_size=aux_total_size) + mu = pm.Normal('mu', mu=mu0, sigma=sigma) + pm.Normal('y', mu=mu, sigma=1, observed=y_obs, total_size=aux_total_size) # Create variational gradient tensor mean_field_2 = MeanField() assert mean_field_1.scale_cost_to_minibatch mean_field_2.scale_cost_to_minibatch = False assert not mean_field_2.scale_cost_to_minibatch mean_field_2.shared_params['mu'].set_value(post_mu) - mean_field_2.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1)) + mean_field_2.shared_params['rho'].set_value(np.log(np.exp(post_sigma) - 1)) with pm.theanof.change_flags(compute_test_value='off'): elbo_via_total_size_unscaled = -pm.operators.KL(mean_field_2)()(10000) @@ -518,27 +518,27 @@ def test_elbo_beta_kl(aux_total_size): y_obs = np.array([1.6, 1.4]) beta = len(y_obs)/float(aux_total_size) post_mu = np.array([1.88], dtype=theano.config.floatX) - post_sd = np.array([1], dtype=theano.config.floatX) + post_sigma = np.array([1], dtype=theano.config.floatX) with pm.theanof.change_flags(floatX='float64', warn_float64='ignore'): with pm.Model(): - mu = pm.Normal('mu', mu=mu0, sd=sigma) - pm.Normal('y', mu=mu, sd=1, observed=y_obs, total_size=aux_total_size) + mu = pm.Normal('mu', mu=mu0, sigma=sigma) + pm.Normal('y', mu=mu, sigma=1, observed=y_obs, total_size=aux_total_size) # Create variational gradient tensor mean_field_1 = MeanField() mean_field_1.scale_cost_to_minibatch = True mean_field_1.shared_params['mu'].set_value(post_mu) - mean_field_1.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1)) + mean_field_1.shared_params['rho'].set_value(np.log(np.exp(post_sigma) - 1)) with pm.theanof.change_flags(compute_test_value='off'): elbo_via_total_size_scaled = -pm.operators.KL(mean_field_1)()(10000) with pm.Model(): - mu = pm.Normal('mu', mu=mu0, sd=sigma) - pm.Normal('y', mu=mu, sd=1, observed=y_obs) + mu = pm.Normal('mu', mu=mu0, sigma=sigma) + pm.Normal('y', mu=mu, sigma=1, observed=y_obs) # Create variational gradient tensor mean_field_3 = MeanField() mean_field_3.shared_params['mu'].set_value(post_mu) - mean_field_3.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1)) + mean_field_3.shared_params['rho'].set_value(np.log(np.exp(post_sigma) - 1)) with pm.theanof.change_flags(compute_test_value='off'): elbo_via_beta_kl = -pm.operators.KL(mean_field_3, beta=beta)()(10000) @@ -558,14 +558,14 @@ def use_minibatch(request): @pytest.fixture('module') def simple_model_data(use_minibatch): n = 1000 - sd0 = 2. + sigma0 = 2. mu0 = 4. - sd = 3. + sigma = 3. mu = -5. 
- data = sd * np.random.randn(n) + mu - d = n / sd ** 2 + 1 / sd0 ** 2 - mu_post = (n * np.mean(data) / sd ** 2 + mu0 / sd0 ** 2) / d + data = sigma * np.random.randn(n) + mu + d = n / sigma ** 2 + 1 / sigma0 ** 2 + mu_post = (n * np.mean(data) / sigma ** 2 + mu0 / sigma0 ** 2) / d if use_minibatch: data = pm.Minibatch(data) return dict( @@ -574,8 +574,8 @@ def simple_model_data(use_minibatch): mu_post=mu_post, d=d, mu0=mu0, - sd0=sd0, - sd=sd, + sigma0=sigma0, + sigma=sigma, ) @@ -584,8 +584,8 @@ def simple_model(simple_model_data): with pm.Model() as model: mu_ = pm.Normal( 'mu', mu=simple_model_data['mu0'], - sd=simple_model_data['sd0'], testval=0) - pm.Normal('x', mu=mu_, sd=simple_model_data['sd'], + sigma=simple_model_data['sigma0'], testval=0) + pm.Normal('x', mu=mu_, sigma=simple_model_data['sigma'], observed=simple_model_data['data'], total_size=simple_model_data['n']) return model @@ -930,9 +930,9 @@ def test_discrete_not_allowed(): y = np.random.normal(mu_true[z_true], np.ones_like(z_true)) with pm.Model(): - mu = pm.Normal('mu', mu=0, sd=10, shape=3) + mu = pm.Normal('mu', mu=0, sigma=10, shape=3) z = pm.Categorical('z', p=tt.ones(3) / 3, shape=len(y)) - pm.Normal('y_obs', mu=mu[z], sd=1., observed=y) + pm.Normal('y_obs', mu=mu[z], sigma=1., observed=y) with pytest.raises(opvi.ParametrizationError): pm.fit(n=1) # fails diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index c331ed418d..39be806e43 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -3,7 +3,7 @@ from theano import tensor as tt import pymc3 as pm -from pymc3.distributions.dist_math import rho2sd +from pymc3.distributions.dist_math import rho2sigma from . import opvi from pymc3.variational.opvi import Group, Approximation, node_property from pymc3.util import update_start_vals @@ -42,7 +42,7 @@ def rho(self): @node_property def cov(self): - var = rho2sd(self.rho)**2 + var = rho2sigma(self.rho)**2 if self.batched: return batched_diag(var) else: @@ -50,7 +50,7 @@ def cov(self): @node_property def std(self): - return rho2sd(self.rho) + return rho2sigma(self.rho) @change_flags(compute_test_value='off') def __init_group__(self, group): @@ -84,14 +84,14 @@ def create_shared_params(self, start=None): @node_property def symbolic_random(self): initial = self.symbolic_initial - sd = self.std + sigma = self.std mu = self.mean - return sd * initial + mu + return sigma * initial + mu @node_property def symbolic_logq_not_scaled(self): z0 = self.symbolic_initial - std = rho2sd(self.rho) + std = rho2sigma(self.rho) logdet = tt.log(std) logq = pm.Normal.dist().logp(z0) - logdet return logq.sum(range(1, logq.ndim)) diff --git a/pymc3/variational/flows.py b/pymc3/variational/flows.py index a15ce42e98..fc3b0e4baa 100644 --- a/pymc3/variational/flows.py +++ b/pymc3/variational/flows.py @@ -2,7 +2,7 @@ import theano from theano import tensor as tt -from ..distributions.dist_math import rho2sd +from ..distributions.dist_math import rho2sigma from ..theanof import change_flags from ..memoize import WithMemoization from .opvi import node_property, collect_shared_to_list @@ -540,7 +540,7 @@ class ScaleFlow(AbstractFlow): def __init__(self, rho=None, **kwargs): super().__init__(**kwargs) rho = self.add_param(rho, 'rho') - self.scale = rho2sd(rho) + self.scale = rho2sigma(rho) self.shared_params = dict(rho=rho) log_scale = property(lambda self: self.shared_params['log_scale'])